In [2]:
import glob
import html
import json
import os
import re
from concurrent.futures import ProcessPoolExecutor
from time import sleep

import pandas as pd
import requests
from pymongo import MongoClient

In [2]:
# Read the product-ID sheet and pull its 'id' column out as a plain Python list.
df = pd.read_csv('Products ID.csv')
ids = df.loc[:, 'id'].tolist()

In [3]:
# HTTP headers attached to every product-detail request (a desktop Chrome User-Agent).
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

# Fields copied from each API response into the stored product record.
keys = [
    'id',
    'url_key',
    'price',
    'description',
    'images',
]

In [14]:
def clean_description(text):
    """Turn an HTML product description into plain text.

    Steps, in order:
      1. Remove anything that looks like a tag (``<...>``).
      2. Decode HTML character references (``&amp;``, ``&quot;``, ``&#39;``, ...).
      3. Delete non-breaking spaces (including those produced by ``&nbsp;``).
      4. Strip leading/trailing whitespace.

    Args:
        text: Raw HTML string, or ``None`` when no description is available.

    Returns:
        The cleaned text; an empty string when ``text`` is ``None``.
    """
    if text is None:
        # A missing description is not an error; represent it as empty text.
        return ''
    # Tags are stripped before entities are decoded so that an escaped '<'
    # (e.g. '&lt;') in the text is not mistaken for the start of a tag.
    text = re.sub(r'<[^>]*>', '', text)
    text = html.unescape(text)
    # Non-breaking spaces are deleted rather than replaced by a space,
    # matching the previous behaviour for '&nbsp;' and '\xa0'.
    text = text.replace('\xa0', '')
    return text.strip()

In [5]:
# Number of product IDs processed per batch (one final.json update per batch).
BATCH_SIZE = 100

# Consecutive slices of `ids`; every batch holds BATCH_SIZE IDs except possibly the last.
id_batch = [ids[start:start + BATCH_SIZE] for start in range(0, len(ids), BATCH_SIZE)]

In [7]:
def get_data(id, timeout=10):
    """Fetch one product from the Tiki product-detail API.

    The response is reduced to the fields listed in the module-level `keys`,
    with the description passed through `clean_description`.

    Args:
        id: Product ID interpolated into the endpoint URL.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the worker indefinitely.

    Returns:
        A tuple ``(output_holder, error_holder)``:
        ``output_holder`` is a list with the product record on success (empty
        on failure); ``error_holder`` is a list with ``id`` on failure (empty
        on success).
    """
    output_holder = []
    error_holder = []
    url = f'https://api.tiki.vn/product-detail/api/v1/products/{id}'
    try:
        # The request lives inside the try block so that a connection error or
        # timeout marks this ID as failed instead of propagating to the caller.
        response = requests.get(url, headers=header, timeout=timeout)
        # Turn HTTP error statuses into exceptions rather than parsing an error body.
        response.raise_for_status()
        data = response.json()
        data['description'] = clean_description(data['description'])
        output = {k: data[k] for k in keys}
        output_holder.append(output)
        print(f'ID: {id} - Done')
    except Exception as e:
        # Best effort: report the problem and record the ID so it can be retried.
        print(f'Error: {e}')
        error_holder.append(id)
    return output_holder, error_holder


In [None]:

# Scrape every batch of IDs in parallel and append the products to final.json.
# The file is rewritten after each batch, so an interrupted run keeps the
# batches that already finished.
# NOTE(review): ProcessPoolExecutor needs `get_data` to be importable in the
# worker processes; confirm this works with the kernel's process start method.
failed_ids = []  # IDs that could not be fetched, kept across batches for a later retry
for batch_number, id_list in enumerate(id_batch):
    result = []
    errors = []
    with ProcessPoolExecutor() as executor:
        for data, error in executor.map(get_data, id_list):
            result.extend(data)
            errors.extend(error)
    failed_ids.extend(errors)

    # Load whatever earlier batches stored; start from an empty list when the
    # file does not exist yet or cannot be decoded as JSON.
    final_data = []
    if os.path.exists('final.json'):
        with open('final.json', 'r', encoding='utf-8') as f:
            try:
                final_data = json.load(f)
            except json.JSONDecodeError:
                final_data = []

    # Append this batch exactly once and write the combined list back.
    final_data.extend(result)
    with open('final.json', 'w', encoding='utf-8') as f:
        json.dump(final_data, f, indent=4)

    print(f'Batch {batch_number} - Done')
    # Short pause between batches to avoid hammering the API.
    sleep(2)

In [20]:
# Load the scraped products from final.json into the local MongoDB
# collection tiki.products.
uri = 'mongodb://localhost:27017'
client = MongoClient(uri)
database = client['tiki']
collection = database['products']

# Read the file first and close it before talking to the database.
with open('final.json') as f:
    data = json.load(f)

# insert_many rejects an empty document list, so only insert when there is data.
# NOTE(review): re-running this cell inserts the same documents again unless
# the collection enforces uniqueness - confirm whether that is acceptable.
if data:
    collection.insert_many(data)
else:
    print('final.json contains no products - nothing to insert')