In [None]:
import glob
import os
import unicodedata

import pandas as pd

In [None]:
# Product category to process; it is interpolated into the input and output
# file paths used by the cells below.
name = 'fashion'

In [None]:
# Collect every review CSV of the selected category. glob returns paths in an
# arbitrary, OS-dependent order, so sort them to keep the row order of the
# concatenated frame reproducible between runs and machines.
all_files = sorted(glob.glob(os.path.join(f"../../data/raw_data/reviews/{name}", "*.csv")))

# pd.concat fails with an opaque "No objects to concatenate" on empty input;
# report the directory that was searched instead.
if not all_files:
    raise FileNotFoundError(f"No review CSV files found in ../../data/raw_data/reviews/{name}")

# Stack all files into one frame with a fresh 0..n-1 index.
reviews_df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

In [None]:
# Display the concatenated reviews frame as the cell output.
reviews_df

In [None]:
# Read the product table of the selected category and show it as the cell output.
products_path = f'../../data/product_{name}.csv'
products_df = pd.read_csv(products_path, encoding='utf8')
products_df

In [None]:
# Inner-join products with their reviews: only (product_id, shop_id) pairs
# present in both frames survive.
join_keys = ['product_id', 'shop_id']
df = products_df.merge(reviews_df, how='inner', on=join_keys)

df

In [None]:
# Turn on pandas copy-on-write mode for the rest of the session.
pd.set_option("mode.copy_on_write", True)

In [None]:
# Drop exact duplicate rows and rows with any missing value, then rebuild a
# gap-free 0..n-1 index. Without the reset the surviving rows keep their old
# labels, so any later label-based slicing driven by range(len(df)) would
# silently skip rows whose labels exceed the new row count.
# Written as one chained expression (no inplace=True) so re-running the cell
# is idempotent.
df = df.drop_duplicates().dropna().reset_index(drop=True)

df

In [None]:
# Show the first ten rows of the cleaned frame.
df.head(10)

In [None]:
import re
import numpy as np
from tqdm import tqdm

# Tone-placement table: each key is an "oa" / "oe" / "uy" vowel pair with the
# tone mark on the first vowel, each value is the same pair with the tone mark
# on the second vowel. Every pair is listed in lower, Capitalised and UPPER case.
dict_map = dict([
    # o + a
    ("òa", "oà"), ("Òa", "Oà"), ("ÒA", "OÀ"),
    ("óa", "oá"), ("Óa", "Oá"), ("ÓA", "OÁ"),
    ("ỏa", "oả"), ("Ỏa", "Oả"), ("ỎA", "OẢ"),
    ("õa", "oã"), ("Õa", "Oã"), ("ÕA", "OÃ"),
    ("ọa", "oạ"), ("Ọa", "Oạ"), ("ỌA", "OẠ"),
    # o + e
    ("òe", "oè"), ("Òe", "Oè"), ("ÒE", "OÈ"),
    ("óe", "oé"), ("Óe", "Oé"), ("ÓE", "OÉ"),
    ("ỏe", "oẻ"), ("Ỏe", "Oẻ"), ("ỎE", "OẺ"),
    ("õe", "oẽ"), ("Õe", "Oẽ"), ("ÕE", "OẼ"),
    ("ọe", "oẹ"), ("Ọe", "Oẹ"), ("ỌE", "OẸ"),
    # u + y
    ("ùy", "uỳ"), ("Ùy", "Uỳ"), ("ÙY", "UỲ"),
    ("úy", "uý"), ("Úy", "Uý"), ("ÚY", "UÝ"),
    ("ủy", "uỷ"), ("Ủy", "Uỷ"), ("ỦY", "UỶ"),
    ("ũy", "uỹ"), ("Ũy", "Uỹ"), ("ŨY", "UỸ"),
    ("ụy", "uỵ"), ("Ụy", "Uỵ"), ("ỤY", "UỴ"),
])

def replace_all(text, mapping=None):
    """Rewrite misplaced Vietnamese tone marks according to a lookup table.

    Args:
        text: Value to normalise; it is coerced with ``str()`` first, so
            non-string inputs are accepted.
        mapping: Optional ``{misplaced: corrected}`` substring table. When
            omitted, the module-level ``dict_map`` is used.

    Returns:
        The NFC-normalised text with every key of the table replaced by its
        value.
    """
    if mapping is None:
        mapping = dict_map

    # Compose base letters and combining marks first. On decomposed input
    # (NFD/NFKD) the tone is a separate code point, so precomposed keys such
    # as "òa" would never match and the whole function would be a no-op.
    # NOTE(review): assumes the table keys themselves are stored precomposed.
    text = unicodedata.normalize("NFC", str(text))

    for misplaced, corrected in mapping.items():
        text = text.replace(misplaced, corrected)
    return text


# Matches runs of emoji / pictographic symbols so they can be stripped from text.
# Every range stays inside a symbol block on purpose: one catch-all range such
# as U+24C2..U+1F251 also spans CJK ideographs, kana, Hangul and full-width
# forms, and would delete ordinary text written in those scripts.
EMOJI_PATTERN = re.compile(
    "["
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U0001F000-\U0001F251"  # Mahjong tiles .. Enclosed Ideographic Supplement
    "\U000025A0-\U000025FF"  # Geometric Shapes
    "\U00002600-\U000026FF"  # Miscellaneous Symbols
    "\U00002700-\U000027BF"  # Dingbats
    "\U00002934-\U00002935"  # curved arrows
    "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
    "\U0000FE00-\U0000FE0F"  # variation selectors (emoji presentation)
    "\U000024C2"             # circled M
    "\U00003030\U0000303D\U00003297\U00003299"  # emoji that live in CJK symbol blocks
    "]+"
)

chunk_count = 1000000  # rows cleaned per pass
num_of_saves = 50      # maximum number of CSV files written per pass
file_count = 0         # running index used in the output file names

# Raw strings, so \w and \s reach the regex engine instead of being parsed as
# (invalid) Python string escapes.
HASHTAG_PATTERN = r"(#\w+\s*)+"
WHITESPACE_PATTERN = r"\s+"


def _clean_text(series):
    """Return a cleaned copy of a free-text column.

    Steps, in order: drop emojis, replace hashtag runs with a space, collapse
    every whitespace run (newlines included) to one space, trim both ends,
    apply NFKC Unicode normalisation, fix Vietnamese tone placement.

    Hashtags are removed before the whitespace collapse so the space left in
    their place cannot create double or trailing blanks. NFKC (composed) is
    used rather than NFKD because the tone table in ``dict_map`` is keyed on
    precomposed letters and cannot match decomposed text.
    """
    return (
        series.astype(str)
        .str.replace(EMOJI_PATTERN, '', regex=True)
        .str.replace(HASHTAG_PATTERN, ' ', regex=True)
        .str.replace(WHITESPACE_PATTERN, ' ', regex=True)
        .str.strip()
        .str.normalize('NFKC')
        .map(replace_all)
    )


# to_csv does not create missing parent directories.
os.makedirs(f'./data/merge/{name}', exist_ok=True)

for start in tqdm(range(0, df.shape[0], chunk_count)):
    # Positional, half-open slice: label slicing with .loc is inclusive on both
    # ends (boundary rows would land in two chunks) and depends on the index
    # being a gap-free 0..n-1 range. Copy so column assignment is safe
    # regardless of the copy-on-write setting.
    chunk = df.iloc[start:start + chunk_count].copy()

    chunk['description'] = _clean_text(chunk['description'])
    chunk['comment'] = _clean_text(chunk['comment'])

    chunk['product_id'] = chunk['product_id'].astype(str)
    chunk['shop_id'] = chunk['shop_id'].astype(str)

    # Approximate word count of description + comment: spaces + 1 for each
    # field. Both counts come from the cleaned chunk columns.
    chunk['length'] = (
        chunk['description'].str.count(' ') + chunk['comment'].str.count(' ') + 2
    ).astype(int)

    chunk_length_less_512 = chunk[chunk['length'] <= 512]

    # Reviews per (product, shop) inside this chunk, most reviewed first.
    # NOTE(review): counts are per chunk, so a product whose reviews straddle
    # a chunk boundary is counted separately in each chunk — confirm intended.
    reviews_count_chunk = (
        chunk_length_less_512
        .groupby(['product_id', 'shop_id'])['comment']
        .count()
        .reset_index(name='count')
        .sort_values(['count'], ascending=False)
    )
    more_than_20_less_than_50_ids = reviews_count_chunk[reviews_count_chunk['count'].between(20, 50)]
    more_than_20_less_than_50 = chunk_length_less_512.merge(
        more_than_20_less_than_50_ids, 'inner', on=['product_id', 'shop_id']
    )

    # Split row positions rather than the frame itself: np.array_split on a
    # DataFrame is deprecated in recent pandas. Section sizes are identical.
    for positions in np.array_split(np.arange(len(more_than_20_less_than_50)), num_of_saves):
        if positions.size == 0:
            continue  # do not write header-only files
        part = more_than_20_less_than_50.iloc[positions]
        part.to_csv(f'./data/merge/{name}/{name}_chunk_{file_count}.csv', encoding='utf-8-sig', index=False)
        file_count += 1

In [None]:
# Preview the filtered reviews (products with 20-50 reviews of at most 512 words).
# Built from `chunk_length_less_512`, the frame the processing loop defines;
# `df_length_less_512` is not defined anywhere in this notebook and raised a
# NameError on a fresh kernel.
# NOTE: both inputs are left over from the final loop iteration, so this shows
# the last chunk only.
more_than_20_less_than_50 = chunk_length_less_512.merge(more_than_20_less_than_50_ids, 'inner', left_on=['product_id', 'shop_id'], right_on=['product_id', 'shop_id'])
more_than_20_less_than_50