In [1]:
import os
# Set CUDA_VISIBLE_DEVICES to an empty string before the heavy libraries are
# imported in the next cell -- presumably to hide all GPUs so the notebook
# runs on CPU only (TODO confirm intent).
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
import json
import malaya
from tqdm import tqdm
from unidecode import unidecode
import random
from glob import glob

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from malaya.text.vectorizer import SkipGramCountVectorizer

# Stopword list fetched once and shared by both vectorizers below.
stopwords = malaya.text.function.get_stopwords()

# Bag-of-words over 1- to 3-grams; case is preserved (lowercase = False).
bow = CountVectorizer(
    ngram_range = (1, 3),
    stop_words = stopwords,
    lowercase = False,
)

# Same configuration, using malaya's skip-gram vectorizer with skip = 2.
skip_bow = SkipGramCountVectorizer(
    ngram_range = (1, 3),
    stop_words = stopwords,
    lowercase = False,
    skip = 2,
)

In [4]:
# Collect every translated metadata file in the working directory; sorting
# keeps the processing order stable between runs.
files = sorted(glob('meta_*.json.filtered.translated'))
len(files)

13

In [5]:
# Peek at the first matched filename.
files[0]

'meta_Cell_Phones_and_Accessories.json.filtered.translated'

In [6]:
# Selection thresholds: a record is kept when its translated text has more
# than MIN_TOKENS and fewer than MAX_TOKENS whitespace-separated tokens and
# more than MIN_UNIQUE_TOKENS distinct tokens. At most MAX_PER_FILE records
# are sampled from each input file.
MIN_TOKENS, MAX_TOKENS = 20, 300
MIN_UNIQUE_TOKENS = 10
MAX_PER_FILE = 40000

# NOTE: `random` is not seeded anywhere in this notebook, so the sampled
# subset differs from run to run.
selected = []
for f in files:
    selected_ = []
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(f, encoding = 'utf-8') as fopen:
        for l in tqdm(fopen):
            # Tolerate blank lines instead of failing inside json.loads.
            if not l.strip():
                continue
            data = json.loads(l)
            splitted = data['translated'].split()
            if MIN_TOKENS < len(splitted) < MAX_TOKENS and len(set(splitted)) > MIN_UNIQUE_TOKENS:
                selected_.append(data)
    # Cap the contribution of each file so large categories do not dominate.
    selected.extend(random.sample(selected_, min(len(selected_), MAX_PER_FILE)))

100046it [00:02, 49158.78it/s]
16041it [00:01, 15423.25it/s]
12299it [00:00, 54132.92it/s]
44679it [00:00, 57923.72it/s]
50632it [00:02, 17477.19it/s]
100000it [00:06, 14285.84it/s]
100000it [00:06, 15188.76it/s]
100000it [00:06, 16384.58it/s]
10813it [00:00, 47476.29it/s]
26790it [00:01, 20997.11it/s]
100000it [00:08, 11836.38it/s]
100000it [00:04, 20350.18it/s]
84822it [00:02, 32959.26it/s]


In [7]:
# Total number of records kept across all input files.
len(selected)

383303

In [8]:
# Inspect the product title stored under 'data' for the first kept record.
selected[0]['data']['title']

'EZOPower Battery Charger for Samsung GALAXY S2 / SII I9100 / Galaxy S II SGH-i777 / Exhibit 4G T759'

In [9]:
import re

def simple_cleaning(string):
    """Normalise whitespace in `string` after passing it through `unidecode`.

    Newlines become single spaces, runs of spaces are collapsed to one space,
    and leading/trailing whitespace is stripped. Only the space character is
    collapsed; tabs and other whitespace are left as they are.
    """
    flattened = unidecode(string).replace('\n', ' ')
    collapsed = re.sub(r'[ ]+', ' ', flattened)
    return collapsed.strip()

In [10]:
# Try RAKE keyword extraction on the first record, asking for a random
# number (5-12) of top phrases.
t = selected[0]['translated']
keywords_rake = malaya.keyword.extractive.rake(
    t,
    top_k = random.randint(5, 12),
)

# Keep the first occurrence of each phrase, comparing case-insensitively.
# Each RAKE result is indexed at position 1 to get the phrase text.
already, filtered = set(), []
for k in keywords_rake:
    k = k[1]
    lowered = k.lower()
    if lowered not in already:
        already.add(lowered)
        filtered.append(k)

filtered

['<br /> Design compact with collapsible plug for easy storage and portability <br /> Automatik switches to safe mode when full charge is detected to prevent overcharging <br /> Built in short circuit protection <br /> Input',
 'USB Charging Port</b> <br /> Recharge bateri spare directly in wall outlet with this compact EZOPower charger <br /> USB port allows you to charge 2nd device simultaneously',
 '<br /> <br /> <b>Compatible with Samsung Galaxy S2/SII Exhibit SGH-T679/SGH-T759/SGH-i777/i9100 series',
 '<b>EZOPower Samsung GALAXY S2 Battery Wall Charger',
 '2V  350mA <br /> USB Output']

In [11]:
# NOTE: despite its name, this set holds HTML tag fragments, not month names.
# The keyword-filtering cell uses it to reject phrases that contain markup.
# That cell compares these entries against lower-cased phrase text, so the
# upper-case '</LI></UL>' entry cannot match there ('</li>' and '</ul>'
# already cover that case).
months = {
    '<b>',
    '</b>',
    '<br>',
    '</br>',
    '<li>',
    '</li>',
    '<p>',
    '</ul>',
    '</LI></UL>',
}

In [12]:
def _is_usable_keyword(phrase):
    """Return True when a raw RAKE phrase should be kept as a keyword.

    A phrase is kept only if it has more than one whitespace-separated token,
    is longer than 10 and shorter than 150 characters, and contains no HTML
    markup: no `<br` fragment and no entry of `months` (which, despite its
    name, holds HTML tag fragments). Every markup check runs on the
    lower-cased phrase so upper-case tags such as `<BR />` are rejected too.
    """
    if len(phrase.split()) <= 1 or not 10 < len(phrase) < 150:
        return False
    lowered = phrase.lower()
    if '<br' in lowered:
        return False
    # Token-level check: hyphens are removed before splitting so a tag
    # written with a stray hyphen still matches an entry of `months`.
    if set(lowered.replace('-', '').split()) & months:
        return False
    # Substring-level check: catches tags glued to neighbouring text.
    return not any(tag in lowered for tag in months)


# `before` collects the keyword lists and `after` the matching translated
# descriptions; the two lists stay index-aligned.
before, after = [], []
for i, sample in enumerate(tqdm(selected)):
    t = sample['translated']
    try:
        keywords_rake = malaya.keyword.extractive.rake(
            t,
            top_k = random.randint(5, 12),
        )

        # Each RAKE result is indexed at position 1 to get the phrase text.
        keywords_rake = [
            simple_cleaning(k[1]) for k in keywords_rake if _is_usable_keyword(k[1])
        ]

        # De-duplicate case-insensitively, keeping first occurrences in order.
        already = set()
        filtered = []
        for k in keywords_rake:
            lowered = k.lower()
            if lowered not in already:
                already.add(lowered)
                filtered.append(k)

        # The minimum number of keywords required is drawn at random (2-4)
        # for every sample.
        if len(filtered) >= random.randint(2, 4):
            before.append(filtered)
            after.append(t)
    except Exception as e:
        # Best-effort: a failing sample is skipped, but report which one
        # failed so it can be inspected afterwards.
        print(f'sample {i}: {e!r}')

100%|██████████████████████████████████| 383303/383303 [14:38<00:00, 436.14it/s]


In [13]:
# Persist the index-aligned keyword lists ('before') and source descriptions
# ('after') as a single JSON object.
payload = {'before': before, 'after': after}
with open('product-description.json', 'w') as fopen:
    json.dump(payload, fopen)