In [1]:
import numpy as np
import pandas as pd

In [2]:
%time data = pd.read_csv('../train.csv')

CPU times: user 20.5 s, sys: 1.45 s, total: 21.9 s
Wall time: 24.6 s


In [3]:
test = pd.read_csv('../test.csv')

In [4]:
samplesub = pd.read_csv('../sample_submission.csv')

<hr>

In [9]:
data = data.sample(frac = 1) # keep index!

In [10]:
def normalize(curr):
    # remove accent
    curr = curr.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
    # to lower case
    curr = curr.str.lower()
    # remove not alphanumerics or . ,
    curr = curr.str.replace('[^a-zA-Z0-9.,]', ' ')
    
    # let , and . be the same char
    curr = curr.str.replace('[.]', ',')
    
    # remove . , not between numbers
    curr = curr.str.replace('(?<=[0-9])[,]+(?=[0-9])', '.')
    curr = curr.str.replace('[,]', ' ')
    
    # set all digits to 0
    curr = curr.str.replace('[0-9]', '0')
    
    # separate ' <digits><letters ' like in 22g or 12ms
    # curr = curr.str.replace('(^| )([0-9]+)([a-zA-Z]+)($| )', r'\1\2 \3\4')
    
    # remove some Pt plurals
    curr = curr.str.replace('([a-zA-Z]+[aeiou])(s)', r'\1')
    
    # Other ideas: 
    
    return curr

In [11]:
X_data = data.title
test_data = test.title

In [12]:
X_full = pd.concat([X_data, test_data])

In [13]:
%%time 
X_full = normalize(X_full)

CPU times: user 3min 26s, sys: 2.25 s, total: 3min 29s
Wall time: 3min 23s


In [14]:
sp2 = len(data)

In [15]:
%%time

# remove tokens with numbers
X_full = X_full.str.replace('[a-z0-9_]*[0-9][a-z0-9_]*', ' ')
X_norm = X_full[:sp2]
test_norm = X_full[sp2:]

CPU times: user 1min, sys: 71.8 ms, total: 1min
Wall time: 1min


In [16]:
%%time 
# train_words = X_norm.str.split(expand=True).stack().value_counts().to_dict()
train_words = dict()
for t in X_norm:
    for w in t.split():
        train_words[w] = train_words.get(w, 0) + 1

CPU times: user 33.2 s, sys: 12 ms, total: 33.2 s
Wall time: 33.2 s


In [17]:
%%time 
# test_words = test_norm.str.split(expand=True).stack().value_counts().to_dict()
test_words = dict()
for t in test_norm:
    for w in t.split():
        test_words[w] = test_words.get(w, 0) + 1

CPU times: user 387 ms, sys: 35 µs, total: 387 ms
Wall time: 387 ms


In [18]:
len(test_words), len(train_words)

(65398, 686965)

In [19]:
testwseries = pd.Series(test_words).index

In [20]:
temp_test = testwseries.isin(pd.Series(train_words).index)

In [21]:
temp_test.mean()

0.9534389430869445

In [22]:
testwseries[~temp_test].values[-200:]

array(['quianyu', 'renauklt', 'graaan', 'whiteni', 'boguilla', 'batalh',
       'zaragaton', 'platozote', 'cracion', 'abcdefghijlmnopqrstuvxz',
       'avgb', 'turimovr', 'fzlante', 'almohadapileta', 'cabf', 'fltdlc',
       'civemaq', 'autoignicao', 'teleprint', 'supeemercado', 'alyo',
       'estactora', 'megabits', 'gmcb', 'litewind', 'ecolaminator',
       'sainthealth', 'profesionane', 'treab', 'bowo', 'nanoanillo',
       'vitamaca', 'fotolopolimerizador', 'francobolli', 'paei',
       'progyro', 'vehen', 'unicaerv', 'daguer', 'tornazo',
       'portaacrilico', 'barrafix', 'omgear', 'maxjuli', 'lacrex',
       'dynamicut', 'drtagon', 'noboru', 'capacipativa', 'cardiocom',
       'protrectore', 'duplivox', 'microdin', 'plotteable', 'aeroburne',
       'megafome', 'pecorelli', 'abdefg', 'armatazo', 'rockernapper',
       'cormans', 'swereco', 'hirvonen', 'propicepci', 'pitoporium',
       'lidschatten', 'tocadidco', 'conns', 'danning', 'caeprotetora',
       'compostero', 'premiumk

In [23]:
%%time
## how many instances are necessary (assuming X_norm is shuffled already)
test_counter = test_words.copy()
priorities = np.full(len(X_norm), 999_999)

for i, t in enumerate(X_norm):
    for w in t.split():
        if w in test_counter:
            priorities[i] = min(priorities[i], test_words[w] - test_counter[w])
            test_counter[w] -= 1 # 1 for equal ammount, .5 for twice in train, ... # actually any positive val works


CPU times: user 1min 42s, sys: 35.9 ms, total: 1min 42s
Wall time: 1min 42s


In [24]:
data['priorities'] = priorities

In [25]:
data.sort_values(['category', 'label_quality', 'priorities'], inplace=True)

In [26]:
%time reduced = data.groupby(['category'], as_index=False).head(800)

CPU times: user 1.16 s, sys: 280 ms, total: 1.44 s
Wall time: 1.44 s


In [27]:
%%time 
red_norm = normalize(reduced.title)
red_norm = red_norm.str.replace('[a-z0-9_]*[0-9][a-z0-9_]*', ' ')

CPU times: user 17 s, sys: 16 ms, total: 17 s
Wall time: 17 s


In [28]:
%%time 
red_words = red_norm.str.split(expand=True).stack().value_counts().to_dict()

CPU times: user 8.93 s, sys: 448 ms, total: 9.38 s
Wall time: 8.35 s


In [29]:
temp2_test = testwseries.isin(pd.Series(red_words).index)

In [30]:
temp2_test.mean() # 800 ->937, 1000 -> 94

0.9371387504205022

In [31]:
%%time 
left = data.loc[~data.index.isin(reduced.index)].query('priorities <= 5') # 0 -> 1650, 1,0 -> 32k, 1-5 -> 97k, 1-10 -> 188k

CPU times: user 8.93 s, sys: 596 ms, total: 9.53 s
Wall time: 6.17 s


In [32]:
len(left)

188739

In [33]:
reduced = pd.concat([reduced, left]) # next time do a shuffle after this

In [34]:
%%time 
red_norm = normalize(reduced.title)
red_norm = red_norm.str.replace('[a-z0-9_]*[0-9][a-z0-9_]*', ' ')
red_words = red_norm.str.split(expand=True).stack().value_counts().to_dict()
temp2_test = testwseries.isin(pd.Series(red_words).index)

CPU times: user 31.6 s, sys: 356 ms, total: 32 s
Wall time: 30.2 s


In [35]:
temp2_test.mean()

0.9534389430869445

<hr>

## Data Simplification

In [36]:
from sklearn.preprocessing import LabelEncoder

In [37]:
le = LabelEncoder()

In [38]:
le.fit(data.category)

LabelEncoder()

In [39]:
# must save this for later

le.classes_

array(['3D_GLASSES', '3D_PENS', '3D_PRINTERS', ..., 'YARNS', 'YOGA_MATS',
       'YOGURT_MAKERS'], dtype=object)

In [40]:
# change target to numeric

reduced['category'] = le.transform(reduced.category)

In [41]:
# change label_quality to binary integer

reduced['label_quality'] =  reduced.label_quality.map({'reliable' : 0, 'unreliable' : 1})

In [42]:
# change language to binary integer

reduced['language'] =  reduced.language.map({'spanish' : 0, 'portuguese' : 1})

In [43]:
reduced.head()

Unnamed: 0,title,label_quality,language,category,priorities
6737325,Anteojo Lg 3d Pasivo,0,0,0,79
13643907,Samsung 55 3d-un55f8000. Ultima Semana D Publi...,0,0,0,120
301995,Samsung Lentes 3d Activo Ssg-3300gr Caja Cerr...,0,0,0,230
18914336,Anteojos Activos 3d (x1 Lente) Noblex-jvc-pion...,0,0,0,350
13966351,Sony Lentes 3d Recargables Tdg-br250 !! Oferta...,0,0,0,573


In [44]:
reduced.shape

(1444624, 5)

In [None]:
reduced = reduced.sample(frac = 1)

In [45]:
reduced.to_csv('../data-reduced-800-v2-shuffled.csv', index = True)