In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy

%matplotlib inline

plt.style.use('seaborn-deep')
plt.rcParams['figure.figsize'] = (12,8)

In [130]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score

from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 
from sklearn.pipeline import Pipeline

from itertools import chain

In [1]:
# ! git clone https://github.com/facebookresearch/fastText.git
# % cd fastText
# ! pip install .

In [3]:
import fastText

In [5]:
% cd ..

C:\Users\Natalia\Documents\kaggle_competition


In [6]:
df_train = pd.read_csv('train.tsv', sep='\t', index_col=0)
df_test = pd.read_csv('test_nolabel.tsv', sep='\t', index_col=0)

# Lemmatize descriptions

In [23]:
from string import punctuation
from tqdm import tqdm
import re

In [2]:
# !pip3 install pymorphy2
import pymorphy2

In [67]:
import nltk
# nltk.download('wordnet')
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords = stopwords.words('russian')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Natalia\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [85]:
def lemmatize(texts):
    stemmer = pymorphy2.MorphAnalyzer()
    lemmatized = []
    for text in tqdm(texts):
        text = [word for word in re.split('\W+', text)]
        tokens = [stemmer.parse(word.strip())[0].normal_form for word in text]
        tokens = list(filter(lambda x: x.strip() not in ['', ' ', '> <', '»', '«', '–'] and x not in punctuation \
                             and x not in stopwords, tokens))
        lemmatized.append(' '.join(tokens))
    return np.array(lemmatized, dtype='object')

In [86]:
#descriptions = np.concatenate((df_train.desc_text.values, df_test.desc_text.values), axis=None)
descriptions = df_train.desc_text.values
descriptions[:2]

array(['Продаю стол раскладной, деревянный, советский на века. Состояние осень хорошее. Торг уместен. Самовывоз ФМР',
       'Тарелки глубокие 6 шт. Блюдца, чашки по 6 шт. Все ранешний фарфор. В другом пакете современные блюдца, чашки и чайник с мишкой Тедди. Ничем практически не пользовались'],
      dtype=object)

In [3]:
lemmatized = lemmatize(descriptions)
# lemmatized[:5]

In [96]:
np.savetxt('lemmatized_train_description.csv', lemmatized, delimiter=',', encoding='utf-8', fmt="%s")

In [4]:
descriptions_test = df_test.desc_text.values

lemmatized_desc_test = lemmatize(descriptions_test)
# lemmatized_desc_test[:5]

In [326]:
lemmatized_desc_test.shape

(89251,)

In [97]:
np.savetxt('lemmatized_test_description.csv', descriptions_test, delimiter=',', encoding='utf-8', fmt="%s")

In [119]:
# df_train.loc[:, 'desc_text'] = lemmatized

# Apply fastText

In [292]:
def applyFastText(texts, target):
    y = target.apply(lambda x: '__label__' + str(x))

    fastText_df = pd.DataFrame(columns=['text', 'label'], index=np.arange(y.shape[0]), \
                           data=np.vstack([texts, y.values]).T)
    fastText_df.to_csv('to_train_ft.txt', index=False, sep=' ', encoding='utf-8')
    
    fastText_model = fastText.train_supervised('to_train_ft.txt', wordNgrams=2)
    
    return fastText_model    
    
def get_fastText_proba(model, X_train):
    y_proba = model.predict(list(X_train), k=2)
    probas_pos = []
    for i in np.arange(len(y_proba[0])):
        label = y_proba[0][i][0]
        if label == '__label__0':
            probas_pos.append(y_proba[1][i][1])
        else:
            probas_pos.append(y_proba[1][i][0])   
    return np.array(probas_pos)

In [293]:
fastText_model = applyFastText(lemmatized, df_train.sold_fast)

In [294]:
ft_probas_pos = get_fastText_proba(fastText_model, lemmatized)

In [296]:
np.array(ft_probas_pos)[:5]

array([0.27031934, 0.51763797, 0.10651568, 0.17362656, 0.08377719])

In [297]:
np.savetxt('TRAIN_fastText_desc_posproba.csv', np.array(ft_probas_pos), delimiter=',', encoding='utf-8')

In [186]:
# y_pred_ft = fastText_model.predict(list(fastText_df.text.values))[0]
# y_pred_ft = list(chain(*y_pred_ft))

In [187]:
# y_pred_ft = list(map(lambda x: x.replace('__label__', ''), y_pred_ft))
# true_labels = list(map(lambda x: x.replace('__label__', ''), fastText_df.label.values))

In [188]:
# print('Accuracy:', accuracy_score(true_labels, np.array(y_pred_ft)))
# print('Fscore:', f1_score(true_labels, np.array(y_pred_ft), average='weighted'))

Accuracy: 0.7889666677104654
Fscore: 0.7458280584999859


In [298]:
from sklearn.metrics import roc_auc_score

auc_val = roc_auc_score(df_train.sold_fast.values, ft_probas_pos)
auc_val

0.7669134248324669

#### Test values

In [299]:
test_ft_probas_pos = get_fastText_proba(fastText_model, lemmatized_desc_test)

In [300]:
np.savetxt('TEST_fastText_desc_posproba.csv', test_ft_probas_pos, delimiter=',', encoding='utf-8')

## Same deal, but for name

In [5]:
names = df_train.name_text.values
lemmatized_names = lemmatize(names)
np.savetxt('lemmatized_train_name.csv', lemmatized_names, delimiter=',', encoding='utf-8', fmt="%s")

In [6]:
test_names = df_test.name_text.values
test_lemmatized_names = lemmatize(test_names)
np.savetxt('lemmatized_test_name.csv', test_lemmatized_names, delimiter=',', encoding='utf-8', fmt="%s")

In [301]:
fastText_model_name = applyFastText(lemmatized_names, df_train.sold_fast)

In [304]:
train_ft_probas_pos_name = get_fastText_proba(fastText_model_name, lemmatized_names) 

In [305]:
np.savetxt('TRAIN_fastText_name_posproba.csv', train_ft_probas_pos_name, delimiter=',', encoding='utf-8')

#### Test values

In [306]:
test_ft_probas_pos_name = get_fastText_proba(fastText_model_name, test_lemmatized_names)

In [307]:
np.savetxt('TEST_fastText_name_posproba.csv', test_ft_probas_pos_name, delimiter=',', encoding='utf-8')

## Now replacing `desc_text` and `name_text` with probas

In [274]:
X = df_train.drop(['sold_fast', 'properties', 'product_id', 'owner_id', 'desc_text', 'name_text'], axis=1)
y = df_train['sold_fast']

In [310]:
desc_proba = pd.read_csv('TRAIN_fastText_desc_posproba.csv', header=None)
name_proba = pd.read_csv('TRAIN_fastText_name_posproba.csv', header=None)
X['desc_prob'] = desc_proba.values
X['name_prob'] = name_proba.values
X.head()

Unnamed: 0,category_id,city,date_created,delivery_available,img_num,lat,long,payment_available,price,product_type,region,sold_mode,subcategory_id,desc_prob,name_prob
1,4,Краснодар,2018-10-08,False,3,45.0686,38.9518,True,500.0,1,Краснодарский край,1,410,0.270319,0.313682
2,4,Тюмень,2018-06-18,False,2,57.184,65.5674,False,300.0,1,Тюменская область,1,405,0.517638,0.442623
4,9,Омск,2018-07-31,True,1,54.9889,73.4312,True,1100.0,1,Омская область,1,908,0.106516,0.151561
6,3,Санкт-Петербург,2018-04-17,False,4,59.959,30.4877,True,5000.0,1,Ленинградская область,1,312,0.173627,0.259099
10,5,Москва,2018-02-09,False,2,55.6473,37.4118,True,2000.0,1,Московская область,1,504,0.083777,0.398533


In [311]:
def cat2proba_dict(values, labels):
    cat_dict = {}
    probas = []
    for aval in np.unique(values):
        labels_for_val = labels[values == aval]
        n = len(labels_for_val)
        n_1 = len(labels_for_val[labels_for_val == 1])
        p = 1.0 * n_1 / n
        cat_dict[aval] = p
    return cat_dict

def cat2proba(cat_dict, values):
    probas = []
    for aval in values:
        if aval in cat_dict:
            probas.append(cat_dict[aval])
        else:
            probas.append(0)
    return np.array(probas)
def date2ymd(date):
    date_plitted = date.split('-')
    year = int(date_plitted[0])
    month = int(date_plitted[1])
    day = int(date_plitted[2])
    return [year, month, day]

In [312]:
category_id_dict = cat2proba_dict(df_train['category_id'].values, y)
subcategory_id_dict = cat2proba_dict(df_train['subcategory_id'].values, y)

city_dict = cat2proba_dict(df_train['city'].values, y)
region_dict = cat2proba_dict(df_train['region'].values, y)

In [313]:
def preprocess(X, category_id_dict, subcategory_id_dict, city_dict, region_dict):
    dates = np.array([date2ymd(date) for date in X.loc[:, 'date_created'].values])
    #year is the same everywhere, getting rid of it
    X.loc[:, 'month'] = dates[:, 1]
    X.loc[:, 'day'] = dates[:, 2]
    X = X.drop(['date_created'], axis=1)
    
    X.loc[:, 'city'] = cat2proba(city_dict, X['city'].values)
    X.loc[:, 'region'] = cat2proba(region_dict, X['region'].values)
    
    X.loc[:, 'category_id'] = cat2proba(category_id_dict, X['category_id'].values)
    X.loc[:, 'subcategory_id'] = cat2proba(category_id_dict, X['subcategory_id'].values)
    
    X.loc[:, 'delivery_available'] = X.loc[:, 'delivery_available'] * 1.
    X.loc[:, 'payment_available'] = X.loc[:, 'payment_available'] * 1.
    
    return X

In [314]:
X = preprocess(X, category_id_dict, subcategory_id_dict, city_dict, region_dict)

In [315]:
X.head()

Unnamed: 0,category_id,city,delivery_available,img_num,lat,long,payment_available,price,product_type,region,sold_mode,subcategory_id,desc_prob,name_prob,month,day
1,0.258829,0.227393,0.0,3,45.0686,38.9518,1.0,500.0,1,0.226545,1,0,0.270319,0.313682,10,8
2,0.258829,0.226342,0.0,2,57.184,65.5674,0.0,300.0,1,0.227446,1,0,0.517638,0.442623,6,18
4,0.162397,0.235826,1.0,1,54.9889,73.4312,1.0,1100.0,1,0.234856,1,0,0.106516,0.151561,7,31
6,0.237816,0.229844,0.0,4,59.959,30.4877,1.0,5000.0,1,0.23102,1,0,0.173627,0.259099,4,17
10,0.238401,0.230473,0.0,2,55.6473,37.4118,1.0,2000.0,1,0.227684,1,0,0.083777,0.398533,2,9


In [331]:
X.shape

(351281, 16)

In [338]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

In [339]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

In [340]:
# clf = RandomForestClassifier(n_estimators=500, n_jobs=3, class_weight='balanced', max_depth=4)
clf = XGBClassifier(n_estimators=200, learning_rate=0.02, max_depth=8, n_jobs=3, colsample_bytree=0.7, scale_pos_weight=1.)
clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.7, gamma=0,
       learning_rate=0.02, max_delta_step=0, max_depth=8,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=3,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1.0, seed=None,
       silent=None, subsample=1, verbosity=1)

In [341]:
y_val_pred = clf.predict_proba(X_val)[:, 1]

auc_val = roc_auc_score(y_val, y_val_pred)
print("Test AUC: ", auc_val) # biased test auc

Test AUC:  0.7873901851817262


## Transform test likewise

In [335]:
X_test = df_test.drop(['properties', 'product_id', 'owner_id', 'desc_text', 'name_text'], axis=1)
desc_proba = pd.read_csv('TEST_fastText_desc_posproba.csv', header=None)
name_proba = pd.read_csv('TEST_fastText_name_posproba.csv', header=None)
X_test['desc_prob'] = desc_proba.values
X_test['name_prob'] = name_proba.values
X_test = preprocess(X_test, category_id_dict, subcategory_id_dict, city_dict, region_dict)
X_test.head()

Unnamed: 0,category_id,city,delivery_available,img_num,lat,long,payment_available,price,product_type,region,sold_mode,subcategory_id,desc_prob,name_prob,month,day
0,0.162397,0.229716,1.0,3,55.2639,61.3972,1.0,650.0,1,0.229135,1,0,0.268471,0.172749,10,22
3,0.16987,0.248002,1.0,2,53.3426,83.738,1.0,750.0,1,0.242036,1,0,0.057047,0.125106,11,9
8,0.293339,0.250707,0.0,4,52.0883,113.4982,1.0,5500.0,1,0.250206,1,0,0.231807,0.184615,11,27
9,0.237816,0.230473,1.0,2,55.6932,37.5684,1.0,100.0,1,0.227684,1,0,0.123092,0.070957,11,25
19,0.16987,0.215649,1.0,2,54.7648,83.0884,1.0,700.0,1,0.232747,1,0,0.316553,0.212875,11,4


In [336]:
y_submit_pred = clf.predict_proba(X_test)[:, 1]

product_id = df_test['product_id'].values
data_submit = pd.DataFrame.from_dict({'product_id' : product_id, 'score' : y_submit_pred})
data_submit.to_csv('./to_submit.csv', sep = ',', index = False)

In [337]:
y_submit_pred.shape

(89251,)

Scores 0.58 on the leaderboard :(