In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy

%matplotlib inline

# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and 3.8 removed the old names, so use whichever spelling this install ships.
_deep_style = 'seaborn-deep' if 'seaborn-deep' in plt.style.available else 'seaborn-v0_8-deep'
plt.style.use(_deep_style)
# Default size (in inches) for every figure drawn in this notebook.
plt.rcParams['figure.figsize'] = (12,8)

In [2]:
# Both files are tab-separated; their first column is used as the row index.
_tsv_opts = {'sep': '\t', 'index_col': 0}
df_train = pd.read_csv('train.tsv', **_tsv_opts)
df_test = pd.read_csv('test_nolabel.tsv', **_tsv_opts)

In [3]:
# Show the first rows of the training frame to check that the columns loaded as expected.
df_train.head()

Unnamed: 0,category_id,city,date_created,delivery_available,desc_text,img_num,lat,long,name_text,owner_id,payment_available,price,product_id,product_type,properties,region,sold_mode,subcategory_id,sold_fast
1,4,Краснодар,2018-10-08,False,"Продаю стол раскладной, деревянный, советский ...",3,45.0686,38.9518,Стол,4ce583fe8231a0cc4a3c7d241c7d0289,True,500.0,8cb80c05c65c210275f5500779d6b593,1,"[{'slug_id': 'stoly_stulya_tip', 'slug_name': ...",Краснодарский край,1,410,1
2,4,Тюмень,2018-06-18,False,"Тарелки глубокие 6 шт. Блюдца, чашки по 6 шт. ...",2,57.184,65.5674,Посуда,e58be2c8f143c17246dc2243b5d3b98f,False,300.0,3b7a9f8b27a53b63525f95bc8070abb2,1,"[{'slug_id': 'dom_dacha_posuda_tip', 'slug_nam...",Тюменская область,1,405,0
4,9,Омск,2018-07-31,True,"Новый,с этикеткой. Размер L. Не подошёл по раз...",1,54.9889,73.4312,Костюм,51b408796027214232532b7e478e2159,True,1100.0,c97dd9c5a3e938c52cf5d7822bc0eb7b,1,[{'slug_id': 'zhenskaya_odezhda_pidzhaki_kosty...,Омская область,1,908,0
6,3,Санкт-Петербург,2018-04-17,False,"Складывается тростью, все колеса вниз. Сплошна...",4,59.959,30.4877,Коляска,6544b83acbbf04439a7ba983093cafb4,True,5000.0,3e5d0286b25fd7f62f88bc436a59ae4e,1,"[{'slug_id': 'waggon_type', 'slug_name': 'Тип'...",Ленинградская область,1,312,0
10,5,Москва,2018-02-09,False,"Неразлучники, птичкам по 1,5 года. Продаю с бо...",2,55.6473,37.4118,Волнистые попугаи,ea575e28daf1f47bfce63015cd3ce5cf,True,2000.0,57b4a8679d0d3eb1e31367b57221098f,1,[],Московская область,1,504,0


In [4]:
# Target: the `sold_fast` column of the training frame.
y = df_train['sold_fast']
# Feature frame: everything except the target, the raw `properties` column
# and the two identifier columns.
X = df_train.drop(columns=['sold_fast', 'properties', 'product_id', 'owner_id'])

In [9]:
def cat2proba_dict(values, labels):
    """Build a target-encoding table: category -> share of its rows labelled 1.

    Args:
        values: 1-D array-like of category values, one entry per row.
        labels: 1-D array-like of labels aligned with `values`; only entries
            equal to 1 are counted as positives.

    Returns:
        dict mapping every distinct value found in `values` to a Python float
        in [0, 1]. Empty input yields an empty dict.

    Raises:
        ValueError: if `values` and `labels` differ in length (raised by
            ``np.bincount``).
    """
    values = np.asarray(values).ravel()
    is_positive = (np.asarray(labels).ravel() == 1).astype(float)

    if values.size == 0:
        return {}

    # `inverse[i]` is the position in `categories` of row i, so two bincounts
    # give the per-category row count and positive count in a single pass,
    # instead of re-scanning the whole array once per category.
    categories, inverse = np.unique(values, return_inverse=True)
    inverse = inverse.ravel()
    totals = np.bincount(inverse, minlength=len(categories))
    positives = np.bincount(inverse, weights=is_positive, minlength=len(categories))

    # Every category in `categories` occurs at least once, so `totals` has no
    # zeros and the division is always defined (including for NaN categories,
    # which an equality mask would never match).
    return {
        category: float(n_pos / n_all)
        for category, n_pos, n_all in zip(categories, positives, totals)
    }

def cat2proba(cat_dict, values):
    """Translate category values into their encoded probabilities.

    Args:
        cat_dict: mapping category -> probability, as produced by
            `cat2proba_dict`.
        values: iterable of category values to look up.

    Returns:
        1-D numpy array with one entry per element of `values`; categories
        missing from `cat_dict` are encoded as 0.
    """
    return np.array([cat_dict.get(item, 0) for item in values])
def date2ymd(date):
    """Split a dash-separated date string into ``[year, month, day]`` integers.

    Only the first three dash-separated fields are read; any further fields
    are ignored.
    """
    fields = date.split('-')
    return [int(fields[position]) for position in range(3)]

In [7]:
# Target-encode the categorical columns: each distinct value maps to the share
# of its training rows where sold_fast == 1.
# NOTE(review): the shares are computed on all of df_train, i.e. before the
# train/validation split made further down, so validation rows contribute to
# their own encodings. Confirm whether this leakage is acceptable.
category_id_dict, subcategory_id_dict, city_dict, region_dict = [
    cat2proba_dict(df_train[column].values, y)
    for column in ('category_id', 'subcategory_id', 'city', 'region')
]

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Raw text columns of the training frame.
name_text = df_train['name_text'].values
desc_text = df_train['desc_text'].values

# One TF-IDF vocabulary per text column, each capped at 100 terms.
# `fit` returns the vectorizer itself, so construction and fitting are chained.
vectorizer_name_text = TfidfVectorizer(max_features=100, decode_error='ignore').fit(name_text)
vectorizer_desc_text = TfidfVectorizer(max_features=100, decode_error='ignore').fit(desc_text)

# Echo the description vectorizer so the cell still displays its configuration.
vectorizer_desc_text

TfidfVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=100, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [28]:
def preprocessing(data):
    """Convert a raw listings frame into the dense feature matrix for the model.

    Column layout of the result, left to right:
      1. category_id, lat, long, price, product_type, sold_mode,
         subcategory_id, img_num taken as-is;
      2. year, month, day parsed from `date_created`;
      3. payment_available and delivery_available multiplied by 1.0;
      4. encoded category_id, subcategory_id, city and region (one column each);
      5. TF-IDF of `name_text`, then TF-IDF of `desc_text`.

    Relies on module-level state prepared by earlier cells: the four
    `*_dict` encoding tables and the two fitted TF-IDF vectorizers.

    Args:
        data: DataFrame holding the columns listed above.

    Returns:
        2-D numpy array with one row per row of `data`.
    """
    numeric_columns = ['category_id', 'lat', 'long', 'price', 'product_type',
                       'sold_mode', 'subcategory_id', 'img_num']
    numeric_block = data[numeric_columns].values

    date_block = np.array([date2ymd(stamp) for stamp in data['date_created'].values])

    # Multiplying by 1. turns the flag columns into floats.
    flag_block = data[['payment_available', 'delivery_available']].values * 1.

    # Each encoded feature is reshaped to a single column so it can be
    # concatenated with the 2-D blocks.
    encoded_blocks = [
        cat2proba(lookup, data[column].values).reshape(-1, 1)
        for lookup, column in (
            (category_id_dict, 'category_id'),
            (subcategory_id_dict, 'subcategory_id'),
            (city_dict, 'city'),
            (region_dict, 'region'),
        )
    ]

    # The vectorizers return sparse matrices; densify them for concatenation.
    name_block = vectorizer_name_text.transform(data['name_text'].values).toarray()
    desc_block = vectorizer_desc_text.transform(data['desc_text'].values).toarray()

    return np.concatenate(
        [numeric_block, date_block, flag_block, *encoded_blocks, name_block, desc_block],
        axis=1,
    )

In [29]:
# Build the feature matrix for the whole training frame.
# NOTE: the later train_test_split cell rebinds X_train to the training part
# only, so this cell must be re-run before that split cell is re-run.
X_train = preprocessing(X)

(351281, 100)


In [15]:
# Check the dimensions (rows, feature columns) of the feature matrix.
X_train.shape

(351281, 217)

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for validation; the fixed seed keeps the split reproducible.
# NOTE(review): X_train is both input and output here, so re-running this cell
# without first rebuilding X_train passes an already-split matrix together
# with the full `y`.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y,
    random_state=42,
    test_size=0.33,
)

In [20]:
from xgboost import XGBClassifier

# Gradient-boosted trees: 200 trees of depth up to 8 with a 0.02 learning rate;
# each tree is built on a 70% sample of the feature columns.
clf = XGBClassifier(
    n_estimators=200,
    learning_rate=0.02,
    max_depth=8,
    n_jobs=3,
    colsample_bytree=0.7,
    scale_pos_weight=1.,
)
clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.7, gamma=0,
       learning_rate=0.02, max_delta_step=0, max_depth=8,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=3,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1.0, seed=None,
       silent=None, subsample=1, verbosity=1)

In [21]:
# Keep column 1 of predict_proba for each validation row
# (presumably the probability of label 1, given 0/1 targets).
y_val_pred = clf.predict_proba(X_val)[:, 1]

In [23]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the predicted probabilities against the hold-out labels.
auc_val = roc_auc_score(y_val, y_val_pred)

# Labelled "Validation": the score is computed on the split held out of the
# training file, not on test_nolabel.tsv, which has no labels to score against.
print("Validation AUC: ", auc_val)

Test AUC:  0.6462071073189568


In [24]:
# Read the unlabeled test file for scoring. This is the same file the load cell
# reads into df_test, but here without index_col=0, so the first column stays
# a regular column.
data_submit = pd.read_csv('test_nolabel.tsv', sep = '\t')

In [25]:
# Featurize the test rows with the same preprocessing() used for training, so
# the encoding tables and vectorizers fitted on the training data are reused.
X_submit = preprocessing(data_submit)

In [26]:
# Score each test row with column 1 of predict_proba (same column as used for validation).
y_submit_pred = clf.predict_proba(X_submit)[:, 1]

In [27]:
# Pair every product id with its predicted score and write the submission file.
# NOTE(review): data_submit is rebound from the raw test frame to the
# two-column submission frame, so the featurization cell cannot be re-run
# afterwards without reloading the test file.
product_id = data_submit['product_id'].values
data_submit = pd.DataFrame({'product_id': product_id, 'score': y_submit_pred})
data_submit.to_csv('./to_submit', sep=',', index=False)