In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy

%matplotlib inline

plt.style.use('seaborn-deep')
plt.rcParams['figure.figsize'] = (12,8)

In [16]:
df_train = pd.read_csv('train.tsv', sep='\t', index_col=0)
df_test = pd.read_csv('test_nolabel.tsv', sep='\t', index_col=0)

In [17]:
X = df_train.drop(['sold_fast', 'properties', 'product_id', 'owner_id', 'name_text', 'desc_text', 'date_created'], axis=1)
y = df_train['sold_fast']

In [18]:
X.head()

Unnamed: 0,category_id,city,delivery_available,img_num,lat,long,payment_available,price,product_type,region,sold_mode,subcategory_id
1,4,Краснодар,False,3,45.0686,38.9518,True,500.0,1,Краснодарский край,1,410
2,4,Тюмень,False,2,57.184,65.5674,False,300.0,1,Тюменская область,1,405
4,9,Омск,True,1,54.9889,73.4312,True,1100.0,1,Омская область,1,908
6,3,Санкт-Петербург,False,4,59.959,30.4877,True,5000.0,1,Ленинградская область,1,312
10,5,Москва,False,2,55.6473,37.4118,True,2000.0,1,Московская область,1,504


In [19]:
def get_dict(df_column):
    cities = np.unique(df_column)
    city_dict = {}
    for i, city in enumerate(cities):
        city_dict[city] = i
    return city_dict

In [20]:
city_dict = get_dict(X.city.values)
X.loc[:, 'city'] = X.loc[:, 'city'].apply(lambda x: city_dict[x] if x in city_dict else len(city_dict) + 1)

region_dict = get_dict(X.region.values)
X.loc[:, 'region'] = X.loc[:, 'region'].apply(lambda x: region_dict[x] if x in region_dict else len(region_dict) + 1)

X.loc[:, 'delivery_available'] = X.loc[:, 'delivery_available'].apply(lambda x: 1 if x else 0)
X.loc[:, 'payment_available'] = X.loc[:, 'payment_available'].apply(lambda x: 1 if x else 0)

In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [22]:
X_pt_1 = X[X.product_type == 1]
y_pt_1 = y[X.product_type == 1]

In [23]:
X_pt_1.shape, y_pt_1.shape

((344224, 12), (344224,))

In [24]:
X_pt_5 = X[X.product_type == 5]
y_pt_5 = y[X.product_type == 5]
X_pt_5.shape, y_pt_5.shape

((7056, 12), (7056,))

In [25]:
X_pt_1.head()

Unnamed: 0,category_id,city,delivery_available,img_num,lat,long,payment_available,price,product_type,region,sold_mode,subcategory_id
1,4,941,0,3,45.0686,38.9518,1,500.0,1,38,1,410
2,4,2025,0,2,57.184,65.5674,0,300.0,1,93,1,405
4,9,1444,1,1,54.9889,73.4312,1,1100.0,1,56,1,908
6,3,1703,0,4,59.959,30.4877,1,5000.0,1,45,1,312
10,5,1222,0,2,55.6473,37.4118,1,2000.0,1,50,1,504


In [28]:
X_train1, X_val1, y_train1, y_val1 = train_test_split(X_pt_1, y_pt_1, test_size=0.3, random_state=42)
X_train5, X_val5, y_train5, y_val5 = train_test_split(X_pt_5, y_pt_5, test_size=0.3, random_state=42)

In [29]:
clf1 = RandomForestClassifier(max_depth=5, n_estimators=100).fit(X_train1, y_train1)
clf5 = RandomForestClassifier(max_depth=5, n_estimators=100).fit(X_train5, y_train5)

In [34]:
y_pred1 = clf1.predict(X_val1)
y_pred5 = clf5.predict(X_val5)

In [35]:
from sklearn.metrics import roc_auc_score

In [37]:
roc_auc_score(y_val1, y_pred1), roc_auc_score(y_val5, y_pred5)

(0.5, 0.5)

In [38]:
from sklearn.model_selection import cross_validate

In [40]:
cv_results = cross_validate(RandomForestClassifier(), X_pt_1, y_pt_1, cv=5000)

In [None]:
cv_results5 = cross_validate(RandomForestClassifier(), X_pt_5, y_pt_5, cv=5000)