In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy

%matplotlib inline

plt.style.use('seaborn-deep')
plt.rcParams['figure.figsize'] = (12,8)

In [55]:
# Both files are tab-separated and use their first column as the row index.
df_train, df_test = (
    pd.read_csv(path, sep='\t', index_col=0)
    for path in ('train.tsv', 'test_nolabel.tsv')
)

In [56]:
# Target vector.
y = df_train['sold_fast']

# Feature matrix: everything except the target and the columns that are
# excluded from modelling in this notebook.
X = df_train.drop(
    columns=[
        'sold_fast',
        'properties',
        'product_id',
        'owner_id',
        'name_text',
        'desc_text',
        'date_created',
    ]
)

In [61]:
def get_dict(df_column):
    """Build a value -> integer code mapping for a column.

    The distinct values of ``df_column`` are taken in the sorted order
    returned by ``np.unique`` and numbered consecutively from 0.

    Parameters
    ----------
    df_column : array-like
        Values to encode (e.g. ``X.city.values``).

    Returns
    -------
    dict
        Maps each distinct value to its position in the sorted unique values.
    """
    return {value: code for code, value in enumerate(np.unique(df_column))}

In [62]:
# Label-encode the location columns. The columns are replaced with
# ``X[col] = ...`` rather than ``X.loc[:, col] = ...``: since pandas 2.0 the
# ``.loc[:, col]`` form writes into the existing column in place, which keeps
# the old dtype (or trips the incompatible-dtype setitem deprecation) instead
# of producing a proper integer column.
city_dict = get_dict(X.city.values)
# Values that fail the dictionary lookup get the code len(city_dict) + 1.
X['city'] = X['city'].apply(lambda x: city_dict.get(x, len(city_dict) + 1))

region_dict = get_dict(X.region.values)
X['region'] = X['region'].apply(lambda x: region_dict.get(x, len(region_dict) + 1))

# Turn the availability flags into 0/1 integers based on their truthiness.
X['delivery_available'] = X['delivery_available'].apply(lambda x: 1 if x else 0)
X['payment_available'] = X['payment_available'].apply(lambda x: 1 if x else 0)

In [64]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [65]:
# Rows (and matching targets) whose product_type equals 1.
X_pt_1 = X.loc[X.product_type == 1]
y_pt_1 = y.loc[X.product_type == 1]

In [66]:
# product_type is constant within this subset, so remove it from the features.
X_pt_1 = X_pt_1.drop(columns=['product_type'])

In [68]:
# Rows (and matching targets) whose product_type equals 5; the now-constant
# product_type column is removed from the features.
X_pt_5 = X.loc[X.product_type == 5].drop(columns=['product_type'])
y_pt_5 = y.loc[X.product_type == 5]
X_pt_5.shape, y_pt_5.shape

((7056, 118), (7056,))

In [70]:
# Hold out 30% of each product-type subset for validation; the fixed seed
# makes both splits repeatable.
X_train1, X_val1, y_train1, y_val1 = train_test_split(
    X_pt_1,
    y_pt_1,
    test_size=0.3,
    random_state=42,
)
X_train5, X_val5, y_train5, y_val5 = train_test_split(
    X_pt_5,
    y_pt_5,
    test_size=0.3,
    random_state=42,
)

In [71]:
# One shallow random forest per product type. random_state is fixed (same seed
# as the train/validation split) so the fitted trees, and therefore the
# validation scores below, are reproducible across kernel restarts.
clf1 = RandomForestClassifier(max_depth=5, n_estimators=100, random_state=42).fit(X_train1, y_train1)
clf5 = RandomForestClassifier(max_depth=5, n_estimators=100, random_state=42).fit(X_train5, y_train5)

In [72]:
# Column 1 of predict_proba is the probability of the second class in
# clf.classes_; it is used as the ranking score for each validation set.
y_pred1, y_pred5 = (
    clf.predict_proba(X_val)[:, 1]
    for clf, X_val in ((clf1, X_val1), (clf5, X_val5))
)

In [73]:
from sklearn.metrics import roc_auc_score

In [74]:
# Validation ROC AUC for the product_type 1 model and the product_type 5 model.
tuple(
    roc_auc_score(y_true, y_score)
    for y_true, y_score in ((y_val1, y_pred1), (y_val5, y_pred5))
)

(0.5770304242679087, 0.563488851631304)