In [47]:
import dask
import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats as st
from scipy.stats import norm
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [48]:
# Load the CSV lazily as a dask DataFrame.
# - assume_missing=True: integer columns without an explicit dtype are read as floats,
#   so NaNs appearing in later partitions do not break dtype inference.
# - blocksize: partition size; given as a byte string ('64MB' == 64e6 bytes) because dask
#   documents an int or a string here, not a float.
df = dd.read_csv('9_23_hotness_01_percent.csv', assume_missing=True, blocksize='64MB')

In [49]:
# Peek at the first rows of the freshly loaded frame to sanity-check the parse.
df.head()

Unnamed: 0,GUID,SESSION_SKEY,SITE_ID,SESSION_START_DT,ITEM_ID,MIN_SEQNUM,MIN_EVENT_TIMESTAMP,USER_ID,PLATFORM,asp_usd,...,CATEG_GMB_P1D_BEFORE_VI,TTL_BI_P1D_BEFORE_VI,TTL_GMB_P1D_BEFORE_VI,ITEM_BI_SHARE_IN_CATEG_P1D_BEFORE_VI,ITEM_GMB_SHARE_IN_CATEG_P1D_BEFORE_VI,CATEG_BI_SHARE_IN_TTL_P1D_BEFORE_VI,CATEG_GMB_SHARE_IN_TTL_P1D_BEFORE_VI,BI_INVERSE_FREQUENCY_SCORE,GMB_INVERSE_FREQUENCY_SCORE,rnk
0,9937d3881710a6e6f02cd2e4ff7ad788,38051180000000.0,3.0,2020-07-30,131157700000.0,43.0,2020-07-30 17:15:51,0.0,Browser: mWeb,5.0,...,153842.4,6525144.0,218836400.0,0.00017,0.000218,0.001804,0.000703,0.001074,0.001582,0.006
1,d4f39376166e79924e73a830012f8ea5,38056000000000.0,0.0,2020-08-05,313166500000.0,608.0,2020-08-05 07:18:45,1840615000.0,Apps: iPhone,60.0,...,412231.7,6624528.0,220254700.0,,,0.001694,0.001872,,,0.006
2,ace49b041640a9c0b93ba28affff02e4,38047620000000.0,0.0,2020-07-26,402320800000.0,31.0,2020-07-26 14:20:40,0.0,Browser: Core site,503.0,...,18810100.0,6174417.0,187188400.0,,,0.048301,0.100488,,,0.006
3,8891830716d4b9474906df10011cd54d,38051590000000.0,3.0,2020-07-31,174370800000.0,2454.0,2020-07-31 06:52:23,1123359000.0,Apps: iPhone,46.0,...,1332561.0,6367867.0,213346100.0,,,0.021025,0.006246,,,0.006
4,963d16581730aad74a104d04ffa8f650,38049170000000.0,3.0,2020-07-28,301699700000.0,1.0,2020-07-28 09:24:12,0.0,Browser: Core site,18.0,...,86965.73,7063671.0,237735400.0,,,0.000338,0.000366,,,0.006


In [50]:
# Replace every NaN in the frame with 0 up front: the astype() casts applied
# later cannot convert missing values to the integer dtypes.
df = df.fillna(value=0)

In [51]:
# dropping unused columns
# Each name is listed exactly once ('MIN_SEQNUM' used to appear twice), and only
# `columns=` is passed: it already selects the column axis, so no `axis` argument is needed.
to_drop = [
    'ITEM_ID', 'GUID', 'SESSION_SKEY', 'SITE_ID', 'MIN_SEQNUM', 'USER_ID', 'SEQNUM_RANK',
    'show_cnt', 'rnk', 'SELLER_ID',
    'SESSION_START_DT', 'MIN_EVENT_TIMESTAMP', 'SESSION_MIN_TIMESTAMP',
    'active_item_yn',
]
df = df.drop(columns=to_drop)

In [53]:
# df['stp_yn'].value_counts().compute()
# df['TRX_DOMESTIC_YN'].unique().compute()

0       Other
1      Export
2    Domestic
3      Import
4        Intl
Name: TRX_DOMESTIC_YN, dtype: object

In [63]:
# Cast columns to compact dtypes so the dataframe is cheaper to hold and process.
# Columns are grouped by their target dtype; the flat {column: dtype} mapping that
# astype() expects is derived from these groups.
columns_by_dtype = {
    'category': [
        'PLATFORM', 'ITEM_CONDITION', 'ITEM_CONDITION_DETAIL', 'META_CATEG_NAME',
        'CATEG_LVL2_NAME', 'VERTICAL', 'RETURN_TYPE', 'BUYER_SEGMENT', 'BUYER_CNTRY',
        'SITE_NAME', 'SELLER_SEGMENT', 'SELLER_ETRS_YN', 'SELLER_CNTRY',
        'TRX_DOMESTIC_YN', 'SELLER_GREATER_CHINA_YN',
    ],
    'int8': [
        'stp_yn', 'map_yn', 'mdm_yn', 'sme_type_dt', 'vi_coupon_show', 'vp_yn',
        'return_accept_yn', 'return_duration_in_vi', 'CONV_YN', 'MQ_YN', 'FP_YN',
        'BO_YN', 'FREE_SHIPPING_YN', 'RETURN_ACCPT_YN_NEW', 'PHOTO_CNT',
    ],
    'int16': [
        'qty_available', 'qty_sold', 'show_view_1hr_cnt', 'show_view_24hr_cnt',
        'show_watch_1hr_cnt', 'show_watch_24hr_cnt', 'show_sld_1hr_cnt',
        'show_sld_24hr_cnt', 'show_last_item_cnt', 'show_limited_time_cnt',
        'show_limited_qty_cnt', 'BIN_CNT', 'BID_CNT', 'OFFER_CNT', 'WATCH_CNT',
        'ASQ_CNT', 'BBO_CNT', 'BBOC_CNT', 'LIST_AGE_DAY',
        'BUYER_SEC_FROM_SESSION_START_TO_VI',
    ],
    'datetime64[ns]': ['AUCT_START_DT'],
}

type_mapping = {
    column: dtype
    for dtype, columns in columns_by_dtype.items()
    for column in columns
}
df = df.astype(type_mapping)

In [None]:
# handle times separately: SESSION_START_DT, MIN_EVENT_TIMESTAMP, SESSION_MIN_TIMESTAMP
# type_to_date = ['SESSION_START_DT', 'MIN_EVENT_TIMESTAMP', 'SESSION_MIN_TIMESTAMP']
# df[type_to_date] = df[type_to_date].apply(pd.to_datetime, axis = 1, errors = 'ignore',\
#                                          meta={'SESSION_START_DT': 'object', 'MIN_EVENT_TIMESTAMP': 'object', 'SESSION_MIN_TIMESTAMP': 'object'})

In [64]:
# Show the resulting {column: dtype} mapping to verify that the casts were applied.
df.dtypes.to_dict()

{'PLATFORM': CategoricalDtype(categories=['__UNKNOWN_CATEGORIES__'], ordered=False),
 'asp_usd': dtype('float64'),
 'stp_yn': dtype('int8'),
 'map_yn': dtype('int8'),
 'mdm_yn': dtype('int8'),
 'sme_type_dt': dtype('int8'),
 'vi_coupon_show': dtype('int8'),
 'vp_yn': dtype('int8'),
 'VP_PILL_CNT': dtype('float64'),
 'return_accept_yn': dtype('int8'),
 'return_duration_in_vi': dtype('int8'),
 'qty_available': dtype('int16'),
 'qty_sold': dtype('int16'),
 'show_view_1hr_cnt': dtype('int16'),
 'show_view_24hr_cnt': dtype('int16'),
 'show_watch_1hr_cnt': dtype('int16'),
 'show_watch_24hr_cnt': dtype('int16'),
 'show_sld_1hr_cnt': dtype('int16'),
 'show_sld_24hr_cnt': dtype('int16'),
 'show_last_item_cnt': dtype('int16'),
 'show_limited_time_cnt': dtype('int16'),
 'show_limited_qty_cnt': dtype('int16'),
 'BIN_CNT': dtype('int16'),
 'BID_CNT': dtype('int16'),
 'OFFER_CNT': dtype('int16'),
 'WATCH_CNT': dtype('int16'),
 'ASQ_CNT': dtype('int16'),
 'ATC_CNT': dtype('float64'),
 'BBO_CNT': dtyp

In [65]:
# Re-inspect the first rows after the column drop and dtype casts.
df.head()

Unnamed: 0,PLATFORM,asp_usd,stp_yn,map_yn,mdm_yn,sme_type_dt,vi_coupon_show,vp_yn,VP_PILL_CNT,return_accept_yn,...,CATEG_BI_P1D_BEFORE_VI,CATEG_GMB_P1D_BEFORE_VI,TTL_BI_P1D_BEFORE_VI,TTL_GMB_P1D_BEFORE_VI,ITEM_BI_SHARE_IN_CATEG_P1D_BEFORE_VI,ITEM_GMB_SHARE_IN_CATEG_P1D_BEFORE_VI,CATEG_BI_SHARE_IN_TTL_P1D_BEFORE_VI,CATEG_GMB_SHARE_IN_TTL_P1D_BEFORE_VI,BI_INVERSE_FREQUENCY_SCORE,GMB_INVERSE_FREQUENCY_SCORE
0,Browser: mWeb,5.0,0,0,0,0,0,1,7.0,1,...,11770.0,153842.4,6525144.0,218836400.0,0.00017,0.000218,0.001804,0.000703,0.001074,0.001582
1,Apps: iPhone,60.0,0,0,0,0,0,0,0.0,1,...,11222.0,412231.7,6624528.0,220254700.0,0.0,0.0,0.001694,0.001872,0.0,0.0
2,Browser: Core site,503.0,0,0,0,0,0,0,0.0,1,...,298229.0,18810100.0,6174417.0,187188400.0,0.0,0.0,0.048301,0.100488,0.0,0.0
3,Apps: iPhone,46.0,0,0,0,0,0,0,0.0,0,...,133887.0,1332561.0,6367867.0,213346100.0,0.0,0.0,0.021025,0.006246,0.0,0.0
4,Browser: Core site,18.0,0,0,0,0,0,0,0.0,1,...,2389.0,86965.73,7063671.0,237735400.0,0.0,0.0,0.000338,0.000366,0.0,0.0


In [None]:
# Quick pairwise correlations
# dask is lazy: restrict to numeric columns and materialise the correlation matrix as a
# pandas DataFrame, so seaborn gets labelled in-memory data and later cells can use
# pandas-only operations (sort_values, positional slicing) on `corrmat`.
corrmat = df.select_dtypes(include = 'number').corr().compute()
f, ax = plt.subplots(figsize = (22, 15))
sns.heatmap(corrmat, vmax = 0.8, center = 0, square = True, annot = True, cmap = plt.cm.Reds, ax = ax)
plt.show()

In [None]:
# Bar chart of every feature's correlation with one chosen column, strongest first
# (change `corr_col` to the right variable).
corr_col = 'gmb_usd_amt_n3m'  # update the col
plt.figure(figsize=(16, 6))
ranked_corr = corrmat[corr_col].sort_values(ascending=False)
# Drop the top entry, which is expected to be the column's correlation with itself.
ranked_corr[1:].plot(kind='bar')
plt.tight_layout()

In [None]:
# Distributions of numeric features: scatter every numeric column against one reference column.
# NOTE(review): the template this cell came from plotted against 'SalePrice' (flagged
# "update y"), which appears to be a leftover from another dataset. 'gmb_usd_amt_n3m' is
# assumed here because the correlation and normality cells use it -- confirm.
y_col = 'gmb_usd_amt_n3m'

# Finding numeric features (everything that is neither categorical nor object-typed)
numeric_vars = df.select_dtypes(exclude = ['category', 'object'])

# seaborn needs in-memory data, so materialise the lazy dask frame once for plotting
numeric_pdf = numeric_vars.compute()
features = [col for col in numeric_pdf.columns if col != y_col]

n_cols = 2
# ceiling division; at least one row, because plt.subplots() rejects nrows=0
n_rows = max(1, -(-len(features) // n_cols))
fig, axs = plt.subplots(ncols=n_cols, nrows=n_rows, figsize=(12, 4 * n_rows), squeeze=False)

for ax, feature in zip(axs.ravel(), features):
    sns.scatterplot(x=feature, y=y_col, data=numeric_pdf, ax=ax)
    ax.set_xlabel('{}'.format(feature), size=15, labelpad=12.5)
    ax.set_ylabel(y_col, size=15, labelpad=12.5)

# hide the panel(s) left over when the feature count does not fill the grid
for ax in axs.ravel()[len(features):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Checking cardinality of categorical variables:
# one bar chart per categorical column showing the share of rows held by each level.
categorical_vars = df.select_dtypes(include = 'category')
top_n = 20  # draw only the most frequent levels so high-cardinality columns stay readable

for c in categorical_vars.columns:
    # count in dask, then sort and normalise the (small) result in pandas
    counts = df[c].value_counts().compute().sort_values(ascending=False)
    shares = counts / counts.sum()

    fig, ax = plt.subplots(figsize=(12, 4))
    shares.head(top_n).plot(kind='bar', ax=ax)
    ax.set_title('{} ({} observed levels)'.format(c, int((counts > 0).sum())))
    ax.set_ylabel('share of rows')
    fig.tight_layout()
    plt.show()
    plt.close(fig)  # free the figure; this loop creates one per column

# Template for bucketing rare levels into 'other'. `df2` and 'OperatingSystems' are not
# defined in the visible part of this notebook -- adapt the names before using it:
# os_mask = [2,1,3,4]
# df2['new_os'] = df2['OperatingSystems'].apply(lambda x: 'other' if x not in os_mask else x)
#
# df2['new_os'].loc[~df2['OperatingSystems'].isin(os_mask)].unique() # checking that it was applied to the right rows

In [None]:
# checking NaNs once again
# (dask is lazy and this is not the cell's last expression, so compute and print explicitly)
print(df.isna().sum().compute())

# checking normality of some of the continuous features
normality_col = 'gmb_usd_amt_n3m'
normality_values = df[normality_col].compute()  # seaborn/scipy need in-memory data
# NOTE: distplot is deprecated in recent seaborn releases; kept because it offers `fit=`.
sns.distplot(normality_values, fit = norm)
fig = plt.figure()
res = st.probplot(normality_values, plot=plt)
plt.show()

# ---------------------------------------------------------------------------
# Modelling
# ---------------------------------------------------------------------------
# NOTE(review): the original cell left `X, y =` unfinished, so the target was never stated.
# 'CONV_YN' is assumed only because the estimator is a classifier and this is a binary
# flag column in the frame -- confirm before trusting any result.
target_col = 'CONV_YN'
seed = 42  # fixed seed so the split and the forest are reproducible

# features by type
# for categories: dummies
# for cont: normalization
categorical_vars = df.select_dtypes(include = 'category')
# include='number' (rather than "not category/object") keeps the datetime column
# AUCT_START_DT away from the mean-imputer/scaler, which only handle numeric data
numeric_vars = df.select_dtypes(include = 'number')

# ColumnTransformer selects by column *name*, so pass name lists, not DataFrames;
# the target must not leak into the features
categorical_cols = [c for c in categorical_vars.columns if c != target_col]
numeric_cols = [c for c in numeric_vars.columns if c != target_col]

categorical_transformer = Pipeline(steps = [('onehot', OneHotEncoder(handle_unknown = 'ignore'))])
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')), ('scaler', StandardScaler())])

# Combine the result
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# the main pipe (the estimator must be an instance, not the class itself)
clf = Pipeline([('preprocessor', preprocessor), ('clf', RandomForestClassifier(random_state=seed))])

# in-memory pandas data for scikit-learn (column names are needed by the ColumnTransformer)
model_pdf = df[numeric_cols + categorical_cols + [target_col]].compute()
X = model_pdf[numeric_cols + categorical_cols]
# the earlier fillna(0) may have put integer 0 into string columns; OneHotEncoder
# requires uniformly typed values, so encode every categorical level as a string
X = X.astype({c: str for c in categorical_cols})
y = model_pdf[target_col]

# train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

# Fitting and training
clf.fit(X_train, y_train)