In [1]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
import random

In [2]:
data_frame = pd.read_csv('data/train.csv')

#### Data Cleaning

In [3]:
# drop ID column
data_frame.drop(['ID'], axis=1, inplace=True)

# there are 200k unique siteids and 850k unique offerid. 
# these are too many in a categorical classification.
# So, I discard these two variables.
data_frame.drop(['siteid', 'offerid'], axis=1, inplace=True)

# make two columns for day and hour from datetime
data_frame['day']=pd.to_datetime(data_frame['datetime']).apply(lambda x: x.dayofweek)
data_frame['hour']=pd.to_datetime(data_frame['datetime']).apply(lambda x: x.hour)
data_frame.drop('datetime', axis=1, inplace=True)

# replacing duplicates
data_frame['browserid'].replace(to_replace=['Mozilla Firefox', 'Mozilla'], 
  value='Firefox', inplace=True) # Firefox
data_frame['browserid'].replace(to_replace=['Google Chrome'], 
  value='Chrome', inplace=True) # Chrome
data_frame['browserid'].replace(to_replace=['InternetExplorer', 'Internet Explorer'], 
  value='IE', inplace=True) # IE

In [4]:
# which day had most clicks? Is it in the weekends? 

print('Monday is 0')
for day in [0,1,2,3,4,5,6]:
    count = data_frame.loc[(data_frame['click']==1) & (data_frame['day']==day)].shape[0]
    print('Number of click on {} is {}'.format(day, count))

Monday is 0
Number of click on 0 is 41000
Number of click on 1 is 81403
Number of click on 2 is 81758
Number of click on 3 is 74691
Number of click on 4 is 75108
Number of click on 5 is 46646
Number of click on 6 is 36608


In [5]:
# weekdays have high click rate, so let's break if it's weekdays or weekend
data_frame['weekdays']=data_frame['day'].apply(lambda x:1 if x<5 else 0)

data_frame.drop('day', axis=1, inplace=True)

In [6]:
# which hours had most clicks? Is it in the mornings? evenings? 

for hour in range(24):
    count = data_frame.loc[(data_frame['click']==1) & (data_frame['hour']==hour)].shape[0]
    print('Number of click at {} is {}'.format(hour, count))

Number of click at 0 is 8099
Number of click at 1 is 4895
Number of click at 2 is 3632
Number of click at 3 is 3410
Number of click at 4 is 2793
Number of click at 5 is 3353
Number of click at 6 is 6337
Number of click at 7 is 15110
Number of click at 8 is 19183
Number of click at 9 is 23707
Number of click at 10 is 32727
Number of click at 11 is 28216
Number of click at 12 is 27003
Number of click at 13 is 27756
Number of click at 14 is 27732
Number of click at 15 is 27104
Number of click at 16 is 28156
Number of click at 17 is 29708
Number of click at 18 is 23246
Number of click at 19 is 21792
Number of click at 20 is 21817
Number of click at 21 is 22319
Number of click at 22 is 17577
Number of click at 23 is 11542


In [7]:
# Most clicks are between 9am - 9pm, so I make it binary

data_frame['daytime']=data_frame['hour'].apply(lambda x:1 if 9<=x<=21 else 0)

data_frame.drop('hour', axis=1, inplace=True)

In [8]:
# finding most frequenct value of each colum for each class
# this is the first round of filling NAs. Filling based on the most frequent value for each class
# This is a good starting point 
# remember to save indecies of nans before filling them

class_click = data_frame[data_frame['click']==1]
class_no_click = data_frame[data_frame['click']==0]
fill_click_dic = {}    
fill_no_click_dic = {}

def replace_most_freq(col):   
    
    a = class_click[col].value_counts()
    fill_click = a[a.values==a.values.max()].index[0]
    
    b = class_no_click[col].value_counts()
    fill_no_click = b[b.values==b.values.max()].index[0]
    
    fill_click_dic[col] = fill_click
    fill_no_click_dic[col] = fill_no_click
    
    # print('for class click most used value for {} is {}'.format(col, fill_click))
    # print('for class no-click most used value for {} is {}'.format(col, fill_no_click))
    
    data_frame[data_frame['click']==1] = class_click.fillna(value=fill_click)
    data_frame[data_frame['click']==0] = class_no_click.fillna(value=fill_no_click)
 

# saving indecies with NAN value
nans_index = {}
def find_nans_index(col):
    nans_index[col]=data_frame[pd.isnull(data_frame[col])==True].index

    

In [9]:
columns = ['category' ,'merchant', 'countrycode', 'browserid', 'devid']
for col in columns:
    find_nans_index(col)
    replace_most_freq(col)


In [10]:
# convert to binaries

categorical_variables = ['countrycode', 'browserid', 'devid']

for variable in categorical_variables:
    dummies = pd.get_dummies(data_frame[variable], prefix=variable)
    data_frame = pd.concat([data_frame, dummies], axis=1)
    data_frame.drop([variable], axis=1, inplace=True)

In [11]:
pd.unique(data_frame['category']).size

271

In [12]:
# let's plot occurence for each category:

counts = data_frame['category'].value_counts()
plot_pd = pd.Series(counts.values, index=counts.index)
plot_pd.sort_values(inplace=True, ascending=False)
#plt.subplots(figsize=(15,15))
#plot_pd.plot(kind='bar', figsize=(10,10))
#plt.show()

In [13]:
# let's take only categories that constitute 70% of the entire table
# this shows that of 271 categories 46 categories fill 70% of the table

sum_ = 0
i = 0
while sum_<0.7*data_frame.shape[0]:
    sum_ += plot_pd.values[i]
    i += 1
print(i)

46


In [14]:
category_90p = plot_pd[:46]

In [15]:
data_frame['category']= data_frame['category'].apply(lambda x:category_90p.index[random.randint(0,45)] 
                                                     if x not in category_90p.index else x)

In [16]:
# let's do similar approach for merchant
# of 697 merchant, 74 merchant constitute 70% of the entire table.
# so let's randomly replace the 30% rest with 74 values

counts = data_frame['merchant'].value_counts()
plot_pd = pd.Series(counts.values, index=counts.index)
plot_pd.sort_values(inplace=True, ascending=False)

sum_ = 0
i = 0
while sum_<0.7*data_frame.shape[0]:
    sum_ += plot_pd.values[i]
    i += 1
print(i)

74


In [17]:
merchant_70p = plot_pd[:74]

In [18]:
data_frame['merchant']= data_frame['merchant'].apply(lambda x:merchant_70p.index[random.randint(0,73)] 
                                                     if x not in merchant_70p.index else x)

In [19]:
# convert to binaries

categorical_variables = ['category', 'merchant']

for variable in categorical_variables:
    dummies = pd.get_dummies(data_frame[variable], prefix=variable)
    data_frame = pd.concat([data_frame, dummies], axis=1)
    data_frame.drop([variable], axis=1, inplace=True)

In [20]:
y = data_frame.pop('click')

In [None]:
rfc = RandomForestClassifier(n_estimators=400, max_features='sqrt', n_jobs=-1,
                             criterion='entropy', oob_score=True, min_samples_leaf=50)

In [None]:
rfc_model = rfc.fit(data_frame, y)

In [None]:
import pickle
import os
with open('./trained_model', 'wb') as f:
    pickle.dump(rfc_model, f)

In [None]:
rfc_model.oob_score_

In [21]:
rfc_model = pickle.load(open('./trained_model', 'rb'))

#### Reading & cleaning

In [22]:
test_frame = pd.read_csv('data/test.csv')

In [23]:

test_frame.drop(['siteid', 'offerid'], axis=1, inplace=True)

# make two columns for day and hour from datetime
test_frame['day']=pd.to_datetime(test_frame['datetime']).apply(lambda x: x.dayofweek)
test_frame['hour']=pd.to_datetime(test_frame['datetime']).apply(lambda x: x.hour)
test_frame.drop('datetime', axis=1, inplace=True)

# replacing duplicates
test_frame['browserid'].replace(to_replace=['Mozilla Firefox', 'Mozilla'], 
  value='Firefox', inplace=True) # Firefox
test_frame['browserid'].replace(to_replace=['Google Chrome'], 
  value='Chrome', inplace=True) # Chrome
test_frame['browserid'].replace(to_replace=['InternetExplorer', 'Internet Explorer'], 
  value='IE', inplace=True) # IE

In [24]:
# weekdays have high click rate, so let's break if it's weekdays or weekend
test_frame['weekdays']=test_frame['day'].apply(lambda x:1 if x<5 else 0)

test_frame.drop('day', axis=1, inplace=True)

In [25]:
# Most clicks are between 9am - 9pm, so I make it binary

test_frame['daytime']=test_frame['hour'].apply(lambda x:1 if 9<=x<=21 else 0)

test_frame.drop('hour', axis=1, inplace=True)

In [26]:
fill_click_dic = {'category': 6906,
 'merchant': 99510044,
 'countrycode': 'c',
 'browserid': 'InternetExplorer',
 'devid': 'Desktop'}

In [27]:
fill_no_click_dic = {'category': 40339,
 'merchant': 99510044,
 'countrycode': 'b',
 'browserid': 'Edge',
 'devid': 'Mobile'}

In [28]:
# find rows with nans

nan_id = test_frame.apply(lambda x: x.name if True in pd.isnull(x).values else np.nan, axis=1)

# then convert to a array

nan_id.dropna(axis=0, inplace=True)

nan_id = nan_id.values


In [29]:
nan_id.shape

(884448,)

In [30]:
# put all nans in a seperate dataframe

nan_test_frame = test_frame.iloc[nan_id.astype(np.int64)]


In [31]:
# make copy of for 0 clicks - One copy will be filled with most frequent values of class=1
# and the other will be filled with most frequent values for class=0

nan_test_frame_0 = nan_test_frame.copy()

In [32]:
# dropping nans from the test_frame

test_frame = test_frame.drop(nan_id.astype(np.int64))


In [33]:
# filling with most frequenct clicked and non-clicked values for each col

columns = ['category' ,'merchant' ,'countrycode' ,'browserid' ,'devid']

for col in columns:
    nan_test_frame[col].fillna(value=fill_click_dic[col], inplace=True)
    nan_test_frame_0[col].fillna(value=fill_no_click_dic[col], inplace=True)

In [34]:
# convert to binaries

categorical_variables = ['countrycode', 'browserid', 'devid']

for variable in categorical_variables:
    dummies = pd.get_dummies(test_frame[variable], prefix=variable)
    test_frame = pd.concat([test_frame, dummies], axis=1)
    test_frame.drop([variable], axis=1, inplace=True)
    
for variable in categorical_variables:
    dummies = pd.get_dummies(nan_test_frame[variable], prefix=variable)
    nan_test_frame = pd.concat([nan_test_frame, dummies], axis=1)
    nan_test_frame.drop([variable], axis=1, inplace=True)
    
for variable in categorical_variables:
    dummies = pd.get_dummies(nan_test_frame_0[variable], prefix=variable)
    nan_test_frame_0 = pd.concat([nan_test_frame_0, dummies], axis=1)
    nan_test_frame_0.drop([variable], axis=1, inplace=True)
    

In [35]:
# top most frequent values in category and merchant to replace others
test_frame['category']= test_frame['category'].apply(lambda x:category_90p.index[random.randint(0,45)] 
                                                     if x not in category_90p.index else x)

nan_test_frame['category']= nan_test_frame['category'].apply(lambda x:category_90p.index[random.randint(0,45)] 
                                                     if x not in category_90p.index else x)

nan_test_frame_0['category']= nan_test_frame_0['category'].apply(lambda x:category_90p.index[random.randint(0,45)] 
                                                     if x not in category_90p.index else x)

In [36]:
# let's do for merchant too

test_frame['merchant']= test_frame['merchant'].apply(lambda x:merchant_70p.index[random.randint(0,73)] 
                                                     if x not in merchant_70p.index else x)

nan_test_frame['merchant']= nan_test_frame['merchant'].apply(lambda x:merchant_70p.index[random.randint(0,73)] 
                                                     if x not in merchant_70p.index else x)

nan_test_frame_0['merchant']= nan_test_frame_0['merchant'].apply(lambda x:merchant_70p.index[random.randint(0,73)] 
                                                     if x not in merchant_70p.index else x)

In [37]:
# convert to binaries

categorical_variables = ['category', 'merchant']

for variable in categorical_variables:
    dummies = pd.get_dummies(test_frame[variable], prefix=variable)
    test_frame = pd.concat([test_frame, dummies], axis=1)
    test_frame.drop([variable], axis=1, inplace=True)
    
for variable in categorical_variables:
    dummies = pd.get_dummies(nan_test_frame[variable], prefix=variable)
    nan_test_frame = pd.concat([nan_test_frame, dummies], axis=1)
    nan_test_frame.drop([variable], axis=1, inplace=True)
    
for variable in categorical_variables:
    dummies = pd.get_dummies(nan_test_frame_0[variable], prefix=variable)
    nan_test_frame_0 = pd.concat([nan_test_frame_0, dummies], axis=1)
    nan_test_frame_0.drop([variable], axis=1, inplace=True)
    

In [38]:
# drop IDs
test_frame_ids = test_frame.pop('ID')
nan_test_frame_ids = nan_test_frame.pop('ID')
nan_test_frame_0_ids = nan_test_frame_0.pop('ID')

In [39]:
# make columns match
base = data_frame.columns

In [40]:
# test frame doesn't have any browserid_desktop and Mobile let's fill with 0 then

test_frame['browserid_Desktop']=0
test_frame['browserid_Mobile']=0
test_frame = test_frame[base]

In [41]:
nan_test_frame['browserid_Desktop']=0
nan_test_frame['browserid_Mobile']=0
nan_test_frame = nan_test_frame[base]

nan_test_frame_0['browserid_Desktop']=0
nan_test_frame_0['browserid_Mobile']=0
nan_test_frame_0 = nan_test_frame_0[base]

In [42]:
y_test = rfc_model.predict_proba(test_frame)

In [43]:
y_nan_test = rfc_model.predict_proba(nan_test_frame)
y_nan_test_0 = rfc_model.predict_proba(nan_test_frame_0)

pred_nan_test = rfc_model.predict(nan_test_frame)
pred_nan_test_0 = rfc_model.predict(nan_test_frame_0)



In [151]:
def save_test_results(ID, y, file_name):
    subm_frame = pd.DataFrame(y, index=ID.ravel(), columns=['click'])
    subm_frame.index.name = 'ID'
    subm_frame.to_csv('data/' + file_name)
    
    return subm_frame


# save results for submission

IDs = test_frame_ids.append(nan_test_frame_0_ids)
y_pred = np.append(y_test[:,1], y_nan_test_0[:,1])


subm_model = save_test_results(IDs, y_pred, 'submit_aug10_2.csv')

#### This was the end. From below I just want to see which filling for NANs had the most votes among trees. Looks like they gave similar.

In [44]:
# we need to know the majority vote for those who have different prediction
# among 400 trees, we will examine which class has the highest vote. 

trees = rfc_model.estimators_

In [45]:
# finding indecies that are different

ind_diff = []
for i in range(len(pred_nan_test)):
    if pred_nan_test[i]!=pred_nan_test_0[i]:
        ind_diff.append(i)
 

In [46]:
diff_test_frame = nan_test_frame.iloc[ind_diff]
diff_test_frame_0 = nan_test_frame_0.iloc[ind_diff]

In [59]:
# prediction of each tree on the test_frame_filled_with most frequenct click
trees_pred = []
for tree in trees:
    trees_pred.append(tree.predict(nan_test_frame))

    
# prediction of each tree on the test_frame_filled_with most frequenct non-click
trees_pred_0 = []
for tree in trees:
    trees_pred_0.append(tree.predict(nan_test_frame_0))
    

trees_pred = np.array(trees_pred)
trees_pred_0 = np.array(trees_pred_0)

In [62]:
trees_pred.shape

(400, 884448)

In [102]:
click_votes = np.sum(trees_pred, axis=0)
click_no_votes = 400 - click_votes

click_votes_0 = np.sum(trees_pred, axis=0)
click_no_votes_0 = 400 - click_votes


In [105]:
# [filled_with_max_click_vote_yes, filled_with_max_click_vote_no,
# filled_with_max_not_click_vote_yes, filled_with_max_not_click_vote_no]

votes_mat = np.array([click_votes, click_no_votes, click_votes_0, click_no_votes_0])

In [106]:
index_max_votes = np.argmax(votes_mat, axis=0)