In [2]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn import ensemble



In [3]:
def category_to_dummies(df, which_column, delete_original=False):
    """One-hot encode a categorical column and append the result to the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    which_column : str
        Name of the categorical column to expand. The new columns are named
        ``<which_column>_<value>``.
    delete_original : bool, default False
        When true, the original ``which_column`` is removed from the result.
        Previously this flag was ignored and the column was always removed.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the columns of ``df`` followed by the dummy
        columns.
    """
    class_dummies = pd.get_dummies(df[which_column], prefix=which_column)
    # concat builds a fresh frame, so the caller's df is left untouched.
    combined = pd.concat([df, class_dummies], axis=1)
    if delete_original:
        combined = combined.drop(which_column, axis=1)
    return combined

In [4]:
def set_user_func(row):
    """Return the user key for a single record.

    ``row`` is anything indexable by the keys 'device_id', 'device_ip' and
    'device_model' (it is applied row-wise to the dataframe). Records whose
    device id is 'a99f214a' get the concatenation of device_ip and
    device_model as their key; every other record keeps its device id.
    """
    device_id = row['device_id']
    # NOTE(review): 'a99f214a' is presumably a shared/placeholder id (it
    # dominates the device_id value counts shown later) - confirm.
    if device_id != 'a99f214a':
        return device_id
    return row['device_ip'] + row['device_model']

In [20]:
def setup_predictor(filename, random_state=None):
    """Load the click data, engineer features and fit a random forest.

    Parameters
    ----------
    filename : str
        Path passed straight to ``pd.read_csv``.
    random_state : int or None, default None
        Seed forwarded to both ``train_test_split`` and
        ``RandomForestClassifier``. The default keeps the previous unseeded
        behaviour; pass an integer to make the split and the model
        reproducible.

    Returns
    -------
    tuple
        ``(classifier, df, X_train, Y_train, X_cv, Y_cv)`` where ``df`` is
        the full frame including every engineered column, and the remaining
        items are the 80/20 train / cross-validation feature matrices and
        'click' targets.

    Side effects: prints a progress line and the train / CV accuracy.
    """
    df = pd.read_csv(filename, low_memory=False)
    print("Training dataset read successfully.")

    # User key: device id, or ip+model for the special device id
    # (see set_user_func).
    df['user'] = df.apply(set_user_func, axis=1)

    # Bucket the combination of anonymised categorical columns into
    # 1000 hash buckets.
    catg_columns = ['C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C1']
    catgs = df[catg_columns[0]].apply(str)
    for column in catg_columns[1:]:
        catgs = catgs + "_" + df[column].apply(str)
    # NOTE(review): the built-in hash() of a str is salted per process on
    # Python 3 unless PYTHONHASHSEED is fixed, so these bucket ids are not
    # stable across kernel restarts. 'catgs' is not among the model features.
    df['catgs'] = catgs.apply(hash) % 1000

    # 0 when the id equals the specific value tested below, 1 otherwise.
    df['ad_on_website'] = df['site_id'].apply(lambda s: 0 if s == '85f751fd' else 1)
    df['ad_on_app'] = df['app_id'].apply(lambda s: 0 if s == 'ecad2386' else 1)

    # 'timeofday' is the integer held in characters 6-7 of the hour string
    # (the trailing two digits of values such as 14102100).
    df['hour'] = df['hour'].astype(str)
    df['timeofday'] = df['hour'].apply(lambda s: int(s[6:8]))

    # Frequency features: how often each value occurs in the whole file.
    # NOTE(review): counts are taken before the train/CV split, so CV rows
    # contribute to the features the model is trained on.
    for column in ['device_id', 'device_ip', 'user', 'app_id',
                   'app_category', 'site_category']:
        df[column + '_count'] = df.groupby([column])[column].transform('count')

    features = ['click', 'banner_pos', 'user_count', 'device_id_count',
                'device_ip_count', 'app_id_count', 'app_category_count',
                'site_category_count']
    # banner_pos is categorical: replace it with one dummy column per value.
    data = category_to_dummies(df[features], 'banner_pos', True)

    # Hold out 20% of the rows for cross-validation.
    train, cv_data = train_test_split(data, test_size=0.2,
                                      random_state=random_state)

    feature_names = [name for name in train.columns if name != 'click']
    X_train = train[feature_names]
    Y_train = train['click']
    X_cv = cv_data[feature_names]
    Y_cv = cv_data['click']

    classifier = RandomForestClassifier(max_features='sqrt', max_depth=200,
                                        criterion='entropy', oob_score=True,
                                        class_weight="balanced",
                                        random_state=random_state)
    classifier.fit(X_train, Y_train)
    print('Training accuracy%: Training dataset: ' + str(classifier.score(X_train,Y_train)*100))
    print('CV accuracy%: CV dataset: ' + str(classifier.score(X_cv,Y_cv)*100))

    return classifier, df, X_train, Y_train, X_cv, Y_cv

In [6]:
def predict_(classifier, X):
    """Return the labels ``classifier.predict`` produces for the samples ``X``."""
    return classifier.predict(X)

In [7]:
def score_analysis(Y_pred, Y_true):
    """Print the classification report for predictions against true labels.

    Note the argument order: predictions first, ground truth second; they
    are swapped when handed to ``classification_report``, which expects the
    true labels first.
    """
    report = classification_report(Y_true, Y_pred)
    print(report)


In [8]:
# Driver cell: fit the model on the archive and report precision/recall on the
# held-out split. The names bound here (classifier, df, X_train, Y_train, X_cv,
# Y_cv, Y_pred) become kernel globals that the later cells read.
if __name__ == "__main__":

    # Path handed to pd.read_csv inside setup_predictor.
    filename = "train.zip"
    classifier, df, X_train, Y_train, X_cv, Y_cv = setup_predictor(filename)
    # Score the held-out split only.
    Y_pred = predict_(classifier, X_cv)
    score_analysis(Y_pred, Y_cv)

Training dataset read successfully.


  warn("Some inputs do not have OOB scores. "


Training accuracy%: Training dataset: 64.4609555762
CV accuracy%: CV dataset: 62.8365
             precision    recall  f1-score   support

          0       0.91      0.61      0.74    167894
          1       0.26      0.70      0.38     32106

avg / total       0.81      0.63      0.68    200000



In [21]:
# Identical to the earlier driver cell: running it re-reads the file, retrains
# the model and rebinds the same global names (classifier, df, X_train, Y_train,
# X_cv, Y_cv, Y_pred), replacing the results of the previous run.
if __name__ == "__main__":

    filename = "train.zip"
    classifier, df, X_train, Y_train, X_cv, Y_cv = setup_predictor(filename)
    Y_pred = predict_(classifier, X_cv)
    score_analysis(Y_pred, Y_cv)

Training dataset read successfully.
Training accuracy%: Training dataset: 65.0150812689
CV accuracy%: CV dataset: 63.497
             precision    recall  f1-score   support

          0       0.92      0.62      0.74    168070
          1       0.26      0.70      0.38     31930

avg / total       0.81      0.63      0.68    200000



In [15]:
def report_feature_importance(classifier, train):
    """Print the features ranked by the fitted model's importance scores.

    Parameters
    ----------
    classifier : fitted estimator
        Must expose ``feature_importances_`` with one value per column of
        ``train``, in the same order.
    train : pandas.DataFrame
        The feature matrix the classifier was fitted on; only its column
        names are used.

    Prints a two-column table ('feature', 'importance'), most important
    feature first. Returns None.
    """
    features = pd.DataFrame(
        {'feature': list(train.columns),
         'importance': classifier.feature_importances_},
        columns=['feature', 'importance'],
    )
    # DataFrame.sort was removed from pandas; sort_values is its replacement.
    print(features.sort_values(by=['importance'], ascending=False))

In [16]:
# Rank the training features using the globals `classifier` and `X_train`
# bound by the driver cell.
report_feature_importance(classifier, X_train)

                feature    importance
2       device_ip_count  3.179018e-01
3          app_id_count  2.335749e-01
0            user_count  1.824887e-01
5   site_category_count  1.559595e-01
4    app_category_count  6.564179e-02
1       device_id_count  2.509227e-02
7          banner_pos_1  9.867268e-03
6          banner_pos_0  9.022475e-03
8          banner_pos_2  1.925580e-04
11         banner_pos_5  1.592784e-04
12         banner_pos_7  7.210593e-05
10         banner_pos_4  2.649539e-05
9          banner_pos_3  8.854101e-07




In [114]:
#def monte_carlo_analysis(n_click=323209,n_total=1999999, Y_true):
def monte_carlo_analysis(Y_true, CTR_ratio):
    """Score a random labelling that has the given click rate.

    Draws ``int(len(Y_true) * CTR_ratio)`` distinct positions at random,
    marks them as clicks (1) in an otherwise all-zero vector of the same
    length as ``Y_true``, and prints the classification report of that
    vector against ``Y_true`` via ``score_analysis``. A single random draw
    is made per call, and no seed is set.
    """
    import random

    n_total = len(Y_true)
    n_click = int(n_total * CTR_ratio)
    # Sampling without replacement gives exactly n_click distinct positions.
    clicked_positions = random.sample(range(n_total), n_click)
    Y_sim = np.zeros(n_total)
    Y_sim[np.asarray(clicked_positions, dtype=int)] = 1
    score_analysis(Y_sim, Y_true)

In [115]:
# Baseline: compare the model with a random assignment of clicks made at the
# dataset's overall click-through rate.
# CTR is the fraction of rows in the full dataframe `df` with click == 1; the
# inner float() forces true division even where `/` on two ints would truncate.
CTR = float(len(df[df['click'] == 1])/float(len(df)))
print("Click percentage in the originial dataset: " + str(CTR*100))
# The rate comes from all of df, while the random labels are scored against the
# held-out targets Y_cv only.
monte_carlo_analysis(Y_cv, CTR)



Click percentage in the originial dataset: 16.0219160219
             precision    recall  f1-score   support

          0       0.84      0.84      0.84    167981
          1       0.16      0.16      0.16     32019

avg / total       0.73      0.73      0.73    200000



In [25]:
# Number of distinct values in the engineered 'catgs' column.
# NOTE(review): setup_predictor reduces 'catgs' with `% 1000`, which allows at
# most 1000 distinct values, so a recorded output above 1000 must come from an
# earlier version of the feature code - re-run to confirm.
len(df['catgs'].unique())

4202

In [26]:
# Number of rows in the full dataframe returned by setup_predictor.
len(df)

999999

In [85]:
# Preview the first five rows, including the engineered columns.
# NOTE(review): the recorded output shows a 'user_' column and integer-valued
# 'user' entries, neither of which the current setup_predictor creates, so the
# output appears to predate the current code - re-run to confirm.
df.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,C18,C19,C20,C21,user_,user,catgs,ad_on_website,ad_on_app,timeofday
0,1000009418151094273,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,0,35,-1,79,ddd2926e44956a24,5395,2,1,0,0
1,10000169349117863715,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,0,35,100084,79,96809ac8711ee120,3420,9,1,0,0
2,10000371904215119486,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,0,35,100084,79,b3cf8def8a4875bd,5184,9,1,0,0
3,10000640724480838376,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,0,35,100084,79,e8275b8f6332421a,2691,7,1,0,0
4,10000679056417042096,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,0,35,-1,157,9644d0bf779d90c2,5412,5,1,0,0


In [122]:
# Exploration: combine site_domain and site_category into one key, bucket it
# into 1000 hash buckets, and count rows per bucket separately for click == 0
# and click == 1. This adds a 'comb' column to the global df in place.
df['comb'] = df['site_domain'] + df['site_category']
# NOTE(review): the built-in hash() of a str is salted per process on Python 3
# unless PYTHONHASHSEED is fixed, so bucket ids may differ between kernel
# sessions - confirm before comparing outputs across runs.
df['comb'] = df['comb'].apply(hash) % 1000
df['comb'].groupby(df['click']).value_counts()

click  comb
0      682     307992
       889     181937
       898      24697
       708      23795
       169      19538
       591      15524
       167      12440
       517      12226
       731      12008
       256       7377
       762       6747
       668       6687
       559       6293
       530       6292
       777       5724
       978       5429
       471       5063
       374       4951
       602       4627
       962       4578
       792       4532
       901       4129
       398       4000
       354       3933
       148       3783
       406       3674
       412       3419
       33        3374
       843       3107
       891       3064
       211       2882
       437       2857
       997       2801
       65        2743
       954       2690
       524       2677
       390       2602
       597       2570
       451       2540
       422       2042
       564       2037
       317       2028
       993       2025
       667       1904
       511       174

In [123]:
# Count rows per device_id separately for click == 0 and click == 1.
# NOTE(review): this lifts pandas' row display limit for the rest of the kernel
# session, so the result below (and any later frame) is printed in full.
pd.set_option('display.max_rows', None)
#df['device_id'].groupby(df['click']).value_counts().hist()
df['device_id'].groupby(df['click']).value_counts()

click  device_id
0      a99f214a     697917
       c357dbff        731
       936e92fb        459
       31da1bd0        292
       787d2bb0        274
       e8440dbf        246
       b09da1c4        214
       f0b5276b        195
       d857ffbb        172
       dcefb131        155
       f765372d        135
       7166f9fd        133
       79b5916c        128
       045d057f        121
       c810e66f        118
       c07b6494        116
       e09ded8f        115
       332d28f2        113
       8bd5456a        113
       a167aa83        112
       3c19ea8f        109
       9d7b8b52        109
       09ecc3b0        108
       6e30eb86        106
       da3579c6        104
       0cc497c8        101
       5140d8aa        100
       104811b5         99
       0b68504c         98
       ff2a4c55         98
       0e60b58d         97
       027a3ad9         96
       50c4ddcc         96
       cf466a39         94
       59695e83         93
       422b5f52         91
       afef

In [None]:
# Click distribution per device. Grouping by every distinct device_id asks for
# one histogram panel per id; the recorded run of that version was interrupted
# (KeyboardInterrupt). Restrict the plot to the most frequent device ids so it
# produces a small, readable grid.
TOP_N_DEVICES = 9
top_devices = df['device_id'].value_counts().index[:TOP_N_DEVICES]
top_device_rows = df[df['device_id'].isin(top_devices)]
top_device_rows['click'].hist(by=top_device_rows['device_id'])


KeyboardInterrupt

