In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc

In [7]:
df_train = pd.read_csv('/Users/dysson/Downloads/train.gz',compression='gzip')
df_test = pd.read_csv('/Users/dysson/Downloads/test.gz',compression='gzip')

In [None]:
df_train.head()

In [None]:
df_test.head()

Unnamed: 0,id,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000017e+19,14103100,1005,0,235ba823,f6ebf28e,f028772b,ecad2386,7801e8d9,07d7df22,...,1,0,8330,320,50,761,3,175,100075,23
1,1.000018e+19,14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,1,0,22676,320,50,2616,0,35,100083,51
2,1.000055e+19,14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,1,0,22676,320,50,2616,0,35,100083,51
3,1.000109e+19,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,51cedd4e,aefc06bd,0f2161f8,...,1,0,18648,320,50,1092,3,809,100156,61
4,1.000138e+19,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,9c13b419,2347f47a,f95efa07,...,1,0,23160,320,50,2667,0,47,-1,221


In [14]:
# Define X and y
X = df_train.loc[:, ~df_train.columns.isin(['click'])]
y = df_train.click

In [13]:
# Sample CTR
print("Sample CTR :\n", 
      round(y.sum()/len(y),4))

Sample CTR :
 0.1698


In [15]:
# Distribution of values for device type
print("Distribution of device type: ")
print(X.device_type.value_counts()/len(X))

Distribution of device type: 
device_type
1    9.227213e-01
0    5.493121e-02
4    1.915142e-02
5    3.195357e-03
2    7.667769e-07
Name: count, dtype: float64


In [17]:
# Sample CTR by device type 
print("CTR by device type: ")
print(df_train.groupby('device_type')['click'].sum()/len(y))

CTR by device type: 
device_type
0    1.157573e-02
1    1.561021e-01
2    4.946948e-08
4    1.827897e-03
5    2.998593e-04
Name: click, dtype: float64


In [18]:
# Distribution of values for banner position
print("Distribution of banner position: ")
print(X.banner_pos.value_counts()/len(X))

Distribution of banner position: 
banner_pos
0    0.720018
1    0.278199
7    0.001078
2    0.000322
4    0.000191
5    0.000143
3    0.000050
Name: count, dtype: float64


In [19]:
# Sample CTR by banner position
print("CTR by banner position: ")
print(df_train.groupby('banner_pos')['click'].sum()/len(y))

CTR by banner position: 
banner_pos
0    0.118279
1    0.051081
2    0.000038
3    0.000009
4    0.000035
5    0.000017
7    0.000345
Name: click, dtype: float64


### baseline model

In [None]:
X.head()

0    44956a24
1    711ee120
2    8a4875bd
3    6332421a
4    779d90c2
Name: device_model, dtype: object

In [26]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40428967 entries, 0 to 40428966
Data columns (total 23 columns):
 #   Column            Dtype  
---  ------            -----  
 0   id                float64
 1   hour              int64  
 2   C1                int64  
 3   banner_pos        int64  
 4   site_id           object 
 5   site_domain       object 
 6   site_category     object 
 7   app_id            object 
 8   app_domain        object 
 9   app_category      object 
 10  device_id         object 
 11  device_ip         object 
 12  device_model      object 
 13  device_type       int64  
 14  device_conn_type  int64  
 15  C14               int64  
 16  C15               int64  
 17  C16               int64  
 18  C17               int64  
 19  C18               int64  
 20  C19               int64  
 21  C20               int64  
 22  C21               int64  
dtypes: float64(1), int64(13), object(9)
memory usage: 6.9+ GB


In [30]:
X_reduce = X[['hour', 'C1', 'banner_pos', 'device_type', 'device_conn_type', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']]
# NOTE: missing 'device_model_int'

In [31]:
# Define training and testing
X_train, X_valid, y_train, y_valid = \
	train_test_split(X_reduce, y, test_size = .2, random_state = 0)

In [32]:
X_valid.head()

Unnamed: 0,hour,C1,banner_pos,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
8551471,14102217,1005,1,1,0,17894,320,50,2039,2,39,100075,32
39634055,14103017,1010,1,4,2,23642,320,50,2709,3,39,-1,23
22377501,14102613,1005,1,1,0,22268,320,50,2566,3,41,100156,42
14689278,14102409,1005,1,1,0,21875,320,50,2526,0,167,100075,221
32175767,14102822,1005,1,1,0,17262,320,50,1872,3,39,-1,23


In [None]:
# Create decision tree classifier
clf = DecisionTreeClassifier()

# Train classifier - predict label and evaluate accuracy
y_pred = clf.fit(X_train, y_train).predict(X_valid)
print(accuracy_score(y_valid, y_pred))

0.8312990659915402


In [34]:
y_score = clf.fit(X_train, y_train).predict_proba(X_valid)

In [37]:
# Get ROC curve metrics
fpr, tpr, thresholds = roc_curve(y_valid, y_score[:, 1])
roc_auc = auc(fpr, tpr)
print(roc_auc)

0.7068877963108704
