In [129]:
import pandas as pd
import numpy as np 
%matplotlib inline
from datetime import datetime
from typing import List, Any

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.feature_extraction import FeatureHasher
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [3]:
df_train = pd.read_csv('../avazu-ctr-prediction/train')

In [32]:
df_train_partial = df_train[df_train.index % 100 == 0]

In [35]:
df_train_partial.to_csv('../avazu-ctr-prediction/train_partial', index=False)

In [111]:
df_test = pd.read_csv('../avazu-ctr-prediction/test')

In [74]:
pd.concat([df_train_partial.nunique().to_frame(), df_train_partial.dtypes.to_frame(), df_train_partial.isna().any().to_frame()], axis=1)

Unnamed: 0,0,0.1,0.2
id,404290,float64,False
click,2,int64,False
hour,240,int64,False
C1,7,int64,False
banner_pos,7,int64,False
site_id,2171,object,False
site_domain,2147,object,False
site_category,20,object,False
app_id,2245,object,False
app_domain,132,object,False


In [73]:
## hasing trick, train valid test split

In [115]:
df_train, df_valid = train_test_split(df_train_partial, train_size=0.8)

In [145]:
def preprocess(df: pd.DataFrame):
    df['hour'] = df['hour'].map(lambda x: datetime.strptime(str(x), "%y%m%d%H"))
    df['day_of_week'] = df['hour'].map(lambda x: x.hour)
    
    feature_hasher = FeatureHasher(n_features=2**24, input_type='string')
    hashed_feature = feature_hasher.fit_transform(np.asanyarray(df.astype(str)))
    
    return hashed_feature

In [146]:
feature_cols = ['hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
                            'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
                            'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
                            'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']

target = 'click'


In [147]:
y_train = df_train[target].values
y_train = np.asarray(y_train).ravel()

X_train = df_train[feature_cols]
X_train_hashed = preprocess(X_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [148]:
model = SGDClassifier(loss='log', alpha=0.00001, penalty='l2', eta0=2.0, n_jobs=-1, random_state=42)
# model = LogisticRegression()
model.partial_fit(X_train_hashed, y_train, classes=[0, 1])


SGDClassifier(alpha=1e-05, eta0=2.0, loss='log', n_jobs=-1, random_state=42)

In [149]:
y_valid = df_valid[target]
y_valid = np.asarray(y_valid).ravel()

X_valid = df_valid[feature_cols]
X_valid_hashed = preprocess(X_valid)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [156]:
y_pred = model.predict_proba(X_valid_hashed)

Unnamed: 0,id,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000017e+19,14103100,1005,0,235ba823,f6ebf28e,f028772b,ecad2386,7801e8d9,07d7df22,...,1,0,8330,320,50,761,3,175,100075,23
1,1.000018e+19,14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,1,0,22676,320,50,2616,0,35,100083,51
2,1.000055e+19,14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,1,0,22676,320,50,2616,0,35,100083,51
3,1.000109e+19,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,51cedd4e,aefc06bd,0f2161f8,...,1,0,18648,320,50,1092,3,809,100156,61
4,1.000138e+19,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,9c13b419,2347f47a,f95efa07,...,1,0,23160,320,50,2667,0,47,-1,221
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4577459,9.998167e+18,14103123,1005,0,93eaba74,7687a86e,3e814130,ecad2386,7801e8d9,07d7df22,...,1,0,17654,300,250,1994,2,39,-1,33
4577460,9.998250e+18,14103123,1005,0,17d1b03f,f3845767,f028772b,ecad2386,7801e8d9,07d7df22,...,1,0,22104,320,50,2545,0,431,100084,221
4577461,9.998802e+16,14103123,1005,0,5b08c53b,7687a86e,3e814130,ecad2386,7801e8d9,07d7df22,...,1,0,17654,300,250,1994,2,39,-1,33
4577462,9.999087e+18,14103123,1005,0,85f751fd,c4e18dd6,50e219e0,92f5800b,ae637522,0f2161f8,...,1,3,23857,320,50,2734,1,175,100189,71
