In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.grid_search import GridSearchCV

from sklearn.preprocessing import OneHotEncoder
from sklearn.cross_validation import train_test_split

In [3]:
def encode_rows(row):
    out = []
    for item in row:
        if isinstance(item, str) and (item.find('type') >= 0):
            out.append(int(item.split('type ')[-1]))
        elif isinstance(item, str) and (item.find('group') >= 0):
            out.append(int(item.split('group ')[-1]))
        elif isinstance(item, np.bool_):
            if item:
                out.append(1)
            else:
                out.append(0)
        else:
            out.append(item)
            
    return out

In [4]:
raw_people = pd.read_csv('people.csv.zip', parse_dates=['date'])
raw_people.iloc[0]
type_cols = ['char_' + str(i+1) for i in range(9)]

In [5]:
people = pd.read_csv('people.csv.zip', parse_dates=['date'])
people = people.apply(encode_rows)
people['year'] = people['date'].dt.year
people['month'] = people['date'].dt.month
people['day'] = people['date'].dt.day
people['isweekend'] = (people['date'].dt.weekday >= 5).astype(int)
people = people.drop('date', axis = 1)
people.head()

Unnamed: 0,people_id,char_1,group_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,...,char_33,char_34,char_35,char_36,char_37,char_38,year,month,day,isweekend
0,ppl_100,2,17304,2,5,5,5,3,11,2,...,0,1,1,1,0,36,2021,6,29,0
1,ppl_100002,2,8688,3,28,9,5,3,11,2,...,1,1,1,1,0,76,2021,1,6,0
2,ppl_100003,2,33592,3,4,8,5,2,5,2,...,1,1,0,1,1,99,2022,6,10,0
3,ppl_100004,2,22593,3,40,25,9,4,16,2,...,1,1,1,1,1,76,2022,7,20,0
4,ppl_100006,2,6534,3,40,25,9,3,8,2,...,0,0,1,1,0,84,2022,7,27,0


In [6]:
act_test = pd.read_csv('act_test.csv.zip', parse_dates=['date'])
act_test = act_test.apply(encode_rows)
act_test['year'] = act_test['date'].dt.year
act_test['month'] = act_test['date'].dt.month
act_test['day'] = act_test['date'].dt.day
act_test['isweekend'] = (act_test['date'].dt.weekday >= 5).astype(int)
act_test = act_test.drop('date', axis=1)
act_test.head()

Unnamed: 0,people_id,activity_id,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,year,month,day,isweekend
0,ppl_100004,act1_249281,1,5.0,10.0,5.0,1.0,6.0,1.0,1.0,7.0,4.0,,2022,7,20,0
1,ppl_100004,act2_230855,5,,,,,,,,,,682.0,2022,7,20,0
2,ppl_10001,act1_240724,1,12.0,1.0,5.0,4.0,6.0,1.0,1.0,13.0,10.0,,2022,10,14,0
3,ppl_10001,act1_83552,1,20.0,10.0,5.0,4.0,6.0,1.0,1.0,5.0,5.0,,2022,11,27,1
4,ppl_10001,act2_1043301,5,,,,,,,,,,3015.0,2022,10,15,1


In [7]:
act_train = pd.read_csv('act_train.csv.zip', parse_dates=['date'])
act_train = act_train.apply(encode_rows)
act_train['year'] = act_train['date'].dt.year
act_train['month'] = act_train['date'].dt.month
act_train['day'] = act_train['date'].dt.day
act_train['isweekend'] = (act_train['date'].dt.weekday >= 5).astype(int)
act_train = act_train.drop('date', axis=1)
act_train.head()

Unnamed: 0,people_id,activity_id,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,outcome,year,month,day,isweekend
0,ppl_100,act2_1734928,4,,,,,,,,,,76.0,0,2023,8,26,1
1,ppl_100,act2_2434093,2,,,,,,,,,,1.0,0,2022,9,27,0
2,ppl_100,act2_3404049,2,,,,,,,,,,1.0,0,2022,9,27,0
3,ppl_100,act2_3651215,2,,,,,,,,,,1.0,0,2023,8,4,0
4,ppl_100,act2_4109017,2,,,,,,,,,,1.0,0,2023,8,26,1


In [8]:

act_type_1_test = act_test[act_test.activity_category == 1]
act_type_1_train = act_train[act_train.activity_category == 1]

act_type_1_test = act_type_1_test.drop('char_10', axis=1)
act_type_1_train = act_type_1_train.drop('char_10', axis=1)

act_type_1plus_test = act_test[act_test.activity_category > 1]
act_type_1plus_train = act_train[act_train.activity_category > 1]

drop = ['char_' + str(i+1) for i in range(9)]
act_type_1plus_test = act_type_1plus_test.drop(drop, axis=1)
act_type_1plus_train = act_type_1plus_train.drop(drop, axis=1)

In [9]:
act_type_1_test.groupby('activity_category').count()

Unnamed: 0_level_0,people_id,activity_id,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,year,month,day,isweekend
activity_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,40092,40092,40092,40092,40092,40092,40092,40092,40092,40092,40092,40092,40092,40092,40092


In [10]:
act_type_1plus_test.groupby('activity_category').count()

Unnamed: 0_level_0,people_id,activity_id,char_10,year,month,day,isweekend
activity_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,223164,223164,223164,223164,223164,223164,223164
3,59931,59931,59931,59931,59931,59931,59931
4,50215,50215,50215,50215,50215,50215,50215
5,123463,123463,123463,123463,123463,123463,123463
6,1051,1051,1051,1051,1051,1051,1051
7,771,771,771,771,771,771,771


In [11]:
people_act_type_1_train = pd.merge(people, act_type_1_train, on = 'people_id')
people_act_type_1_test = pd.merge(people, act_type_1_test, on = 'people_id')

outcome = people_act_type_1_train.outcome
people_ids_train = people_act_type_1_train.people_id
peopld_ids_test = people_act_type_1_test.people_id
activity_ids_train = people_act_type_1_train.activity_id
activity_ids_test = people_act_type_1_test.activity_id

people_act_type_1_test = people_act_type_1_test.drop(['people_id', 'group_1', 'activity_id', 'char_10'], axis=1)
people_act_type_1_train = people_act_type_1_train.drop(['people_id', 'group_1', 'activity_id', 'char_10', 'outcome'], axis=1)

people_act_type_1_train.head()

Unnamed: 0,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,char_8_x,char_9_x,char_11,...,char_4_y,char_5_y,char_6_y,char_7_y,char_8_y,char_9_y,year_y,month_y,day_y,isweekend_y
0,2,3,14,6,8,3,9,6,6,0,...,1.0,6.0,3.0,3.0,6.0,8.0,2022,11,25,0
1,2,2,10,7,6,3,9,3,3,0,...,1.0,6.0,1.0,1.0,4.0,1.0,2022,7,26,0
2,2,2,10,7,6,3,9,3,3,0,...,3.0,1.0,3.0,4.0,5.0,1.0,2023,6,15,0
3,2,2,10,7,6,3,9,3,3,0,...,3.0,5.0,2.0,2.0,4.0,2.0,2023,2,28,0
4,2,2,10,7,6,3,9,3,3,0,...,2.0,6.0,1.0,1.0,6.0,8.0,2022,7,26,0


In [12]:
X_train, X_test, y_train, y_test = train_test_split(people_act_type_1_train, outcome)

In [18]:

gbc = RandomForestClassifier()
gbc.fit(X_train, y_train)
score = gbc.score(X_test, y_test)
y_pred = gbc.predict(X_test)
gbc_roc_auc_score = roc_auc_score(y_test, y_pred)

print(score, gbc_roc_auc_score)

0.9218607248 0.922506667806


In [14]:
np.shape(X_test)

(39404, 55)

In [45]:
np.shape(people_act_type_1plus_test)

(458595, 48)

In [53]:
y_pred_test = gbc.predict(people_act_type_1_test)

In [32]:
np.shape(people_act_type_1_test)

(40092, 55)

In [34]:
np.shape(activity_ids_test)

(40092,)

In [33]:
np.shape(people_act_type_1plus_test)

(458595, 48)

In [35]:
np.shape(activity_ids_test_plus)

(458595,)

In [36]:
np.shape(act_test)

(498687, 17)

In [26]:
people_act_type_1plus_train = pd.merge(people, act_type_1plus_train, on = 'people_id')
people_act_type_1plus_test = pd.merge(people, act_type_1plus_test, on = 'people_id')

outcome = people_act_type_1plus_train.outcome
people_ids_train = people_act_type_1plus_train.people_id
peopld_ids_test = people_act_type_1plus_test.people_id
activity_ids_train = people_act_type_1plus_train.activity_id
activity_ids_test_plus = people_act_type_1plus_test.activity_id

people_act_type_1plus_test = people_act_type_1plus_test.drop(['people_id', 'group_1', 'activity_id'], axis=1)
people_act_type_1plus_train = people_act_type_1plus_train.drop(['people_id', 'group_1', 'activity_id', 'outcome'], axis=1)

people_act_type_1plus_train.head()

Unnamed: 0,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10_x,...,year_x,month_x,day_x,isweekend_x,activity_category,char_10_y,year_y,month_y,day_y,isweekend_y
0,2,2,5,5,5,3,11,2,2,1,...,2021,6,29,0,4,76.0,2023,8,26,1
1,2,2,5,5,5,3,11,2,2,1,...,2021,6,29,0,2,1.0,2022,9,27,0
2,2,2,5,5,5,3,11,2,2,1,...,2021,6,29,0,2,1.0,2022,9,27,0
3,2,2,5,5,5,3,11,2,2,1,...,2021,6,29,0,2,1.0,2023,8,4,0
4,2,2,5,5,5,3,11,2,2,1,...,2021,6,29,0,2,1.0,2023,8,26,1


In [19]:
X_train, X_test, y_train, y_test = train_test_split(people_act_type_1plus_train, outcome)

In [20]:

gbcplus = RandomForestClassifier()
gbcplus.fit(X_train, y_train)
score = gbcplus.score(X_test, y_test)
y_pred = gbcplus.predict(X_test)
gbc_roc_auc_score = roc_auc_score(y_test, y_pred)

print(score, gbc_roc_auc_score)

0.989531670716 0.989636463077


In [22]:
y_pred_test_plus = gbcplus.predict(people_act_type_1plus_test)

In [23]:
sample = pd.read_csv('sample_submission.csv.zip')
sample.head()

Unnamed: 0,activity_id,outcome
0,act1_1,0
1,act1_100006,0
2,act1_100050,0
3,act1_100065,0
4,act1_100068,0


In [55]:
out = {'activity_id': activity_ids_test, 'outcome': y_pred_test}
outplus = {'activity_id': activity_ids_test_plus, 'outcome': y_pred_test_plus}
df_out = pd.DataFrame(out)
df_out_plus = pd.DataFrame(outplus)

df_out = pd.concat([df_out, df_out_plus])
df_out.head()

Unnamed: 0,activity_id,outcome
0,act1_249281,1
1,act1_240724,1
2,act1_83552,1
3,act1_218751,0
4,act1_383524,0


In [56]:
df_out.to_csv('submission_2.csv', index=False)

In [39]:
df_out.groupby('activity_id').count().head()

Unnamed: 0_level_0,outcome
activity_id,Unnamed: 1_level_1
act1_1,1
act1_100006,1
act1_100050,1
act1_100065,1
act1_100068,1


In [33]:
ohe = OneHo`tEncoder()
people_act_type_1_train_enc = ohe.fit_transform(people_act_type_1_train)

In [34]:
X_train, X_test, y_train, y_test = train_test_split(people_act_type_1_train_enc, outcome)

In [35]:
params = {'learning_rate': 1.0, 'n_estimators': 2000}
gbc = GradientBoostingClassifier(learning_rate = 1.0, n_estimators = 2000)
gbc.fit(X_train, y_train)
score = gbc.score(X_test, y_test)
y_pred = gbc.predict(X_test)
gbc_roc_auc_score = roc_auc_score(y_test, y_pred)

print(score, gbc_roc_auc_score)

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [43]:

score = gbc.score(X_test.todense(), y_test)
y_pred = gbc.predict(X_test.todense())
gbc_roc_auc_score = roc_auc_score(y_test, y_pred)

print(score, gbc_roc_auc_score)

0.900999898487 0.90402429053


In [44]:
from sklearn.metrics import confusion_matrix

In [46]:
confusion_matrix(y_test, y_pred)

array([[20501,  2622],
       [ 1279, 15002]])

In [111]:
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
score = gbc.score(X_test, y_test)
y_pred = gbc.predict(X_test)
gbc_roc_auc_score = roc_auc_score(y_test, y_pred)

print(score, gbc_roc_auc_score)

0.862831184651 0.87759097279


In [122]:
param_grid = {
    'learning_rate': [0.01, 0.1, 1.0],
    'n_estimators': [10, 100, 500],
    #'subsample': [0.5, 0.75, 1.0],
    #'min_samples_split': [2, 4],
    #'min_samples_leaf': [1, 4],
    #'min_weight_fraction_left': [0.0, 0.5, 1.0],
    #'max_depth': [2, 3, 4]
}

In [124]:
gbc = GradientBoostingClassifier()
gs = GridSearchCV(gbc, param_grid)
gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'learning_rate': [0.01, 0.1, 1.0], 'n_estimators': [10, 100, 500]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [125]:
gs.score(X_test, y_test)

0.89201603898081416

In [126]:
y_pred = gs.predict(X_test)
roc_auc_score(y_test, y_pred)

0.8977827947575997

In [127]:
gs.best_params_

{'learning_rate': 1.0, 'n_estimators': 500}

In [128]:
param_grid = {
    'learning_rate': [1.0, 2.0, 5.0],
    'n_estimators': [500, 1000, 2000]
}
gbc = GradientBoostingClassifier()
gs = GridSearchCV(gbc, param_grid)
gs.fit(X_train, y_train)
score = gs.score(X_test, y_test)
print(score)
y_pred = gs.predict(X_test)
score = roc_auc_score(y_test, y_pred)
print(score)
gs.best_params_

0.904679727946
0.907559828349


{'learning_rate': 1.0, 'n_estimators': 2000}

In [1]:
param_grid = {
    'max_depth': [3, 5, 7],
    'subsample': [0.5, 0.75, 1.0]
}
gbc = GradientBoostingClassifier(n_estimators = 2000, learning_rate = 1.0)
gs = GridSearchCV(gbc, param_grid)
gs.fit(X_train, y_train)
score = gs.score(X_test, y_test)
print(score)
y_pred = gs.predict(X_test)
score = roc_auc_score(y_test, y_pred)
print(score)
gs.best_params_

NameError: name 'GradientBoostingClassifier' is not defined

In [None]:
i=0