In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import os
import boto3 

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


### Import Data

In [3]:
# Stream the labelled training set from S3 straight into a DataFrame.
s3 = boto3.client('s3')
bucket_name = 'advanced-ml-project'  # Add your bucket name
file_name = 'train_dataset.csv'
# S3 is a key-value store: the object key is simply the file name.
obj = s3.get_object(Key=file_name, Bucket=bucket_name)
df = pd.read_csv(obj['Body'])

In [4]:
# Stream the unlabelled test features from the same S3 bucket into a DataFrame.
s3 = boto3.client('s3')
bucket_name = 'advanced-ml-project'  # Add your bucket name
file_name = 'test_feature.csv'
# S3 is a key-value store: the object key is simply the file name.
obj = s3.get_object(Key=file_name, Bucket=bucket_name)
test = pd.read_csv(obj['Body'])

In [8]:
# Feature matrix: every column except the first one and the last two
# (the last two columns are split off separately as the targets).
# NOTE(review): the first column is presumably the user id hash, as in the test file - confirm.
x = df.iloc[:,1:-2]

In [9]:
# Targets: the last two columns of the training frame
# (the 7-day and 14-day purchase flags).
y = df.iloc[:,-2:]

In [80]:
# One label vector per prediction horizon, selected by column name.
y_7_days = y['user_purchase_binary_7_days']
y_14_days = y['user_purchase_binary_14_days']

In [81]:
# Class balance of the 7-day target (positives are a small minority).
y_7_days.value_counts()

0.0    614996
1.0      4710
Name: user_purchase_binary_7_days, dtype: int64

In [82]:
# Class balance of the 14-day target (positives are a small minority).
y_14_days.value_counts()

0.0    613607
1.0      6099
Name: user_purchase_binary_14_days, dtype: int64

In [85]:
# Model input for the test set: everything after the first column.
# NOTE(review): the dropped first column is presumably user_id_hash - confirm.
x_test = test.iloc[:, 1:]
# Ids are kept separately so they can be attached to the predictions later.
user = test['user_id_hash']

In [87]:
# Number of rows (user ids) in the test set.
len(user)

312568

In [88]:
# Number of distinct user ids; equal to len(user) when no id is repeated.
len(user.unique())

312568

### Train/validation split

In [10]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split with a fixed seed. Both target columns travel together
# in `y`, so the two horizons share identical train/validation rows.
x_train, x_valid, y_train, y_valid = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Size of the training portion (80% of the rows).
len(x_train)

495764

In [12]:
# Size of the validation portion (20% of the rows).
len(x_valid)

123942

In [70]:
# Training labels for each horizon, selected by column name.
y_train_7_days = y_train['user_purchase_binary_7_days']
y_train_14_days = y_train['user_purchase_binary_14_days']

In [61]:
# Validation labels for each horizon, selected by column name.
# NOTE: the 7-day target is deliberately kept as a one-column DataFrame
# (not a Series), unlike the other label vectors; later cells access its
# column by name.
y_valid_7_days = y_valid[['user_purchase_binary_7_days']]
y_valid_14_days = y_valid['user_purchase_binary_14_days']

In [72]:
# Class balance of the 7-day target within the training split.
y_train_7_days.value_counts()

0.0    491964
1.0      3800
Name: user_purchase_binary_7_days, dtype: int64

In [73]:
# Class balance of the 14-day target within the training split.
y_train_14_days.value_counts()

0.0    490817
1.0      4947
Name: user_purchase_binary_14_days, dtype: int64

In [75]:
# Class balance of the 7-day target within the validation split.
# y_valid_7_days is a one-column DataFrame, so the column is selected first.
y_valid_7_days.user_purchase_binary_7_days.value_counts()

0.0    123032
1.0       910
Name: user_purchase_binary_7_days, dtype: int64

In [76]:
# Class balance of the 14-day target within the validation split.
y_valid_14_days.value_counts()

0.0    122790
1.0      1152
Name: user_purchase_binary_14_days, dtype: int64

### Tune hyperparameters

In [36]:
# model for 7 days
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Single-step pipeline; the step name is why every grid key carries the
# 'lgbm__' prefix.
pipe_lgbm = Pipeline(steps=[
    ('lgbm', lgb.LGBMClassifier(random_state=42, n_jobs=-1, verbose=3)),
])

# 2 x 2 x 3 x 2 = 24 candidate settings.
grid_params = {
    'lgbm__learning_rate': [0.005, 0.01],
    'lgbm__n_estimators': [50, 100],
    'lgbm__num_leaves': [31, 50, 100],
    'lgbm__class_weight': ['balanced', None],
}

# Each candidate is scored by 5-fold cross-validated ROC AUC.
gridSearch = GridSearchCV(pipe_lgbm, grid_params, scoring='roc_auc', cv=5)

gridSearch.fit(x_train.values, y_train_7_days.values.ravel())
best_params = gridSearch.best_params_
print(best_params)
print(gridSearch.best_score_)

{'lgbm__class_weight': 'balanced', 'lgbm__learning_rate': 0.01, 'lgbm__n_estimators': 100, 'lgbm__num_leaves': 31}
0.9428930949933736


In [41]:
from sklearn.metrics import roc_auc_score

In [40]:
# Refit the 7-day model on the training split and score the hold-out rows.
# The hyper-parameters are copied by hand from the 7-day grid-search printout;
# update them if that search is re-run.
model = lgb.LGBMClassifier(random_state=42, n_jobs=-1, verbose=3, class_weight='balanced',
                           learning_rate=0.01, n_estimators=100, num_leaves=31)
model.fit(x_train.values, y_train_7_days.values.ravel())
# Predict on the bare array as well: the model was fitted without column
# names, so this keeps the input type consistent between fit and predict.
valid_pred = model.predict_proba(x_valid.values)

In [51]:
# Hold-out ROC AUC for the 7-day model, scored on the last predict_proba
# column (the positive class for these 0/1 labels).
roc_auc_score(y_valid_7_days,valid_pred[:,-1])

0.9425025536106393

In [77]:
# model for 14 days
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Single-step pipeline; the step name is why every grid key carries the
# 'lgbm__' prefix.
pipe_lgbm = Pipeline(steps=[
    ('lgbm', lgb.LGBMClassifier(random_state=42, n_jobs=-1, verbose=3)),
])

# Same 24-candidate grid as the 7-day search.
grid_params = {
    'lgbm__learning_rate': [0.005, 0.01],
    'lgbm__n_estimators': [50, 100],
    'lgbm__num_leaves': [31, 50, 100],
    'lgbm__class_weight': ['balanced', None],
}

# Each candidate is scored by 5-fold cross-validated ROC AUC.
gridSearch = GridSearchCV(pipe_lgbm, grid_params, scoring='roc_auc', cv=5)

gridSearch.fit(x_train.values, y_train_14_days)
best_params = gridSearch.best_params_
print(best_params)
print(gridSearch.best_score_)

{'lgbm__class_weight': 'balanced', 'lgbm__learning_rate': 0.01, 'lgbm__n_estimators': 100, 'lgbm__num_leaves': 50}
0.936600317841399


In [79]:
# Refit the 14-day model on the training split and score the hold-out rows.
# The hyper-parameters are copied by hand from the 14-day grid-search printout;
# update them if that search is re-run.
model2 = lgb.LGBMClassifier(random_state=42, n_jobs=-1, verbose=3, class_weight='balanced',
                            learning_rate=0.01, n_estimators=100, num_leaves=50)
model2.fit(x_train.values, y_train_14_days.values.ravel())
# Predict on the bare array as well: the model was fitted without column
# names, so this keeps the input type consistent between fit and predict.
valid_pred_2 = model2.predict_proba(x_valid.values)
# Hold-out ROC AUC, scored on the positive-class (last) probability column.
roc_auc_score(y_valid_14_days, valid_pred_2[:, -1])

0.9375019900451085

### Fit on all training data and predict the test set

In [89]:
# 7 days
lgbm_7days = lgb.LGBMClassifier(random_state = 42,n_jobs =-1, verbose=3, class_weight='balanced', 
                                learning_rate=0.01, n_estimators=100, num_leaves=31)
lgbm_7days.fit(x.values,y_7_days.values.ravel())
y_7_days_pred = lgbm_7days.predict_proba(x_test)

In [90]:
# 14 days
lgbm_14days = lgb.LGBMClassifier(random_state = 42,n_jobs =-1, verbose=3, class_weight='balanced', 
                                learning_rate=0.01, n_estimators=100, num_leaves=50)
lgbm_14days.fit(x.values,y_14_days.values.ravel())
y_14_days_pred = lgbm_14days.predict_proba(x_test)

In [104]:
# Submission columns: the user id plus, for each horizon, the last
# predict_proba column (the positive-class probability).
d = {
    'user_id_hash': user,
    'user_purchase_binary_7_days': y_7_days_pred[:, -1],
    'user_purchase_binary_14_days': y_14_days_pred[:, -1],
}

In [105]:
# Assemble the submission table from the column mapping.
predictions = pd.DataFrame(d)

In [107]:
# Quick visual check of the first few submission rows.
predictions.head()

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.210365,0.22587
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.186226,0.188047
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.225233,0.22319
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.391945,0.379102
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,0.186226,0.188047


In [108]:
# Write the submission file to the working directory, without the row index.
predictions.to_csv('submission.csv', index=False)