In [1]:
import pandas as pd
import numpy as np
import os
import boto3 

In [2]:
df = pd.read_csv('feature_eng/train_dataset2.csv')

In [3]:
x = df.iloc[:,1:-2]
y = df.iloc[:,-2:]

In [4]:
x = x.drop(['most_freq_os','most_freq_country','purchase_per_active_day','purchase_per_day','most_frequent_purchase_dow'], axis=1)

In [5]:
y_7_days = y.user_purchase_binary_7_days
y_14_days = y.user_purchase_binary_14_days

In [6]:
y_14_days.value_counts()

0.0    613607
1.0      6099
Name: user_purchase_binary_14_days, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=42)

In [9]:
y_train_7_days = y_train.user_purchase_binary_7_days
y_train_14_days = y_train.user_purchase_binary_14_days
y_valid_7_days = y_valid.iloc[:,:1]
y_valid_14_days = y_valid.iloc[:,-1]

In [10]:
import xgboost as xgb

In [11]:
# model for 7 days
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pipe_lgbm = Pipeline([('xgbm', xgb.XGBClassifier(random_state = 42,n_jobs =-1, verbose=3))]) 

grid_params = { 'xgbm__learning_rate': [0.005, 0.01],'xgbm__n_estimators': [50,100],
                'xgbm__num_leaves': [10,31,50], 'xgbm__class_weight':['balanced', None]}
    
gridSearch = GridSearchCV(estimator=pipe_lgbm,param_grid=grid_params, cv = 5,  scoring='roc_auc')

gridSearch.fit(x_train.values, y_train_7_days.values.ravel())
best_params = gridSearch.best_params_
print(best_params)
print(gridSearch.best_score_)

KeyboardInterrupt: 

In [11]:
bucket_name = 'advanced-ml-project' # Add your bucket name
file_name = 'test_feature2.csv'
s3 = boto3.client('s3')
obj = s3.get_object(Bucket=bucket_name, Key=file_name) # S3 uses key-value structure where key is your file name
test = pd.read_csv(obj['Body'])

In [12]:
user = test.user_id_hash
x_test = test.iloc[:,1:]

In [13]:
x_test = x_test.drop(['most_freq_os','most_freq_country','purchase_per_active_day','purchase_per_day','most_frequent_purchase_dow'], axis=1)

In [14]:
xgb_7days = xgb.XGBClassifier(n_jobs =-1, verbose=3, class_weight='balanced', 
                           learning_rate=0.01, n_estimators=100, num_leaves=40)
xgb_7days.fit(x.values,y_7_days.values.ravel())
y_7_days_pred = xgb_7days.predict_proba(x_test.values)

In [15]:
xgb_14days = xgb.XGBClassifier(n_jobs =-1, verbose=3, class_weight='balanced', 
                           learning_rate=0.01, n_estimators=100, num_leaves=40)
xgb_14days.fit(x.values,y_14_days.values.ravel())
y_14_days_pred = xgb_14days.predict_proba(x_test.values)

In [16]:
d = {'user_id_hash': user, 'user_purchase_binary_7_days': y_7_days_pred[:,-1], 'user_purchase_binary_14_days': y_14_days_pred[:,-1]}

In [17]:
predictions = pd.DataFrame(data=d)

In [18]:
predictions.head()

Unnamed: 0,user_id_hash,user_purchase_binary_14_days,user_purchase_binary_7_days
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.184326,0.184296
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.184265,0.184296
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.184326,0.184296
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.186634,0.18515
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,0.184265,0.184296


In [19]:
xgb_7days.feature_importances_

array([0.00142857, 0.        , 0.13428572, 0.00428571, 0.00714286,
       0.00714286, 0.        , 0.        , 0.        , 0.38857144,
       0.        , 0.        , 0.32      , 0.03714286, 0.1       ],
      dtype=float32)

In [20]:
x.columns

Index(['most_freq_dow', 'most_freq_hour', 'median_create_session_time',
       'session_count', 'session_durations', 'mean_session_durations',
       'mean_session_interval', 'most_freq_region', 'most_freq_city',
       'purchase_counts', 'most_frequent_purchase_hour', 'purchase_perc',
       'total_amount', 'session_per_active_day', 'session_per_day'],
      dtype='object')

In [21]:
predictions.to_csv('submission.csv', index=False)