In [1]:
import pandas as pd
import numpy as np
import copy
from scipy.special import softmax
import DataLoad as data
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.metrics import recall_score, f1_score, accuracy_score

In [2]:
def split_data(sessions, day_to_split= '2020-3-12'):
    """Split a sessions frame into train and test parts by timestamp.

    Rows whose ``'timestamp'`` is strictly greater than ``day_to_split``
    go to the test part; rows less than or equal to it go to the train part.
    Rows whose timestamp compares as neither (e.g. missing values) end up in
    neither part, exactly as the two independent comparisons dictate.

    Parameters
    ----------
    sessions : pandas.DataFrame
        Frame with a ``'timestamp'`` column.
        NOTE(review): presumably datetime64 so that the date string is parsed
        by pandas; with a plain string column the comparison would be
        lexicographic -- confirm against the loader.
    day_to_split : str, default '2020-3-12'
        Cut-off value compared against the ``'timestamp'`` column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)`` selections of ``sessions``.
    """
    # Use the arguments that were passed in: relying on a notebook-level
    # global here would ignore both parameters and break on a fresh kernel.
    timestamps = sessions['timestamp']
    test = sessions.loc[timestamps > day_to_split]
    train = sessions.loc[timestamps <= day_to_split]
    return train, test

In [3]:
# Load the sessions table through the project's DataLoad module, then split
# it by timestamp into train/test parts using split_data's default cut-off.
sessions_df = data.load_sessions_data()
train, test = split_data(sessions_df)

In [4]:
# Load the user and product tables; both are passed to the model's
# train/predict calls further down.
users_df = data.load_users_data()
products_df = data.load_products_data()

In [5]:
class ClassificationModel:
    """Random-forest classifier over per-user features built from sessions.

    Features are produced by the project's ``DataLoad`` helpers
    (``favourite_products``, ``spendings``, ``discounts_stats``); the training
    target is the ``'label'`` column added by ``discounts_label``.
    """

    def __init__(self, n_estimators=20, max_depth=5, random_state=0):
        """Store the hyper-parameters and create the underlying forest.

        Parameters
        ----------
        n_estimators : int, default 20
            Number of trees passed to ``RandomForestClassifier``.
        max_depth : int, default 5
            Maximum tree depth passed to ``RandomForestClassifier``.
        random_state : int, default 0
            Seed passed to ``RandomForestClassifier``.
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state
        self.clf = RandomForestClassifier(n_estimators=self.n_estimators,
                                          max_depth=self.max_depth,
                                          random_state=self.random_state)

    def _build_features(self, users_df, sessions, products, with_label=False):
        """Build the per-user frame shared by ``train`` and ``predict``.

        Works on a deep copy so the caller's ``users_df`` is left untouched.
        When ``with_label`` is true, ``data.discounts_label`` is applied after
        the feature helpers, which is where ``train`` gets its ``'label'``
        column from. The result is indexed by ``'user_id'``, has the
        ``'name'``, ``'city'`` and ``'street'`` columns removed, and has
        missing values replaced with 0.
        """
        users = copy.deepcopy(users_df)
        users = data.favourite_products(users, sessions, products)
        users = data.spendings(users, sessions, products)
        users = data.discounts_stats(users, sessions)
        if with_label:
            users = data.discounts_label(users, sessions)
        users = users.set_index('user_id')
        users = users.drop(['name', 'city', 'street'], axis=1)
        return users.fillna(0)

    def train(self, users_df, sessions, products):
        """Fit the forest on features and labels derived from ``sessions``.

        Parameters
        ----------
        users_df : pandas.DataFrame
            User table; must contain ``'user_id'``, ``'name'``, ``'city'``
            and ``'street'`` columns.
        sessions, products
            Passed through unchanged to the ``DataLoad`` helpers.
        """
        users = self._build_features(users_df, sessions, products,
                                     with_label=True)
        y_train = users['label']
        X_train = users.drop('label', axis=1)
        self.clf.fit(X_train, y_train)

    def predict(self, users_df, sessions, products):
        """Return the forest's predictions, one per row of ``users_df``.

        The feature frame is built exactly as in ``train`` but without the
        label step. Parameters have the same meaning as in ``train``.
        """
        features = self._build_features(users_df, sessions, products)
        return self.clf.predict(features)

In [6]:
# Create the model with its default hyper-parameters
# (n_estimators=20, max_depth=5, random_state=0).
model = ClassificationModel()

In [7]:
# Fit on all users, with features and labels computed from the train-period
# sessions only.
model.train(users_df, train, products_df)

In [8]:
# Predict one label per user from features computed on the test-period
# sessions; the returned array is displayed as the cell output.
model.predict(users_df, test, products_df)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1], dtype=int64)