In [31]:
import pandas as pd
import numpy as np
import copy
from scipy.special import softmax
import DataLoad as data
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.metrics import recall_score, f1_score, accuracy_score

In [32]:
def split_data(sessions, day_to_split= '2020-3-12'):
    test = sessions_df.loc[sessions_df['timestamp'] > '2020-3-12']
    train = sessions_df.loc[sessions_df['timestamp'] <= '2020-3-12']
    return train, test

In [33]:
sessions_df = data.load_sessions_data()
train, test = split_data(sessions_df)

In [34]:
users_df = data.load_users_data()
products_df = data.load_products_data()

In [35]:
class ClassificationModel:
    
    def __init__(self, n_estimators=20, max_depth=5, random_state=0):
        self.n_estimators= n_estimators
        self.max_depth = max_depth
        self.random_state = random_state
        self.clf = RandomForestClassifier(n_estimators = self.n_estimators, 
                                          max_depth = self.max_depth, 
                                          random_state = self.random_state)
        
    def train(self, users_df, sessions, products):
        users = copy.deepcopy(users_df)
        users = data.favourite_products(users, sessions, products)
        users = data.spendings(users, sessions, products)
        users = data.discounts_stats(users, sessions)
        users = data.discounts_label(users, sessions)
        users = users.set_index('user_id')
        users = users.drop(['name', 'city', 'street',], axis=1)
        users = users.fillna(0)
        y_train = users['label']
        X_train = users.drop('label', axis=1)
        self.clf.fit(X_train, y_train)
        
    def predict(self, users_df, sessions, products):
        users = copy.deepcopy(users_df)
        users = data.favourite_products(users, sessions, products)
        users = data.spendings(users, sessions, products)
        users = data.discounts_stats(users, sessions)
        users = users.set_index('user_id')
        users = users.drop(['name', 'city', 'street',], axis=1)
        users = users.fillna(0)
        return self.clf.predict(users)

In [36]:
model = ClassificationModel()

In [37]:
model.train(users_df, train, products_df)

In [38]:
model.predict(users_df, test, products_df)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1], dtype=int64)

In [39]:
pd.options.mode.chained_assignment = None
cleared_sessions_df = train.loc[ (~sessions_df['user_id'].isna()) & (~sessions_df['product_id'].isna())]
cleared_test_sessions_df = test.loc[ (~sessions_df['user_id'].isna()) & (~sessions_df['product_id'].isna())]

In [40]:
class RegressionModel:
    def __init__(self, classification_model, train_sessions):
        self.reg_list = []
        self.selected_users = []
        self.classification_model = classification_model
        self.train_sessions = train_sessions

    def get_buys(self, uid):
        drop_list = []
        tmp = cleared_sessions_df.loc[(cleared_sessions_df['user_id'] == uid) & (cleared_sessions_df['event_type_BUY_PRODUCT'] == 1)]
        i = -1
        for index, row in tmp.iterrows():
            if index in drop_list:
                continue
            i = i + 1
            count = 1
            sum = row['offered_discount']
            j = i
            tmp2 = tmp[(i + 1):]
            for index2, row2 in tmp2.iterrows():
                j = j + 1
                if row['product_id'] == row2['product_id']:
                    count = count + 1
                    sum = sum + row2['offered_discount']
                    tmp.drop(tmp.index[j], inplace=True)
                    j = j - 1
                    drop_list.append(index2)
            if count > 1:
                 tmp.at[index, 'offered_discount'] = sum / count

        return tmp

    def get_product_category(self, pid):
        tmp = products_df.loc[((products_df['product_id']).isin(pid))]
        tmp = tmp.drop(['product_name', 'price', 'product_id'], axis=1)
        return tmp


    def set_selected_users(self):
        i = -1
        selection = self.classification_model.predict(users_df, test, products_df)
        for user in users_df['user_id']:
            i = i + 1
            if selection[i] == 1:
                self.selected_users.append(user)


    def train(self):
        for id in self.selected_users:
            reg = linear_model.LinearRegression()
            user_buys = self.get_buys(id)
            if len(user_buys.index) > 0:
                reg.fit(model2.get_product_category(user_buys['product_id']), user_buys['offered_discount'])
                self.reg_list.append([id, reg])
            else:
                self.reg_list.append([id, 0])



In [41]:
model2 = RegressionModel(model, cleared_sessions_df)

In [42]:
model2.set_selected_users()
model2.train()

In [47]:
discounts = []
last_id = ''
count = 0
for index, row in cleared_test_sessions_df.iterrows():
    if row['user_id'] not in model2.selected_users:
        if row['user_id'] != last_id:
            count = count + 1
        last_id = row['user_id']
        discounts.append(0)
        continue
    if model2.reg_list[int(row['user_id']) - 102 - count][1] == 0:
        discounts.append(0)
    else:
        discounts.append(model2.reg_list[int(row['user_id']) - 102 - count][1].predict(model2.get_product_category([row['product_id']])))
discounts

[array([15.73333333]),
 array([15.73333333]),
 array([15.73333333]),
 array([15.73333333]),
 array([15.73333333]),
 array([15.73333333]),
 array([15.73333333]),
 array([15.73333333]),
 array([15.73333333]),
 array([17.2]),
 array([15.]),
 array([15.]),
 array([15.]),
 array([15.]),
 array([15.]),
 array([15.]),
 array([15.]),
 array([15.]),
 array([18.13333333]),
 array([18.13333333]),
 array([18.13333333]),
 array([18.13333333]),
 array([18.13333333]),
 array([18.13333333]),
 array([18.13333333]),
 array([18.13333333]),
 array([18.13333333]),
 array([20.]),
 array([15.97544079]),
 array([15.97544079]),
 array([15.97544079]),
 array([15.97544079]),
 array([12.]),
 array([12.]),
 array([18.]),
 array([18.]),
 array([18.]),
 array([18.]),
 array([18.]),
 array([18.]),
 array([18.]),
 array([18.]),
 array([18.]),
 array([20.]),
 array([20.]),
 array([20.]),
 array([20.]),
 array([20.]),
 array([15.97544079]),
 array([19.36377102]),
 array([19.36377102]),
 array([19.36377102]),
 array([19.