In [1]:
import pandas as pd
import numpy as np
import copy
from scipy.special import softmax
import DataLoad as data
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.metrics import recall_score, f1_score, accuracy_score

In [2]:
def split_data(sessions, day_to_split= '2020-3-12'):
    test = sessions_df.loc[sessions_df['timestamp'] > '2020-3-12']
    train = sessions_df.loc[sessions_df['timestamp'] <= '2020-3-12']
    return train, test

In [3]:
sessions_df = data.load_sessions_data()
train, test = split_data(sessions_df)

In [4]:
users_df = data.load_users_data()
products_df = data.load_products_data()

In [5]:
class ClassificationModel:
    
    def __init__(self, n_estimators=20, max_depth=5, random_state=0):
        self.n_estimators= n_estimators
        self.max_depth = max_depth
        self.random_state = random_state
        self.clf = RandomForestClassifier(n_estimators = self.n_estimators, 
                                          max_depth = self.max_depth, 
                                          random_state = self.random_state)
        
    def train(self, users_df, sessions, products):
        users = copy.deepcopy(users_df)
        users = data.favourite_products(users, sessions, products)
        users = data.spendings(users, sessions, products)
        users = data.discounts_stats(users, sessions)
        users = data.discounts_label(users, sessions)
        users = users.set_index('user_id')
        users = users.drop(['name', 'city', 'street',], axis=1)
        users = users.fillna(0)
        y_train = users['label']
        X_train = users.drop('label', axis=1)
        self.clf.fit(X_train, y_train)
        
    def predict(self, users_df, sessions, products):
        users = copy.deepcopy(users_df)
        users = data.favourite_products(users, sessions, products)
        users = data.spendings(users, sessions, products)
        users = data.discounts_stats(users, sessions)
        users = users.set_index('user_id')
        users = users.drop(['name', 'city', 'street',], axis=1)
        users = users.fillna(0)
        return self.clf.predict(users)

In [6]:
model = ClassificationModel()

In [7]:
model.train(users_df, train, products_df)

In [8]:
model.predict(users_df, test, products_df)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1], dtype=int64)

In [10]:
pd.options.mode.chained_assignment = None
cleared_sessions_df = train.loc[ (~sessions_df['user_id'].isna()) & (~sessions_df['product_id'].isna())]
cleared_test_sessions_df = test.loc[ (~sessions_df['user_id'].isna()) & (~sessions_df['product_id'].isna())]

In [66]:
class RegressionModel:
    def __init__(self, classification_model, train_sessions):
        self.reg_list = []
        self.selected_users = []
        self.classification_model = classification_model
        self.train_sessions = train_sessions

    def get_buys(self, uid):
        drop_list = []
        tmp = cleared_sessions_df.loc[(cleared_sessions_df['user_id'] == uid) & (cleared_sessions_df['event_type_BUY_PRODUCT'] == 1)]
        i = -1
        for index, row in tmp.iterrows():
            if index in drop_list:
                continue
            i = i + 1
            count = 1
            sum = row['offered_discount']
            j = i
            tmp2 = tmp[(i + 1):]
            for index2, row2 in tmp2.iterrows():
                j = j + 1
                if row['product_id'] == row2['product_id']:
                    count = count + 1
                    sum = sum + row2['offered_discount']
                    tmp.drop(tmp.index[j], inplace=True)
                    j = j - 1
                    drop_list.append(index2)
            if count > 1:
                 tmp.at[index, 'offered_discount'] = sum / count

        return tmp

    def get_product_category(self, pid):
        tmp = products_df.loc[((products_df['product_id']).isin(pid))]
        tmp = tmp.drop(['product_name', 'price', 'product_id'], axis=1)
        return tmp


    def set_selected_users(self):
        i = -1
        selection = self.classification_model.predict(users_df, test, products_df)
        for user in users_df['user_id']:
            i = i + 1
            if selection[i] == 1:
                self.selected_users.append(user)


    def train(self):
        for id in self.selected_users:
            reg = linear_model.LinearRegression()
            user_buys = self.get_buys(id)
            if len(user_buys.index) > 0:
                reg.fit(model2.get_product_category(user_buys['product_id']), user_buys['offered_discount'])
                self.reg_list.append([id, reg])
            else:
                self.reg_list.append([id, 0])

    def find(self, id):
        i = 0
        for user in self.selected_users:
            if id == user:
                return i
            i = i + 1
        return -1


    def predict(self, index):
        #row = sessions_df.loc[sessions_df.index == index]
        row = cleared_test_sessions_df.loc[cleared_test_sessions_df.index == index]
        id = self.find(int(row['user_id']))
        if id == -1:
            return 0
        if self.reg_list[id][1] == 0:
            return 0
        return self.reg_list[id][1].predict(self.get_product_category([row['product_id']]))
    
    def predict2(self, user_id, product_id):
        id = self.find(user_id)
        if id == -1:
            return 0
        if self.reg_list[id][1] == 0:
            return 0
        return self.reg_list[id][1].predict(self.get_product_category([product_id])) 


In [67]:
model2 = RegressionModel(model, cleared_sessions_df)

In [68]:
model2.set_selected_users()
model2.train()

In [72]:
model2.predict(37)

array([15.73333333])

In [73]:
model2.predict2(102, 1040)

array([15.73333333])

In [64]:
sessions_df.loc[sesions]

Unnamed: 0,session_id,timestamp,user_id,product_id,offered_discount,purchase_id,event_type_BUY_PRODUCT,event_type_VIEW_PRODUCT
0,100001,2020-01-15 13:44:51,102.0,1017.0,0,,0,1
1,100001,2020-01-15 13:48:01,102.0,1025.0,0,,0,1
2,100001,2020-01-15 13:49:47,102.0,1030.0,0,,0,1
3,100001,2020-01-15 13:50:34,102.0,1032.0,0,,0,1
4,100001,2020-01-15 13:52:02,102.0,1033.0,0,,0,1
...,...,...,...,...,...,...,...,...
55105,110273,2020-02-17 03:13:41,301.0,1012.0,15,,0,1
55106,110273,2020-02-17 03:17:18,301.0,1013.0,15,,0,1
55107,110274,2020-02-29 22:45:54,,1001.0,0,,0,1
55108,110275,2020-01-24 18:01:37,301.0,1283.0,0,,0,1


Test

In [22]:
len(cleared_test_sessions_df.index.tolist())

10082

In [52]:
class BaseModel:
    def __init__(self):
        self.discounts = {}
        
    def train(self, data, sessions):
        for i in data['product_id'].unique():
            if np.isfinite(i):
                discounts = sessions.loc[(sessions['product_id'] == i) 
                                                          & (sessions['event_type_BUY_PRODUCT'] == 1) ]['offered_discount']
                if len(discounts) != 0:
                    self.discounts[i] = np.average(discounts)
                else:
                    self.discounts[i] = np.random.randint(21)
                
    def predict(self, product_id):
        try:
            return self.discounts[product_id]
        except:
            return 0

In [53]:
base_model = BaseModel()

In [54]:
base_model.train(products_df, cleared_sessions_df)

In [57]:
score = 0.0
base_score = 0.0
for i in cleared_test_sessions_df.index.tolist():
    pred = model2.predict(i)
    base_pred = base_model.predict(cleared_test_sessions_df.loc[cleared_test_sessions_df.index == i]['product_id'].tolist()[0])
    if pred >= cleared_test_sessions_df.loc[cleared_test_sessions_df.index == i]['offered_discount'].tolist()[0]:
        score += 1 * (100 - pred)/100
    if base_pred >= cleared_test_sessions_df.loc[cleared_test_sessions_df.index == i]['offered_discount'].tolist()[0]:
        base_score += 1 * (100 - base_pred)/100

In [58]:
print(score)
print(base_score)

[7208.92184158]
7015.5522048573575


In [28]:
cleared_test_sessions_df.index

Int64Index([   37,    38,    39,    40,    41,    42,    43,    44,    45,
               46,
            ...
            55087, 55088, 55089, 55090, 55091, 55092, 55093, 55094, 55095,
            55109],
           dtype='int64', length=10082)

In [40]:
cleared_test_sessions_df.loc[cleared_test_sessions_df.index == 37]['offered_discount'].tolist()[0]

15