# Purpose of this model

This notebook measures how customer features (age group, gender, income and membership date) relate to each customer's total spend, using a correlation matrix and a linear regression.

# Import libraries

In [96]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import statsmodels.api as sm
from scipy import stats

from sklearn.metrics import mean_squared_error, mean_absolute_error

import os

In [97]:
# Show every float in DataFrame output with exactly two decimals.
pd.set_option('display.float_format', '{:.2f}'.format)

In [98]:
# The cleaned CSVs live in a `clean_data` folder inside the parent of the
# current working directory. Build both paths with os.path.join so the
# separator matches the host OS instead of hard-coding a Windows backslash.
loc = os.path.normpath(os.path.join(os.getcwd(), os.pardir))
clean_data_loc = os.path.join(loc, 'clean_data')

# Import dataset

In [99]:
# Load the cleaned customer profiles. os.path.join inserts the separator that
# is correct for the host OS rather than a literal backslash in the file name.
profile_df = pd.read_csv(os.path.join(clean_data_loc, "clean_profile.csv"))

# Age bands are left-closed because right=False:
# [18, 30), [30, 50), [50, 65), [65, inf).
# NOTE: the last band therefore includes age 65 even though it is labelled ">65".
bins = [18, 30, 50, 65, np.inf]
labels = ["18-29", "30-49", "50-64", ">65"]

profile_df['age_group'] = pd.cut(profile_df['age'], bins=bins, right=False, labels=labels)
profile_df

Unnamed: 0,gender,age,id,became_member_on,income,age_group
0,F,55,0610b486422d4921ae7d2bf64640c50b,2017-07-15,112000.00,50-64
1,F,75,78afa995795e4d85b5d9ceeca43f5fef,2017-05-09,100000.00,>65
2,M,68,e2127556f4f64592b11af22de27a7932,2018-04-26,70000.00,>65
3,M,65,389bc3fa690240e798340f5a15918d5c,2018-02-09,53000.00,>65
4,M,58,2eeac8d8feae4a8cad5a6af0499a211d,2017-11-11,51000.00,50-64
...,...,...,...,...,...,...
16995,F,43,5c686d09ca4d475a8f750f2ba07e0440,2016-09-01,69000.00,30-49
16996,F,64,d9ca82f550ac4ee58b6299cf1e5c824a,2016-04-15,60000.00,50-64
16997,M,68,ca45ee1883624304bac1e4c8a114f045,2018-03-05,59000.00,>65
16998,F,58,a9a20fa8b5504360beb4e7c8712f8306,2016-01-16,60000.00,50-64


In [100]:
# Encode every customer attribute as an integer code.
gender_levels = ['O', 'F', 'M']  # position in this list is the code: O=0, F=1, M=2
member_since = pd.to_datetime(profile_df['became_member_on'])

profile_df['age_group_code'] = profile_df['age_group'].cat.codes
profile_df['gender_code'] = pd.Categorical(profile_df['gender'], categories=gender_levels).codes
# Income and membership date are each cut into 4 bins; the bin number is the code.
profile_df['income_code'] = pd.cut(profile_df['income'], bins=4).cat.codes
profile_df['loyalty_code'] = pd.cut(member_since, bins=4).cat.codes

# Keep only the customer id and the encoded features.
code_columns = ['id', 'age_group_code', 'gender_code', 'income_code', 'loyalty_code']
customer_df = profile_df[code_columns]
customer_df.head()

Unnamed: 0,id,age_group_code,gender_code,income_code,loyalty_code
0,0610b486422d4921ae7d2bf64640c50b,2,1,3,3
1,78afa995795e4d85b5d9ceeca43f5fef,3,1,3,3
2,e2127556f4f64592b11af22de27a7932,3,2,1,3
3,389bc3fa690240e798340f5a15918d5c,3,2,1,3
4,2eeac8d8feae4a8cad5a6af0499a211d,2,2,0,3


In [101]:
# Load the event log and the offer catalogue; os.path.join supplies the
# separator that is correct for the host OS.
transcript_df = pd.read_csv(os.path.join(clean_data_loc, "clean_transcript.csv"))

portfolio_df = pd.read_csv(os.path.join(clean_data_loc, "clean_portfolio.csv"))
# Number the offers from the actual row count instead of assuming exactly 10
# rows, so a portfolio of a different size does not raise a length mismatch.
portfolio_df['offer_name'] = ['offer {}'.format(i) for i in range(1, len(portfolio_df) + 1)]

# Left-join the offer attributes onto every event; events with no matching
# offer keep NaN in the offer columns.
transcript_df = transcript_df.merge(portfolio_df, how='left', left_on='offer id', right_on='id')
# The join keys are no longer needed; drop them without mutating in place.
transcript_df = transcript_df.drop(['id', 'offer id'], axis=1)
transcript_df.head()

Unnamed: 0,person,time,transaction,offer received,offer viewed,offer completed,amount,reward,difficulty,duration,offer_type,email,mobile,social,web,offer_name
0,78afa995795e4d85b5d9ceeca43f5fef,0,0,1,0,0,,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0,offer 4
1,a03223e636434f42ac4c3df47e8bac43,0,0,1,0,0,,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0,offer 5
2,e2127556f4f64592b11af22de27a7932,0,0,1,0,0,,2.0,10.0,7.0,discount,1.0,1.0,0.0,1.0,offer 10
3,8ec6ce2a7e7949b1bf142def7d0e0586,0,0,1,0,0,,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0,offer 7
4,68617ca6246f4fbc85e91a2a49552598,0,0,1,0,0,,10.0,10.0,5.0,bogo,1.0,1.0,1.0,1.0,offer 2


In [102]:
# Per-customer totals: summed transaction amount and summed 'offer received' flags.
value_columns = ['amount', 'offer received']
per_person_totals = transcript_df.groupby('person')[value_columns].sum()
# Turn the 'person' index back into an ordinary column for the later merge.
sales_df = per_person_totals.reset_index()
sales_df

Unnamed: 0,person,amount,offer received
0,0009655768c64bdeb2e877511632db8f,127.60,5
1,00116118485d4dfda04fdbaba9a87b5c,4.09,2
2,0011e0d4e6b944f998e987f904e8c1e5,79.46,5
3,0020c2b971eb4e9188eac86d93036a77,196.86,5
4,0020ccbbb6d84e358d3414a3ff76cffd,154.05,4
...,...,...,...
16995,fff3ba4757bd42088c044ca26d73817a,580.98,6
16996,fff7576017104bcc8677a8d63322b5e1,29.94,5
16997,fff8957ea8b240a6b5e634b6ee8eafcf,12.15,3
16998,fffad4f4828548d1b5583907f2e9906b,88.83,4


In [103]:
# Attach the encoded customer features to each customer's totals, then drop
# the two identifier columns used for the join.
# NOTE: this cell rebinds `sales_df`; re-running it without first re-running
# the aggregation cell fails because 'person' is no longer a column.
sales_df = (
    sales_df
    .merge(customer_df, how='left', left_on='person', right_on='id')
    .drop(['person', 'id'], axis=1)
)
sales_df.head()

Unnamed: 0,amount,offer received,age_group_code,gender_code,income_code,loyalty_code
0,127.6,5,1,2,1,2
1,4.09,2,3,2,0,3
2,79.46,5,1,0,1,3
3,196.86,5,2,1,2,2
4,154.05,4,0,1,1,2


In [104]:
# Pairwise Pearson correlation between every column of sales_df.
sales_df.corr(method='pearson')

Unnamed: 0,amount,offer received,age_group_code,gender_code,income_code,loyalty_code
amount,1.0,0.08,0.07,-0.13,0.29,-0.15
offer received,0.08,1.0,-0.01,0.01,-0.0,0.01
age_group_code,0.07,-0.01,1.0,-0.11,0.27,-0.0
gender_code,-0.13,0.01,-0.11,1.0,-0.19,-0.02
income_code,0.29,-0.0,0.27,-0.19,1.0,-0.02
loyalty_code,-0.15,0.01,-0.0,-0.02,-0.02,1.0


In [105]:
def reg_model(sales_df, target='amount'):
    """Fit a linear regression on a DataFrame and print its in-sample fit.

    Parameters
    ----------
    sales_df : pandas.DataFrame
        One row per observation. Must contain the ``target`` column; every
        other column is used as a predictor.
    target : str, default 'amount'
        Name of the column to predict. The default keeps existing
        ``reg_model(sales_df)`` calls working unchanged.

    Returns
    -------
    LinearRegression
        The fitted scikit-learn model.

    Notes
    -----
    The model is fitted and evaluated on the same rows, so the printed
    mean squared / absolute errors are training errors, not held-out errors.
    """
    # Everything except the target column is a predictor.
    X = sales_df.drop(columns=[target])
    y = sales_df[target]

    model = LinearRegression(fit_intercept=True)

    model.fit(X, y)
    # Predictions on the training rows themselves (in-sample).
    y_pred = model.predict(X)

    print("coef =", model.coef_)
    print("intercept =", model.intercept_)

    print(
      'mean_squared_error : ', mean_squared_error(y, y_pred))
    print(
      'mean_absolute_error : ', mean_absolute_error(y, y_pred))

    return model

In [106]:
# Fit on the full per-customer table. The returned model is the cell's last
# expression, so its repr is displayed after the printed metrics.
reg_model(sales_df)

coef = [  9.99004364  -2.40918263 -20.38175593  37.91658311 -20.60299944]
intercept = 103.38437332867453
mean_squared_error :  14012.274170626422
mean_absolute_error :  73.33951418247793


LinearRegression()

# Summary

In [107]:
# X and y are local to reg_model and do not exist at notebook level, so build
# them here; otherwise this cell raises NameError on a fresh kernel.
X = sales_df.drop(['amount'], axis=1)
y = sales_df['amount']

# sm.OLS does not add an intercept by itself; add_constant supplies the
# constant column so the fit includes one.
X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                 amount   R-squared:                       0.116
Model:                            OLS   Adj. R-squared:                  0.116
Method:                 Least Squares   F-statistic:                     447.1
Date:                Fri, 26 Aug 2022   Prob (F-statistic):               0.00
Time:                        16:50:05   Log-Likelihood:            -1.0528e+05
No. Observations:               17000   AIC:                         2.106e+05
Df Residuals:                   16994   BIC:                         2.106e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const            103.3844      5.819     17.