# The Data

We have been given a small sample data from our customers. The data has been aggregated from individual purchases / transactions across the time period (e.g., last year). The goal is to see if we can predict the spending amounts for the next time period (e.g., next year).

In [1]:
# import "standard" packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# import packages to run regression
from statsmodels.formula.api import ols

# Another one from sklearn
from sklearn.linear_model import LinearRegression

In [2]:
# Read in the data
cust = pd.read_csv('./data/cust_small_clean.csv')
cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 535 entries, 0 to 534
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   cust_id             535 non-null    object
 1   join_date           535 non-null    object
 2   gender              535 non-null    object
 3   age                 535 non-null    int64 
 4   marital_status      535 non-null    object
 5   household_income    535 non-null    int64 
 6   home_ownership      535 non-null    object
 7   num_children        535 non-null    int64 
 8   num_vehicles        535 non-null    int64 
 9   last_purchase_date  535 non-null    object
 10  spend               535 non-null    int64 
dtypes: int64(5), object(6)
memory usage: 46.1+ KB


In [3]:
# Take a peek at a few rows of data
cust.sample(5)

Unnamed: 0,cust_id,join_date,gender,age,marital_status,household_income,home_ownership,num_children,num_vehicles,last_purchase_date,spend
509,CUST-172867,2020-12-31 01:08:58,F,38,married,294970,rent,5,5,2021-04-19 19:43:48,1542
304,CUST-623305,2020-09-04 04:21:28,F,24,married,220342,unknown,3,3,2021-08-04 21:47:10,3297
97,CUST-306334,2018-04-19 00:07:33,M,55,unmarried,416893,rent,3,1,2021-06-13 18:33:57,4094
190,CUST-974879,2020-01-25 05:29:42,F,27,married,124362,unknown,4,5,2021-01-14 03:18:05,4565
56,CUST-109012,2020-06-18 21:02:13,F,40,married,310243,rent,1,3,2021-03-14 12:28:00,2963


## Descriptive Stuff

In [4]:
# statistical summaries
cust.describe()

Unnamed: 0,age,household_income,num_children,num_vehicles,spend
count,535.0,535.0,535.0,535.0,535.0
mean,37.502804,300413.493458,2.452336,2.519626,4147.779439
std,10.714524,114299.795076,1.636736,1.703458,1594.77806
min,20.0,65866.0,0.0,0.0,1480.0
25%,28.0,214390.5,1.0,1.0,2798.0
50%,38.0,306894.0,2.0,3.0,4141.0
75%,46.0,394944.0,4.0,4.0,5280.0
max,56.0,489951.0,5.0,5.0,7304.0


In [5]:
# Try again and include all columns
cust.describe(include='all')

Unnamed: 0,cust_id,join_date,gender,age,marital_status,household_income,home_ownership,num_children,num_vehicles,last_purchase_date,spend
count,535,535,535,535.000000,535,535.000000,535,535.000000,535.000000,535,535.000000
unique,535,535,2,,2,,3,,,535,
top,CUST-074173,2021-12-05 07:31:01,F,,married,,own,,,2021-06-28 12:39:52,
freq,1,1,379,,416,,336,,,1,
mean,,,,37.502804,,300413.493458,,2.452336,2.519626,,4147.779439
...,...,...,...,...,...,...,...,...,...,...,...
min,,,,20.000000,,65866.000000,,0.000000,0.000000,,1480.000000
25%,,,,28.000000,,214390.500000,,1.000000,1.000000,,2798.000000
50%,,,,38.000000,,306894.000000,,2.000000,3.000000,,4141.000000
75%,,,,46.000000,,394944.000000,,4.000000,4.000000,,5280.000000


In [6]:
cust.home_ownership.value_counts()

home_ownership
own        336
rent       145
unknown     54
Name: count, dtype: int64

## End Result for Input
We want to have all numerical variables. This means we should create *dummy* variables for `gender`, `marital_status`, and `home_ownership`. We also do not need `cust_id` since it is just a unique id. The two date columns could be used to create numerical values, but we will simply ignore them for now.

In [8]:
# What happens when we call get_dummies?
# Try to create dummy variables for gender, marital_status, and home_ownership
dummies = pd.get_dummies(cust[['gender','marital_status','home_ownership']], dtype=int)
dummies

Unnamed: 0,gender_F,gender_M,marital_status_married,marital_status_unmarried,home_ownership_own,home_ownership_rent,home_ownership_unknown
0,1,0,0,1,1,0,0
1,0,1,0,1,1,0,0
2,1,0,1,0,1,0,0
3,1,0,1,0,1,0,0
4,1,0,0,1,1,0,0
...,...,...,...,...,...,...,...
530,0,1,1,0,1,0,0
531,1,0,1,0,0,1,0
532,0,1,0,1,1,0,0
533,1,0,1,0,1,0,0


In [9]:
# Let's drop the following columns:
# cust_id, join_date, last_purchase_date
# gender, marital_status, home_ownership
cust = cust.drop(columns=['cust_id','join_date','last_purchase_date',
                          'gender','marital_status','home_ownership'])
cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 535 entries, 0 to 534
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   age               535 non-null    int64
 1   household_income  535 non-null    int64
 2   num_children      535 non-null    int64
 3   num_vehicles      535 non-null    int64
 4   spend             535 non-null    int64
dtypes: int64(5)
memory usage: 21.0 KB


In [10]:
# We now need to add the dummy variables
# However, remember we only need k-1 for k classes
# For gender that means just 1, ditto for marital_status
# For home_ownership we need 2
cust = pd.concat([cust,
                  dummies[['gender_F',
                           'marital_status_married',
                           'home_ownership_own',
                           'home_ownership_rent']]], axis=1)
cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 535 entries, 0 to 534
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   age                     535 non-null    int64
 1   household_income        535 non-null    int64
 2   num_children            535 non-null    int64
 3   num_vehicles            535 non-null    int64
 4   spend                   535 non-null    int64
 5   gender_F                535 non-null    int64
 6   marital_status_married  535 non-null    int64
 7   home_ownership_own      535 non-null    int64
 8   home_ownership_rent     535 non-null    int64
dtypes: int64(9)
memory usage: 37.7 KB


In [11]:
cust.describe()

Unnamed: 0,age,household_income,num_children,num_vehicles,spend,gender_F,marital_status_married,home_ownership_own,home_ownership_rent
count,535.0,535.0,535.0,535.0,535.0,535.0,535.0,535.0,535.0
mean,37.502804,300413.493458,2.452336,2.519626,4147.779439,0.708411,0.77757,0.628037,0.271028
std,10.714524,114299.795076,1.636736,1.703458,1594.77806,0.454919,0.416268,0.483781,0.444907
min,20.0,65866.0,0.0,0.0,1480.0,0.0,0.0,0.0,0.0
25%,28.0,214390.5,1.0,1.0,2798.0,0.0,1.0,0.0,0.0
50%,38.0,306894.0,2.0,3.0,4141.0,1.0,1.0,1.0,0.0
75%,46.0,394944.0,4.0,4.0,5280.0,1.0,1.0,1.0,1.0
max,56.0,489951.0,5.0,5.0,7304.0,1.0,1.0,1.0,1.0


## Using `statsmodels`

In [12]:
results = ols('spend ~ age + household_income + num_children + num_vehicles + gender_F + marital_status_married + home_ownership_own + home_ownership_rent',
              data=cust).fit()

In [13]:
results.summary()

0,1,2,3
Dep. Variable:,spend,R-squared:,0.361
Model:,OLS,Adj. R-squared:,0.351
Method:,Least Squares,F-statistic:,37.16
Date:,"Tue, 19 Sep 2023",Prob (F-statistic):,1.0099999999999999e-46
Time:,20:15:29,Log-Likelihood:,-4584.1
No. Observations:,535,AIC:,9186.0
Df Residuals:,526,BIC:,9225.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1789.5139,351.336,5.093,0.000,1099.319,2479.709
age,-7.5419,5.255,-1.435,0.152,-17.864,2.780
household_income,0.0024,0.001,4.788,0.000,0.001,0.003
num_children,-48.7476,34.147,-1.428,0.154,-115.829,18.334
num_vehicles,-46.2889,32.849,-1.409,0.159,-110.819,18.242
gender_F,1028.5860,123.125,8.354,0.000,786.708,1270.464
marital_status_married,865.5002,134.854,6.418,0.000,600.582,1130.418
home_ownership_own,1237.5127,191.662,6.457,0.000,860.996,1614.030
home_ownership_rent,-95.2526,206.110,-0.462,0.644,-500.152,309.647

0,1,2,3
Omnibus:,32.458,Durbin-Watson:,1.997
Prob(Omnibus):,0.0,Jarque-Bera (JB):,16.469
Skew:,-0.243,Prob(JB):,0.000265
Kurtosis:,2.292,Cond. No.,2260000.0


In [14]:
# create new cust
new_cust = cust.drop(columns=['home_ownership_own'])
new_cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 535 entries, 0 to 534
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   age                     535 non-null    int64
 1   household_income        535 non-null    int64
 2   num_children            535 non-null    int64
 3   num_vehicles            535 non-null    int64
 4   spend                   535 non-null    int64
 5   gender_F                535 non-null    int64
 6   marital_status_married  535 non-null    int64
 7   home_ownership_rent     535 non-null    int64
dtypes: int64(8)
memory usage: 33.6 KB


In [15]:
new_cust = pd.concat([new_cust,
                  dummies[['home_ownership_unknown']]], axis=1)
new_cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 535 entries, 0 to 534
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   age                     535 non-null    int64
 1   household_income        535 non-null    int64
 2   num_children            535 non-null    int64
 3   num_vehicles            535 non-null    int64
 4   spend                   535 non-null    int64
 5   gender_F                535 non-null    int64
 6   marital_status_married  535 non-null    int64
 7   home_ownership_rent     535 non-null    int64
 8   home_ownership_unknown  535 non-null    int64
dtypes: int64(9)
memory usage: 37.7 KB


In [16]:
results2 = ols('spend ~ age + household_income + num_children + num_vehicles + gender_F + marital_status_married + home_ownership_rent + home_ownership_unknown',
              data=new_cust).fit()

In [17]:
results2.summary()

0,1,2,3
Dep. Variable:,spend,R-squared:,0.361
Model:,OLS,Adj. R-squared:,0.351
Method:,Least Squares,F-statistic:,37.16
Date:,"Tue, 19 Sep 2023",Prob (F-statistic):,1.0099999999999999e-46
Time:,20:29:27,Log-Likelihood:,-4584.1
No. Observations:,535,AIC:,9186.0
Df Residuals:,526,BIC:,9225.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3027.0266,317.760,9.526,0.000,2402.793,3651.260
age,-7.5419,5.255,-1.435,0.152,-17.864,2.780
household_income,0.0024,0.001,4.788,0.000,0.001,0.003
num_children,-48.7476,34.147,-1.428,0.154,-115.829,18.334
num_vehicles,-46.2889,32.849,-1.409,0.159,-110.819,18.242
gender_F,1028.5860,123.125,8.354,0.000,786.708,1270.464
marital_status_married,865.5002,134.854,6.418,0.000,600.582,1130.418
home_ownership_rent,-1332.7653,131.167,-10.161,0.000,-1590.440,-1075.090
home_ownership_unknown,-1237.5127,191.662,-6.457,0.000,-1614.030,-860.996

0,1,2,3
Omnibus:,32.458,Durbin-Watson:,1.997
Prob(Omnibus):,0.0,Jarque-Bera (JB):,16.469
Skew:,-0.243,Prob(JB):,0.000265
Kurtosis:,2.292,Cond. No.,1880000.0


In [18]:
results3 = ols('spend ~ household_income + gender_F + marital_status_married + home_ownership_rent + home_ownership_unknown',
              data=new_cust).fit()
results3.summary()

0,1,2,3
Dep. Variable:,spend,R-squared:,0.354
Model:,OLS,Adj. R-squared:,0.348
Method:,Least Squares,F-statistic:,57.91
Date:,"Tue, 19 Sep 2023",Prob (F-statistic):,4.9499999999999994e-48
Time:,20:36:34,Log-Likelihood:,-4587.2
No. Observations:,535,AIC:,9186.0
Df Residuals:,529,BIC:,9212.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2495.2765,206.862,12.063,0.000,2088.905,2901.648
household_income,0.0024,0.001,4.789,0.000,0.001,0.003
gender_F,1033.3943,123.295,8.381,0.000,791.187,1275.602
marital_status_married,869.3411,134.826,6.448,0.000,604.482,1134.200
home_ownership_rent,-1301.8235,130.378,-9.985,0.000,-1557.946,-1045.702
home_ownership_unknown,-1259.6140,191.662,-6.572,0.000,-1636.127,-883.101

0,1,2,3
Omnibus:,30.556,Durbin-Watson:,1.981
Prob(Omnibus):,0.0,Jarque-Bera (JB):,15.519
Skew:,-0.228,Prob(JB):,0.000427
Kurtosis:,2.302,Cond. No.,1340000.0


## Using `sklearn`

In [19]:
# Create the X and y
y = cust.spend

X = cust.drop('spend', axis=1)

In [20]:
y.shape

(535,)

In [21]:
X.shape

(535, 8)

In [22]:
reg = LinearRegression()
reg.fit(X, y)

In [23]:
reg.intercept_

1789.5139309983888

In [24]:
reg.coef_

array([-7.54193621e+00,  2.41083822e-03, -4.87475694e+01, -4.62888662e+01,
        1.02858605e+03,  8.65500154e+02,  1.23751266e+03, -9.52526197e+01])

In [26]:
newX = cust.sample(7)

In [29]:
newX = newX.drop('spend', axis=1)
newX

Unnamed: 0,age,household_income,num_children,num_vehicles,gender_F,marital_status_married,home_ownership_own,home_ownership_rent
176,41,452363,1,1,0,1,1,0
510,34,344550,3,0,1,1,0,1
120,24,475719,1,4,1,0,1,0
441,20,282202,3,3,1,0,1,0
459,46,482085,2,3,1,1,0,0
373,34,432297,2,0,1,1,1,0
93,49,342954,4,3,1,1,1,0


In [30]:
reg.predict(newX)

array([4578.84493356, 4016.33328455, 4787.5846834 , 4300.00797507,
       4262.53827558, 5609.38995299, 5044.50765295])