# El tratamiento de las variables categóricas

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [3]:
# Load the e-commerce expense dataset from its CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../datasets/ecom-expense/Ecom Expense.csv')

In [4]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485


In [6]:
# One-hot encode the two categorical columns; the resulting indicator columns
# are named '<prefix>_<category>'.
dummy_gender = pd.get_dummies(data=df['Gender'], prefix='Gender')
dummy_city_tier = pd.get_dummies(data=df['City Tier'], prefix='City')

In [7]:
# Preview the gender indicator columns.
dummy_gender.head(n=5)

Unnamed: 0,Gender_Female,Gender_Male
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0


In [8]:
# Preview the city-tier indicator columns.
dummy_city_tier.head(n=5)

Unnamed: 0,City_Tier 1,City_Tier 2,City_Tier 3
0,1,0,0
1,0,1,0
2,0,1,0
3,1,0,0
4,0,1,0


In [9]:
# Collect the current column labels as a plain Python list and display them.
column_names = df.columns.tolist()
column_names

['Transaction ID',
 'Age ',
 ' Items ',
 'Monthly Income',
 'Transaction Time',
 'Record',
 'Gender',
 'City Tier',
 'Total Spend']

In [10]:
# Append the gender indicator columns to the original columns, then refresh
# the list of column labels so it includes the new ones.
df_new = df.loc[:, column_names].join(other=dummy_gender)
column_names = df_new.columns.tolist()
df_new.head(n=5)

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Female,Gender_Male
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,0,1
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,1,0
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,1,0


In [14]:
# Append the city-tier indicator columns and refresh the column label list.
# NOTE: df_new is rebuilt from itself here, so this cell is not idempotent --
# re-running it alone would attempt to join the City columns a second time.
df_new = df_new.loc[:, column_names].join(other=dummy_city_tier)
column_names = df_new.columns.tolist()
df_new.head(n=5)

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Female,Gender_Male,City_Tier 1,City_Tier 2,City_Tier 3
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0,1,0,0
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0,0,1,0
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,0,1,0,1,0
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,1,0,1,0,0
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,1,0,0,1,0


In [47]:
# Columns that will act as predictor variables.
# NOTE(review): every level of both dummy sets is included (Male + Female,
# Tier 1 + Tier 2 + Tier 3). Each set sums to 1 on every row, so it is
# redundant with the model intercept; consider dropping one level per set.
feature_cols = [
    'Monthly Income',
    'Transaction Time',
    'Gender_Male',
    'Gender_Female',
    'City_Tier 1',
    'City_Tier 2',
    'City_Tier 3',
    'Record',
]

In [48]:
# Display the selected predictor column names.
feature_cols

['Monthly Income',
 'Transaction Time',
 'Gender_Male',
 'Gender_Female',
 'City_Tier 1',
 'City_Tier 2',
 'City_Tier 3',
 'Record']

In [49]:
# Predictor matrix used to build the model.
X = df_new.loc[:, feature_cols]
# Target variable to be predicted.
Y = df_new.loc[:, 'Total Spend']

In [50]:
# Fit an ordinary least squares model; fit() returns the estimator itself,
# so the fitted model is bound to `lm` and then displayed.
lm = LinearRegression().fit(X, Y)
lm

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [51]:
# Show the fitted intercept followed by the coefficient vector, one per line.
print(lm.intercept_, lm.coef_, sep='\n')

-79.41713030137362
[ 1.47538980e-01  1.54946125e-01  1.31025013e+02 -1.31025013e+02
  7.67643260e+01  5.51389743e+01 -1.31903300e+02  7.72233446e+02]


In [52]:
# Pair each feature name with its fitted coefficient.
[(name, coef) for name, coef in zip(feature_cols, lm.coef_)]

[('Monthly Income', 0.14753898049205735),
 ('Transaction Time', 0.1549461254958959),
 ('Gender_Male', 131.02501325554627),
 ('Gender_Female', -131.02501325554616),
 ('City_Tier 1', 76.76432601049508),
 ('City_Tier 2', 55.138974309232616),
 ('City_Tier 3', -131.90330031972772),
 ('Record', 772.233445744565)]

In [53]:
# R^2 of the fitted model, evaluated on the same data it was trained on.
lm.score(X=X, y=Y)

0.9179923586131016

El modelo puede ser escrito como:
    Total_Spend = -79.41713030137362 + 'Monthly Income' * 0.14753898049205735 + 'Transaction Time' * 0.1549461254958959 + 'Gender_Male' * 131.02501325554627 + 'Gender_Female' * (-131.02501325554616) + 'City_Tier 1' * 76.76432601049508 + 'City_Tier 2' * 55.138974309232616 + 'City_Tier 3' * (-131.90330031972772) + 'Record' * 772.233445744565

In [62]:
# Evaluate the fitted linear equation (intercept + sum of coef * feature)
# through the model itself rather than hand-copied numeric literals, so the
# predictions cannot drift out of sync with `lm` when the data or the feature
# list changes and the model is refit.
df_new['prediction'] = lm.predict(df_new[feature_cols])

In [61]:
# Preview the data together with the new 'prediction' column.
df_new.head(n=5)

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Female,Gender_Male,City_Tier 1,City_Tier 2,City_Tier 3,prediction
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0,1,0,0,4903.69672
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0,0,1,0,4799.434826
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,0,1,0,1,0,5157.082504
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,1,0,1,0,0,8068.012996
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,1,0,0,1,0,3581.980335


In [73]:
# Sum of squared differences between predicted and observed 'Total Spend'.
SSD = ((df_new['prediction'] - df_new['Total Spend']) ** 2).sum()
SSD

1517733985.3408165

In [74]:
# Residual standard error: sqrt(SSD / (n - p - 1)), where n is the number of
# rows and p is the number of predictor columns.
RSE = np.sqrt(SSD / (df_new.shape[0] - len(feature_cols) - 1))
RSE

803.1318809818166

In [75]:
# Mean of the observed target ('Total Spend'); the variable name is kept
# because the following cell reads it.
sales_mean = df_new['Total Spend'].mean()
sales_mean

6163.176415976715

In [76]:
# Residual standard error expressed as a fraction of the mean 'Total Spend'.
error = RSE / sales_mean
error

0.1303113568029416