# El tratamiento de las variables categóricas

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [3]:
# Load the e-commerce expense data; the path is relative to the notebook's
# working directory.
df = pd.read_csv(
    './datasets/ecom-expense/Ecom Expense.csv'
)

In [4]:
df.head(2)

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648


In [5]:
# One-hot encode the two categorical columns. Each call returns a frame with
# one indicator column per distinct value, named '<prefix>_<value>'
# (e.g. 'Gender_Female', 'City_Tier 1').
dummy_city_tier = pd.get_dummies(df['City Tier'], prefix='City')
dummy_gender = pd.get_dummies(df['Gender'], prefix='Gender')

In [6]:
dummy_gender.head()

Unnamed: 0,Gender_Female,Gender_Male
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0


In [7]:
dummy_city_tier.head()

Unnamed: 0,City_Tier 1,City_Tier 2,City_Tier 3
0,1,0,0
1,0,1,0
2,0,1,0
3,1,0,0
4,0,1,0


In [10]:
# Snapshot the column labels of the raw frame as a plain Python list.
column_names = df.columns.tolist()
column_names

['Transaction ID',
 'Age ',
 ' Items ',
 'Monthly Income',
 'Transaction Time',
 'Record',
 'Gender',
 'City Tier',
 'Total Spend']

In [11]:
# Append the gender indicator columns to the raw data (index-aligned join).
# Join `df` directly rather than `df[column_names]`: `column_names` is
# reassigned on the next line to include the dummy columns, so selecting it
# from `df` on a second run of this cell would raise a KeyError.
df_new = df.join(dummy_gender)
# Refresh the label list so it now also contains the gender dummies; the
# following cell uses it to select columns from `df_new`.
column_names = df_new.columns.values.tolist()

In [12]:
df_new.head(2)

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Female,Gender_Male
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0


In [13]:
# Keep the columns listed in `column_names`, then append the city-tier
# indicator columns (index-aligned join).
df_new = df_new.loc[:, column_names].join(dummy_city_tier)

In [14]:
df_new.head(2)

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Female,Gender_Male,City_Tier 1,City_Tier 2,City_Tier 3
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0,1,0,0
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0,0,1,0


In [48]:
# Predictor columns for the linear model, in the order the coefficients are
# reported.
# NOTE(review): every level of both categoricals is included. For each row the
# Gender_* indicators sum to 1, and so do the City_* indicators, so together
# with a fitted intercept these columns are linearly dependent (the
# "dummy-variable trap"). Individual dummy coefficients are then not uniquely
# determined; consider dropping one level per categorical
# (e.g. pd.get_dummies(..., drop_first=True)).
feature_cols = [
    'Monthly Income',
    'Transaction Time',
    'Gender_Female',
    'Gender_Male',
    'City_Tier 1',
    'City_Tier 2',
    'City_Tier 3',
    'Record',
]

In [49]:
# Design matrix (predictors) and response vector for the regression.
y = df_new.loc[:, 'Total Spend']
x = df_new.loc[:, feature_cols]

In [50]:
# Fit a linear regression of y on x. The estimator is constructed with no
# arguments, so the library defaults apply.
lm = LinearRegression()
lm.fit(X=x, y=y)

LinearRegression()

In [51]:
lm.intercept_

-79.41713030137271

In [52]:
lm.coef_

array([ 1.47538980e-01,  1.54946125e-01, -1.31025013e+02,  1.31025013e+02,
        7.67643260e+01,  5.51389743e+01, -1.31903300e+02,  7.72233446e+02])

In [53]:
list(zip(feature_cols, lm.coef_))

[('Monthly Income', 0.1475389804920575),
 ('Transaction Time', 0.15494612549589526),
 ('Gender_Female', -131.02501325554573),
 ('Gender_Male', 131.02501325554587),
 ('City_Tier 1', 76.76432601049483),
 ('City_Tier 2', 55.13897430923282),
 ('City_Tier 3', -131.90330031972766),
 ('Record', 772.2334457445643)]

In [54]:
lm.score(x,y)

0.9179923586131016

### $\mathrm{Total\ Spend} = -79.41713030137271 + 0.1475389804920575 \cdot \mathrm{Monthly\ Income} + 0.15494612549589526 \cdot \mathrm{Transaction\ Time} - 131.02501325554573 \cdot \mathrm{Gender\_Female} + 131.02501325554587 \cdot \mathrm{Gender\_Male} + 76.76432601049483 \cdot \mathrm{City\_Tier\ 1} + 55.13897430923282 \cdot \mathrm{City\_Tier\ 2} - 131.90330031972766 \cdot \mathrm{City\_Tier\ 3} + 772.2334457445643 \cdot \mathrm{Record}$

In [55]:
# Store the model's fitted values next to the observed 'Total Spend'.
# Use the fitted estimator rather than pasting the intercept and coefficients
# in as literals: `lm.predict` evaluates the same linear equation
# (intercept + sum of coef * feature) and stays in sync with the model if the
# data or `feature_cols` change and the model is refit.
df_new['prediction'] = lm.predict(df_new[feature_cols])

In [56]:
df_new.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Female,Gender_Male,City_Tier 1,City_Tier 2,City_Tier 3,prediction
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0,1,0,0,4903.69672
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0,0,1,0,4799.434826
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,0,1,0,1,0,5157.082504
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,1,0,1,0,0,8068.012996
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,1,0,0,1,0,3581.980335


In [61]:
# Sum of squared differences between predicted and observed 'Total Spend'.
SSD = np.sum(np.square(df_new['prediction'] - df_new['Total Spend']))

In [62]:
SSD

1517733985.3408167

In [63]:
# Residual standard error: sqrt(SSD / (n - k - 1)), where n is the number of
# rows in df_new and k is the number of entries in feature_cols.
RSE = np.sqrt(SSD / (df_new.shape[0] - len(feature_cols) - 1))
RSE

803.1318809818166

In [64]:
# Mean of the observed response, used below to express RSE as a fraction.
sales_mean = df_new['Total Spend'].mean()
sales_mean

6163.176415976715

In [65]:
# Residual standard error as a fraction of the mean 'Total Spend'.
error = RSE / sales_mean
error

0.1303113568029416