In [15]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Location of the e-commerce expense dataset (CSV hosted on GitHub).
DATA_URL = "https://raw.githubusercontent.com/joanby/python-ml-course/master/datasets/ecom-expense/Ecom%20Expense.csv"

# Read the CSV into a DataFrame and preview its first rows.
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485


In [16]:
# One-hot encode the two categorical columns; each (source column, prefix)
# pair yields one frame of indicator columns named "<prefix>_<level>".
dummy_gender, dummy_city_tier = (
    pd.get_dummies(df[source_column], prefix=column_prefix)
    for source_column, column_prefix in (("Gender", "Gender"), ("City Tier", "City"))
)

In [17]:
# Preview the first rows of the gender indicator columns.
dummy_gender.head()

Unnamed: 0,Gender_Female,Gender_Male
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0


In [35]:
# Append both sets of indicator columns to the original frame
# (DataFrame.join aligns the frames on their index).
df_new = df.join(dummy_gender).join(dummy_city_tier)

# List every column now available so predictors can be picked by exact name.
column_names = list(df_new.columns)
column_names

['Transaction ID',
 'Age ',
 ' Items ',
 'Monthly Income',
 'Transaction Time',
 'Record',
 'Gender',
 'City Tier',
 'Total Spend',
 'Gender_Female',
 'Gender_Male',
 'City_Tier 1',
 'City_Tier 2',
 'City_Tier 3']

In [53]:
# Predictor columns for the regression, in the order the coefficients will be reported.
feature_cols = [
    # numeric predictors
    "Monthly Income",
    "Transaction Time",
    # gender indicators
    "Gender_Female",
    "Gender_Male",
    # city-tier indicators
    "City_Tier 1",
    "City_Tier 2",
    "City_Tier 3",
    # remaining numeric predictors
    "Record",
    "Age ",  # the trailing space is part of the column name as listed in column_names
]


In [54]:
# Display the selected predictor names to confirm the list.
feature_cols

['Monthly Income',
 'Transaction Time',
 'Gender_Female',
 'Gender_Male',
 'City_Tier 1',
 'City_Tier 2',
 'City_Tier 3',
 'Record',
 'Age ']

In [55]:
# Response: the amount spent per transaction.
Y = df_new["Total Spend"]
# Predictors: the chosen feature columns, kept in feature_cols order.
X = df_new.loc[:, feature_cols]

In [56]:
# Create the linear regression estimator with its default settings.
lm = LinearRegression()
# Fit Y on X; as the cell's last expression, the fit(...) call's return value is what gets displayed.
lm.fit(X, Y)

LinearRegression()

In [57]:
# Print the fitted intercept, then the coefficient array (ordered like feature_cols), one per line.
print(lm.intercept_, lm.coef_, sep="\n")

-335.7380017453788
[ 1.47442269e-01  1.56391583e-01 -1.33088707e+02  1.33088707e+02
  7.83785050e+01  5.20259633e+01 -1.30404468e+02  7.72149205e+02
  6.42429817e+00]


In [58]:
# Pair every predictor name with its fitted coefficient for easier reading.
[(feature_name, coefficient) for feature_name, coefficient in zip(feature_cols, lm.coef_)]

[('Monthly Income', 0.1474422689744859),
 ('Transaction Time', 0.15639158306366224),
 ('Gender_Female', -133.0887066317063),
 ('Gender_Male', 133.08870663170597),
 ('City_Tier 1', 78.37850497640328),
 ('City_Tier 2', 52.02596334431915),
 ('City_Tier 3', -130.40446832072257),
 ('Record', 772.1492053631356),
 ('Age ', 6.4242981676130455)]

In [60]:
# Coefficient of determination (R^2) of the fitted model.
# Note: X and Y are the same data the model was fitted on, so this is an in-sample score.
lm.score(X,Y)

0.9187458997709432

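**RSE (Residual Standard Error)** is an estimate of the standard deviation of the residuals, i.e. of the differences between the observed and the predicted values of the dependent variable in a regression model. It is computed as $RSE = \sqrt{SSD / (n - k - 1)}$, where $SSD$ is the sum of squared residuals, $n$ is the number of observations and $k$ is the number of predictors. The smaller the RSE is relative to the mean of the dependent variable, the better the model fits the data.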

In [61]:
# Store the model's fitted value for every row alongside the observed data.
df_new["prediction"] = lm.predict(df_new[feature_cols])

## Suma de cuadrados:

1. Suma de cuadrados totales (SST): la suma de las diferencias al cuadrado entre cada observación ($y_i$) y la media de la variable de respuesta ($\bar{y}$).

$$SST = \sum_{i} (y_i - \bar{y})^2$$


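SSD (suma de los cuadrados de las diferencias): es el mismo valor que el SSE del punto 3; en el código se guarda en la variable `SSD`.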

2. Suma de cuadrados de la regresión (SSR): la suma de las diferencias al cuadrado entre los valores predichos ($\hat{y}_i$) y la media de la variable de respuesta ($\bar{y}$).

$$SSR = \sum_{i} (\hat{y}_i - \bar{y})^2$$


3. Suma de cuadrados del error (SSE): la suma de las diferencias al cuadrado entre los valores predichos ($\hat{y}_i$) y los valores observados ($y_i$).

$$SSE = \sum_{i} (\hat{y}_i - y_i)^2$$

### SST = SSR + SSE



In [63]:
# Preview the first rows, now including the indicator columns and the "prediction" column.
df_new.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Female,Gender_Male,City_Tier 1,City_Tier 2,City_Tier 3,prediction
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0,1,0,0,4916.525671
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0,0,1,0,4690.334781
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,0,1,0,1,0,5200.539037
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,1,0,1,0,0,8130.623235
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,1,0,0,1,0,3704.958811


In [74]:
# Sum of squared differences between the fitted values and the observed Total Spend.
SSD = df_new["prediction"].sub(df_new["Total Spend"]).pow(2).sum()

In [75]:
# Show the sum of squared differences between predictions and observed Total Spend.
SSD

1503788028.5340147

In [76]:
# Residual degrees of freedom: number of rows minus number of predictors minus 1.
# NOTE(review): feature_cols lists every gender and city-tier indicator, so the
# predictors look linearly dependent and this count may overstate the number of
# free parameters - confirm whether that is intended.
residual_dof = df_new.shape[0] - len(feature_cols) - 1

# Residual standard error: square root of SSD per residual degree of freedom.
RSE = np.sqrt(SSD / residual_dof)

In [77]:
# Show the residual standard error.
RSE

799.6034382056597

In [78]:
# Mean of the observed Total Spend, used as the scale to compare the RSE against.
sales_mean = df_new["Total Spend"].mean()

In [79]:
# Show the mean of the observed Total Spend.
sales_mean

6163.176415976714

In [80]:
# Relative error: the RSE expressed as a fraction of the mean Total Spend.
error = RSE/sales_mean

In [81]:
# Very small (RSE as a fraction of the mean Total Spend).
error

0.12973885286373746

## Delete redundant dummies

In [83]:
# The coefficients (alphas) of each variable will change

In [85]:
# drop_first=True leaves out the indicator column of the first gender level,
# keeping k-1 indicator columns for k levels.
dummy_gender = pd.get_dummies(df["Gender"], prefix="Gender", drop_first=True)

In [86]:
# Preview the first rows of the reduced gender indicator frame.
dummy_gender.head()

Unnamed: 0,Gender_Male
0,0
1,0
2,1
3,0
4,0


In [89]:
# Encode the city tier with k-1 indicator columns: drop_first=True leaves out
# the column of the first tier level.
dummy_city_tier = pd.get_dummies(df["City Tier"], prefix="City", drop_first=True)
dummy_city_tier.head()

Unnamed: 0,City_Tier 2,City_Tier 3
0,0,0
1,1,0
2,1,0
3,0,0
4,1,0


In [94]:
# Rebuild df_new from the original frame, first adding the reduced gender indicators.
df_new = df.join(dummy_gender)
# Column list as it stands after the gender join (before the city-tier indicators are added).
column_names = df_new.columns.tolist()
# Then add the reduced city-tier indicators.
df_new = df_new.join(dummy_city_tier)