# T2 - Regresiones Lineales con Variables Categóricas

In [901]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm

In [902]:
# Load the "Ecom Expense.csv" dataset.
# NOTE(review): the path below is absolute and machine-specific; adjust it to your local copy.
df = pd.read_csv("/home/omarg/Master/Data Analisys and Machine Learning/Notes/Mineria de datos/ecom-expense/Ecom Expense.csv")
# Some headers in the file carry stray leading/trailing spaces (the coefficient
# listing further down shows 'Age ' and ' Items '), which breaks name-based
# selection such as df["Age"]. Normalise the column labels once, right after loading.
df.columns = df.columns.str.strip()
df

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485
...,...,...,...,...,...,...,...,...,...
2357,TXN2358,50,7,5705,460.157207,3,Male,Tier 2,2909.619546
2358,TXN2359,35,11,11202,851.924751,8,Male,Tier 2,7968.633136
2359,TXN2360,27,5,21335,435.145358,8,Female,Tier 3,8816.406448
2360,TXN2361,45,12,19294,658.439838,7,Female,Tier 1,7915.595856


In [903]:
# Build the one-hot ("dummy") indicator columns for the categorical variable "Gender"
dummy_gender = pd.get_dummies(
    data=df.loc[:, "Gender"],
    prefix="Gender",
    dtype=int,
)
dummy_gender

Unnamed: 0,Gender_Female,Gender_Male
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0
...,...,...
2357,0,1
2358,0,1
2359,1,0
2360,1,0


In [904]:
# Build the one-hot ("dummy") indicator columns for the categorical variable "City Tier"
dummy_citytier = pd.get_dummies(
    data=df.loc[:, "City Tier"],
    prefix="City",
    dtype=int,
)
dummy_citytier

Unnamed: 0,City_Tier 1,City_Tier 2,City_Tier 3
0,1,0,0
1,0,1,0
2,0,1,0
3,1,0,0
4,0,1,0
...,...,...,...
2357,0,1,0
2358,0,1,0
2359,0,0,1
2360,1,0,0


In [905]:
# Swap the categorical columns for their dummy encodings; the target
# "Total Spend" is removed and re-attached so that it ends up as the last column.
df_mod = (
    df
    .drop(columns=["Gender", "City Tier", "Total Spend"])
    .join([dummy_gender, dummy_citytier, df["Total Spend"]])
)
df_mod.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender_Female,Gender_Male,City_Tier 1,City_Tier 2,City_Tier 3,Total Spend
0,TXN001,42,10,7313,627.668127,5,1,0,1,0,0,4198.385084
1,TXN002,24,8,17747,126.904567,3,1,0,0,1,0,4134.976648
2,TXN003,47,11,22845,873.469701,2,0,1,0,1,0,5166.614455
3,TXN004,50,11,18552,380.219428,7,1,0,1,0,0,7784.447676
4,TXN005,60,2,14439,403.374223,2,1,0,0,1,0,3254.160485


In [906]:
# Define the independent variables (X) and the dependent variable (Y).
# Predictors are chosen by excluding the non-feature columns by name rather than
# by hard-coded positions, so the selection does not silently shift if columns
# are reordered. "Y_predicted" is excluded too because a later cell appends it
# to df_mod. Every dummy level is kept here (both genders, all three city tiers).
non_feature_cols = {"Transaction ID", "Record", "Total Spend", "Y_predicted"}
feature_cols = [col for col in df_mod.columns if col not in non_feature_cols]

X = df_mod[feature_cols]
Y = df_mod["Total Spend"]

In [907]:
# Instantiate the linear regression estimator and fit it on (X, Y)
lm = LinearRegression()
lm.fit(X=X, y=Y)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [908]:
# Show the fitted intercept, then the coefficient vector on the following line(s)
print(lm.intercept_, lm.coef_, sep="\n")

3003.658334704925
[  6.81849902  46.54999316   0.15320733   0.1403355  -94.75317836
  94.75317836 111.3407595  -21.93776518 -89.40299432]


In [909]:
# Pair every predictor name with its fitted coefficient
[(name, coef) for name, coef in zip(feature_cols, lm.coef_)]

[('Age ', np.float64(6.818499020772234)),
 (' Items ', np.float64(46.549993156451094)),
 ('Monthly Income', np.float64(0.15320732667127146)),
 ('Transaction Time', np.float64(0.1403355049072499)),
 ('Gender_Female', np.float64(-94.7531783617553)),
 ('Gender_Male', np.float64(94.75317836175525)),
 ('City_Tier 1', np.float64(111.34075949658616)),
 ('City_Tier 2', np.float64(-21.937765180092526)),
 ('City_Tier 3', np.float64(-89.40299431649363))]

In [910]:
# Report R², RMSE and adjusted R², all evaluated on the same data used for fitting
n_obs, n_feat = len(Y), len(feature_cols)
R2 = lm.score(X, Y)
RMSE = np.sqrt(mean_squared_error(Y, lm.predict(X)))
# Adjusted R² penalises R² by the number of predictors relative to the sample size
R2_Adj = 1 - (1 - R2) * (n_obs - 1) / (n_obs - n_feat - 1)
print(f"R²: {R2:.4f}")
print(f"Root Mean Squared Error (RMSE): {RMSE:.4f}")
print(f"Adjusted R²: {R2_Adj:.4f}")

R²: 0.2008
Root Mean Squared Error (RMSE): 2502.4658
Adjusted R²: 0.1977


In [911]:
# Run the fitted model over the observations in "X" to obtain the predictions
y_pred = lm.predict(X=X)

In [912]:
# Attach the predictions to the dataframe as the "Y_predicted" column
df_mod = df_mod.assign(Y_predicted=y_pred)
df_mod.head(n=10)

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender_Female,Gender_Male,City_Tier 1,City_Tier 2,City_Tier 3,Total Spend,Y_predicted
0,TXN001,42,10,7313,627.668127,5,1,0,1,0,0,4198.385084,4980.61211
1,TXN002,24,8,17747,126.904567,3,1,0,0,1,0,4134.976648,6159.790956
2,TXN003,47,11,22845,873.469701,2,0,1,0,1,0,5166.614455,7531.593316
3,TXN004,50,11,18552,380.219428,7,1,0,1,0,0,7784.447676,6768.881401
4,TXN005,60,2,14439,403.374223,2,1,0,0,1,0,3254.160485,5657.945634
5,TXN006,49,6,6282,48.974268,2,0,1,0,1,0,2375.036467,4659.201414
6,TXN007,21,14,7086,961.203768,8,0,1,1,0,0,7494.474559,5225.158789
7,TXN008,58,9,8881,962.25374,10,0,1,0,0,1,10782.94492,5319.104033
8,TXN009,20,6,5635,858.328132,5,0,1,1,0,0,3854.277411,4609.199409
9,TXN010,48,12,20861,43.036737,4,1,0,0,1,0,5346.140262,6974.952886


### Eliminación de Variables Redundantes

In [913]:
# Load the "Ecom Expense.csv" dataset again to start from the raw columns.
# NOTE(review): the path below is absolute and machine-specific; adjust it to your local copy.
df = pd.read_csv("/home/omarg/Master/Data Analisys and Machine Learning/Notes/Mineria de datos/ecom-expense/Ecom Expense.csv")
# Some headers in the file carry stray leading/trailing spaces (e.g. 'Age ',
# ' Items '), which breaks name-based selection such as df["Age"].
# Normalise the column labels once, right after loading.
df.columns = df.columns.str.strip()
df.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485


In [914]:
# Rebuild the "Gender" dummies, this time dropping the first level.
# drop_first=True removes the first category column (here "Gender_Female"),
# which becomes the baseline; this avoids a set of dummies that always sums to 1
# and is therefore redundant with the model intercept.
dummy_gender = pd.get_dummies(df["Gender"], prefix="Gender", dtype=int, drop_first=True)
dummy_gender.head()

Unnamed: 0,Gender_Male
0,0
1,0
2,1
3,0
4,0


In [915]:
# Rebuild the "City Tier" dummies, this time dropping the first level.
# drop_first=True removes the first category column (here "City_Tier 1"),
# which becomes the baseline; the remaining columns are no longer redundant
# with the model intercept.
dummy_citytier = pd.get_dummies(df["City Tier"], prefix="City", dtype=int, drop_first=True)
dummy_citytier.head()

Unnamed: 0,City_Tier 2,City_Tier 3
0,0,0
1,1,0
2,1,0
3,0,0
4,1,0


In [916]:
# Replace the categorical columns with their (reduced) dummy encodings;
# "Total Spend" is dropped and joined back so it stays as the last column.
target_column = df["Total Spend"]
df_mod = (
    df
    .drop(columns=["Gender", "City Tier", "Total Spend"])
    .join([dummy_gender, dummy_citytier, target_column])
)
df_mod.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender_Male,City_Tier 2,City_Tier 3,Total Spend
0,TXN001,42,10,7313,627.668127,5,0,0,0,4198.385084
1,TXN002,24,8,17747,126.904567,3,0,1,0,4134.976648
2,TXN003,47,11,22845,873.469701,2,1,1,0,5166.614455
3,TXN004,50,11,18552,380.219428,7,0,0,0,7784.447676
4,TXN005,60,2,14439,403.374223,2,0,1,0,3254.160485


In [917]:
# Define the independent variables (X) and the dependent variable (Y).
# Predictors are listed explicitly by name instead of by column position, so the
# selection is self-documenting and does not shift if df_mod gains or reorders columns.
feature_cols = ["Monthly Income", "Transaction Time", "Gender_Male", "City_Tier 2", "City_Tier 3"]

X = df_mod[feature_cols]
Y = df_mod["Total Spend"]

In [918]:
# Create a fresh linear regression estimator and fit it on the reduced feature set
lm = LinearRegression()
lm.fit(X=X, y=Y)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [919]:
# Show the fitted intercept, then the coefficient vector on the following line(s)
print(lm.intercept_, lm.coef_, sep="\n")

3681.2348604643867
[ 1.52978246e-01  1.23726086e-01  1.88315598e+02 -1.36342270e+02
 -2.22647485e+02]


In [920]:
# Pair every predictor name with its fitted coefficient
[(name, coef) for name, coef in zip(feature_cols, lm.coef_)]

[('Monthly Income', np.float64(0.15297824609320482)),
 ('Transaction Time', np.float64(0.12372608642590283)),
 ('Gender_Male', np.float64(188.31559766064026)),
 ('City_Tier 2', np.float64(-136.34226961189157)),
 ('City_Tier 3', np.float64(-222.64748519981194))]

In [921]:
# Report R², RMSE and adjusted R², all evaluated on the same data used for fitting
n_obs, n_feat = len(Y), len(feature_cols)
R2 = lm.score(X, Y)
RMSE = np.sqrt(mean_squared_error(Y, lm.predict(X)))
# Adjusted R² penalises R² by the number of predictors relative to the sample size
R2_Adj = 1 - (1 - R2) * (n_obs - 1) / (n_obs - n_feat - 1)
print(f"R²: {R2:.4f}")
print(f"Root Mean Squared Error (RMSE): {RMSE:.4f}")
print(f"Adjusted R²: {R2_Adj:.4f}")


R²: 0.1948
Root Mean Squared Error (RMSE): 2511.8027
Adjusted R²: 0.1931


In [922]:
# Manual RMSE: square root of the mean squared residual (nothing is displayed)
residuals = Y - lm.predict(X)
RMSE = np.sqrt(np.mean(residuals ** 2))

In [923]:
# Run the fitted model over the observations in "X" to obtain the predictions
y_pred = lm.predict(X=X)

In [924]:
# Attach the predictions to the dataframe as the "Y_predicted" column
df_mod = df_mod.assign(Y_predicted=y_pred)
df_mod.head(n=10)

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender_Male,City_Tier 2,City_Tier 3,Total Spend,Y_predicted
0,TXN001,42,10,7313,627.668127,5,0,0,0,4198.385084,4877.623695
1,TXN002,24,8,17747,126.904567,3,0,1,0,4134.976648,6275.49893
2,TXN003,47,11,22845,873.469701,2,1,1,0,5166.614455,7336.067208
3,TXN004,50,11,18552,380.219428,7,0,0,0,7784.447676,6566.330344
4,TXN005,60,2,14439,403.374223,2,0,1,0,3254.160485,5803.6534
5,TXN006,49,6,6282,48.974268,2,1,1,0,2375.036467,4700.276925
6,TXN007,21,14,7086,961.203768,8,1,0,0,7494.474559,5072.48029
7,TXN008,58,9,8881,962.25374,10,1,0,1,10782.94492,5124.558666
8,TXN009,20,6,5635,858.328132,5,1,0,0,3854.277411,4837.780455
9,TXN010,48,12,20861,43.036737,4,0,1,0,5346.140262,6741.49655


### Integración de la variable "Record" al modelo de predicción

In [925]:
# Define the predictor set, now including the "Record" variable.
# Names are listed explicitly instead of using column positions, so the selection
# is readable and does not depend on the current column order of df_mod.
feature_cols = ["Monthly Income", "Transaction Time", "Record", "Gender_Male", "City_Tier 2", "City_Tier 3"]
print(feature_cols)

['Monthly Income', 'Transaction Time', 'Record', 'Gender_Male', 'City_Tier 2', 'City_Tier 3']


In [926]:
# Split the dataframe into the predictor matrix X and the target vector Y
X = df_mod.loc[:, feature_cols]
Y = df_mod.loc[:, "Total Spend"]

In [927]:
# Create a fresh linear regression estimator and fit it with "Record" among the predictors
lm = LinearRegression()
lm.fit(X=X, y=Y)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [928]:
# Show the fitted intercept, then the coefficient vector on the following line(s)
print(lm.intercept_, lm.coef_, sep="\n")

-133.67781754642328
[ 1.47538980e-01  1.54946125e-01  7.72233446e+02  2.62050027e+02
 -2.16253517e+01 -2.08667626e+02]


In [929]:
# Pair every predictor name with its fitted coefficient
[(name, coef) for name, coef in zip(feature_cols, lm.coef_)]

[('Monthly Income', np.float64(0.14753898049205752)),
 ('Transaction Time', np.float64(0.15494612549589748)),
 ('Record', np.float64(772.2334457445639)),
 ('Gender_Male', np.float64(262.05002651109305)),
 ('City_Tier 2', np.float64(-21.625351701262545)),
 ('City_Tier 3', np.float64(-208.6676263302229))]

In [930]:
# Report R², RMSE and adjusted R², all evaluated on the same data used for fitting
n_obs, n_feat = len(Y), len(feature_cols)
R2 = lm.score(X, Y)
# Adjusted R² penalises R² by the number of predictors relative to the sample size
R2_Adj = 1 - (1 - R2) * (n_obs - 1) / (n_obs - n_feat - 1)
RMSE = np.sqrt(mean_squared_error(Y, lm.predict(X)))
print(f"R²: {R2:.4f}")
print(f"Root Mean Squared Error (RMSE): {RMSE:.4f}")
print(f"Adjusted R²: {R2_Adj:.4f}")

R²: 0.9180
Root Mean Squared Error (RMSE): 801.6003
Adjusted R²: 0.9178


In [931]:
# Run the fitted model over the observations in "X" to obtain the predictions
y_pred = lm.predict(X=X)

In [932]:
# Store the new predictions in the "Y_predicted" column of the dataframe
df_mod = df_mod.assign(Y_predicted=y_pred)
df_mod.head(n=10)

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender_Male,City_Tier 2,City_Tier 3,Total Spend,Y_predicted
0,TXN001,42,10,7313,627.668127,5,0,0,0,4198.385084,4903.69672
1,TXN002,24,8,17747,126.904567,3,0,1,0,4134.976648,4799.434826
2,TXN003,47,11,22845,873.469701,2,1,1,0,5166.614455,5157.082504
3,TXN004,50,11,18552,380.219428,7,0,0,0,7784.447676,8068.012996
4,TXN005,60,2,14439,403.374223,2,0,1,0,3254.160485,3581.980335
5,TXN006,49,6,6282,48.974268,2,1,1,0,2375.036467,2585.641997
6,TXN007,21,14,7086,961.203768,8,1,0,0,7494.474559,7500.63579
7,TXN008,58,9,8881,962.25374,10,1,0,1,10782.94492,9101.430215
8,TXN009,20,6,5635,858.328132,5,1,0,0,3854.277411,4953.916211
9,TXN010,48,12,20861,43.036737,4,0,1,0,5346.140262,6018.109661


### 1. ¿Qué pasaría si integramos la variable "Record" al modelo de predicción? ¿Mejoraría con esto la bondad de ajuste del modelo?

Al integrar la variable "Record" al modelo de predicción, las predicciones se volvieron más precisas, como se observa en el valor del RMSE, el cual disminuyó considerablemente: sin la variable "Record" el RMSE era de **2511.8027**, mientras que con ella disminuyó a **801.6003**.
Por ende, podemos decir que sí mejoró la bondad de ajuste del modelo al integrar la variable "Record". Esto podemos verlo con el valor de R², el cual aumentó de **0.1948** a **0.9180** al integrar la variable "Record". De igual manera, el valor de R² ajustado aumentó de **0.1931** a **0.9178**.

### Impacto negativo de variables

In [933]:
# Declare the predictor list (including "Record") explicitly by name and print
# the dtype of each selected column. No data is reloaded here: df_mod from the
# previous section is reused.
feature_cols = ["Monthly Income", "Transaction Time", "Record", "Gender_Male", "City_Tier 2", "City_Tier 3"]
print(feature_cols)
print(df_mod[feature_cols].dtypes)

['Monthly Income', 'Transaction Time', 'Record', 'Gender_Male', 'City_Tier 2', 'City_Tier 3']
Monthly Income        int64
Transaction Time    float64
Record                int64
Gender_Male           int64
City_Tier 2           int64
City_Tier 3           int64
dtype: object


In [934]:
# Split the dataframe into the predictor matrix X and the target vector Y
X = df_mod.loc[:, feature_cols]
Y = df_mod.loc[:, "Total Spend"]

In [935]:
# Fit the regression with statsmodels to obtain a full statistical summary
X = sm.add_constant(X)  # adds the "const" column used as the model intercept
model = sm.OLS(endog=Y, exog=X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:            Total Spend   R-squared:                       0.918
Model:                            OLS   Adj. R-squared:                  0.918
Method:                 Least Squares   F-statistic:                     4394.
Date:                Sat, 11 Oct 2025   Prob (F-statistic):               0.00
Time:                        18:14:53   Log-Likelihood:                -19145.
No. Observations:                2362   AIC:                         3.830e+04
Df Residuals:                    2355   BIC:                         3.834e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const             -133.6778     60.830  

### 2. ¿Existe alguna variable que pueda estar impactando en forma negativa las predicciones de nuestro modelo?

Sí. En este caso, la variable "City_Tier 2" podría estar impactando de forma negativa, ya que no es estadísticamente significativa (su valor p es mayor a 0.05). Por lo tanto, esta variable no aporta información relevante al modelo y podría ser eliminada para simplificarlo y mejorar su precisión.