Importamos librerias

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_selector
from sklearn.model_selection import cross_val_score
from sklearn import metrics

Función de One Hot Enconding

In [30]:
# Archivo utilizado para crear la función de One Hot Encoding
############################################################# 

# Función auxiliar: Toma un df de variables categoricas y selección el top_x de categorias
def one_hot_top_x(cat_df, variable,top_x_labels):
    for label in top_x_labels:
        cat_df[str(variable)+'_'+str(label)] = np.where(cat_df[variable]==label,1,0)

# Función One Hot Encoder: Toma un DF y le hace una transformación de One Hot Encoder para con las categorias más repetidas 
def one_hot_encoder(df,top_x):
    # Separo variables categoricas de númericas
    num_cols = df.select_dtypes(include=['float64', 'int']).columns.to_list()
    cat_cols = df.select_dtypes(include=['object', 'category']).columns.to_list()
    
    # Armo un DF solo con variables categoricas
    cat_df = df.drop(num_cols,axis=1)
    # Armo un DF solo con variables categoricas
    num_df = df.drop(cat_cols,axis=1)
   
    # Itero entre las variables categoricas y me quedo con el top 10 variables
    for variable in cat_cols:
        top_10 = [x for x in cat_df[variable].value_counts().sort_values(ascending=False).head(top_x).index]
        one_hot_top_x(cat_df,variable,top_10)
    
    cat_df = cat_df.drop(cat_cols,axis=1)
    df = pd.concat([cat_df,num_df], axis =1)
    
    return df

Importamos datos

In [31]:
# Levantamos los datos procesados
datos_meli = pd.read_excel('data_ready_to_model_api_simple.xlsx')
datos_meli['type'] = 'meli'

datos_kavak = pd.read_excel('base_kavak.xlsx')
datos_kavak['type'] = 'kavak' 

# Edito el motor a número
datos_kavak['Motor'] = datos_kavak['Motor'].astype(str)

# Uno los DF
frames = [datos_meli, datos_kavak]
datos = pd.concat(frames)

# Los paso por One Hot Encoder
datos = one_hot_encoder(datos,100)

# Separo los datos de Meli
datos_meli = datos[(datos['type_meli'] == 1)] 
datos_meli = datos_meli.drop('type_kavak',axis=1)
datos_meli = datos_meli.drop('type_meli',axis=1)

  cat_df[str(variable)+'_'+str(label)] = np.where(cat_df[variable]==label,1,0)


Creamos grupos de Train & Test

In [32]:
x = datos_meli.drop('price',axis=1)

#separte the predicting attribute into Y for model training 
y = datos_meli['price']
y_ln = np.log(datos_meli['price'])

X_train, X_test, y_train, y_test = train_test_split(
                                        x,
                                        y,
                                        train_size   = 0.8,
                                        random_state = 1234,
                                        shuffle      = True
                                    )

#%%
print("Partición de entrenamiento")
print("-----------------------")
print(y_train.describe())

print("Partición de test")
print("-----------------------")
print(y_test.describe())

Partición de entrenamiento
-----------------------
count    4.547000e+03
mean     3.363625e+06
std      1.742930e+06
min      5.500000e+05
25%      2.130000e+06
50%      2.950000e+06
75%      4.200000e+06
max      3.070000e+07
Name: price, dtype: float64
Partición de test
-----------------------
count    1.137000e+03
mean     3.289235e+06
std      1.658179e+06
min      4.336800e+05
25%      2.100000e+06
50%      2.899900e+06
75%      4.250000e+06
max      1.250000e+07
Name: price, dtype: float64


Entrenamos el modelo

In [21]:
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

Miramos resultados

In [22]:
print('Mean Absolute Error:', round(metrics.mean_absolute_error(y_test, y_pred)/1000,2))

# MAE: Mean Absolute Error: 1.03 --> Con outliers
# MAE: Mean Absolute Error: 0.56 --> Sin ourliers
# Mean Absolute Error: 0.48 --> Aplicando LN a Precio (error)
# Mean Absolute Error: 0.86

print('Mean Squared Error:', round(metrics.mean_squared_error(y_test, y_pred)/1000,2))
# MSE: Mean Squared Error: 3,732,143.07
# MSE: Mean Squared Error: 6,599,540.41 --> Con LN  

print('Root Mean Squared Error:', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred))/1000,2))
# Root Mean Squared Error: 1.93
# Root Mean Squared Error: 0.74
# Root Mean Squared Error: 2.57 --> Con LN


# Con Api
# Mean Absolute Error: 0.42
# Mean Squared Error: 405827.91
# Root Mean Squared Error: 0.64


Mean Absolute Error: 417.3
Mean Squared Error: 405827913.16
Root Mean Squared Error: 637.05


Entrenamos un Modelo con Logaritmo y vemos sus resultados

In [33]:
# Separamos en Train & Test

X_train, X_test, y_train_ln, y_test_ln = train_test_split(
                                        x,
                                        y_ln,
                                        train_size   = 0.8,
                                        random_state = 1234,
                                        shuffle      = True
                                    )

#%%
print("Partición de entrenamiento")
print("-----------------------")
print(y_train_ln.describe())

print("Partición de test")
print("-----------------------")
print(y_test_ln.describe())

# Entrenamos el modelo

model = LinearRegression()
model.fit(X_train, y_train_ln)
y_pred_ln = model.predict(X_test)

#  Saco exponencial de los valores de Y
y_test = np.exp(y_test_ln)
y_pred = np.exp(y_pred_ln)

print('Mean Absolute Error:', round(metrics.mean_absolute_error(y_test, y_pred)/1000,2))

print('Mean Squared Error:', round(metrics.mean_squared_error(y_test, y_pred)/1000,2))

print('Root Mean Squared Error:', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred))/1000,2))


# Mean Absolute Error: 331.45
# Mean Squared Error: 274555797.78
# Root Mean Squared Error: 523.98

Partición de entrenamiento
-----------------------
count    4547.000000
mean       14.909771
std         0.487622
min        13.217674
25%        14.571633
50%        14.897316
75%        15.250595
max        17.239773
Name: price, dtype: float64
Partición de test
-----------------------
count    1137.000000
mean       14.886646
std         0.494426
min        12.980062
25%        14.557448
50%        14.880187
75%        15.262430
max        16.341239
Name: price, dtype: float64
Mean Absolute Error: 322.44
Mean Squared Error: 273942342.62
Root Mean Squared Error: 523.4


Medimos los datos de Kavak

In [34]:
####################################################
#### Prueba de datos de Kavak
#%%
# Separo en Y y X
datos_kavak = datos[(datos['type_kavak'] == 1)] 
datos_kavak = datos_kavak.drop('type_kavak',axis=1)
datos_kavak = datos_kavak.drop('type_meli',axis=1)

#separate the other attributes from the predicting attribute
x_kavak = datos_kavak.drop('price',axis=1)

#separte the predicting attribute into Y for model training 
y_kavak = np.log(datos_kavak['price'])

# Predigo las x_kavak
y_pred_kavak_ln = model.predict(x_kavak)

#  Saco exponencial de los valores de Y
y_pred_kavak = np.exp(y_pred_kavak_ln)

print('Mean Absolute Error:', round(metrics.mean_absolute_error(y_kavak, y_pred_kavak)/1000,2))
print('Mean Squared Error:', round(metrics.mean_squared_error(y_kavak, y_pred_kavak)/1000,2))
print('Root Mean Squared Error:', round(np.sqrt(metrics.mean_squared_error(y_kavak, y_pred_kavak))/1000,2))
# Mean Absolute Error: 3030.07
# Mean Squared Error: 10459841498.76
# Root Mean Squared Error: 3234.17

Mean Absolute Error: 3414.87
Mean Squared Error: 13674680874.97
Root Mean Squared Error: 3697.93
