In [166]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split 

In [167]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from time import sleep
from winsound import Beep
from pathlib import Path
import yaml

In [168]:
def write_yaml(file_path: str, data: object):
    """
    Write data to a YAML file.

    Args:
        file_path (str): Path of the YAML file.
        data (object): Data to write.

    Returns:
        bool: True if the file was written successfully, False otherwise.
    """
    try:
        with open(Path(file_path), "w") as f:
            yaml.dump(data, f)
        print("YAML file write: OK")
        return True
    except (FileNotFoundError, IOError, yaml.YAMLError) as err:
        print(f"Failed to write YAML file {file_path}: {err}")
        return False

In [169]:
def bee(r = 3, f = 2500, d = 1000, p = 1):
    for n in range(0,r):
        Beep(f, d)
        sleep(p)


In [170]:
parameter = {}
score = {}

In [171]:
heavy_df = pd.read_csv("../data/processed/heavy.csv", index_col="Unnamed: 0")

In [172]:
heavy_df.columns

Index(['score', 'read', 'pending', 'following', 'favorite', 'have',
       'abandoned', 'chapters_count', 'Magia_score', 'Mecha_score',
       'Demonios_score', 'Género Bender_score', 'Realidad Virtual_score',
       'Drama_score', 'Niños_score', 'Guerra_score', 'Harem_score',
       'Vampiros_score', 'Horror_score', 'Acción_score', 'Realidad_score',
       'Traps_score', 'Militar_score', 'Crimen_score',
       'Recuentos de la vida_score', 'Apocalíptico_score', 'Psicológico_score',
       'Misterio_score', 'Musica_score', 'Extranjero_score', 'Samurái_score',
       'Girls Love_score', 'Telenovela_score', 'Policiaco_score',
       'Animación_score', 'Parodia_score', 'Deporte_score',
       'Supervivencia_score', 'Aventura_score', 'Oeste_score',
       'Superpoderes_score', 'Ecchi_score', 'Tragedia_score', 'Fantasia_score',
       'Gore_score', 'Boys Love_score', 'Reencarnación_score',
       'Sobrenatural_score', 'Vida Escolar_score', 'Historia_score',
       'Romance_score', 'Ciencia 

In [173]:
#
#
#

In [174]:
X = heavy_df.drop("score", axis=1)
y = heavy_df["score"]

In [175]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [176]:
# Creo el scaler con los datos de train
scal = StandardScaler() # Declaro el scaler
scal.fit(X_train) # Lo "entreno". Calculo su media y std para cada feature
X_train_scal = scal.transform(X_train) # Aplico el scaler y sobreescribo los datos de train

# Aplico el mismo scaler con los datos de test
X_test_scal = scal.transform(X_test)

# Si quiero recuperar la anterior escala
#X_train = scal.inverse_transform(X_train)
#print(X_train[0])

In [177]:
poly_feats = PolynomialFeatures(degree = 2)
poly_feats.fit(X)
X_poly = poly_feats.transform(X)

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

In [178]:
parameter["plr"] = {"poli__degree": [2]}

In [179]:
predictions = lin_reg.predict(X_test)

In [180]:
MAE = mean_absolute_error(y_test, predictions)
MAPE = mean_absolute_percentage_error(y_test, predictions)
MSE = mean_squared_error(y_test, predictions)
RMSE = np.sqrt(mean_squared_error(y_test, predictions))
R2_Train = lin_reg.score(X_train, y_train)
R2_Test = lin_reg.score(X_test, y_test)

In [181]:
score["plr"] = {"MAE": float(MAE), "MAPE": float(MAPE), "MSE": float(MSE), "RMSE": float(RMSE), "R2_Train": float(R2_Train), "R2_Test": float(R2_Test)}

In [182]:
print("MAE:", MAE)
print("MAPE:", MAPE)
print("MSE:", MSE)
print("RMSE:", RMSE)
print("R2_Train:", R2_Train)
print("R2_Test:", R2_Test)

MAE: 1.6264203088429108
MAPE: 3241180710587341.5
MSE: 4.441959060468379
RMSE: 2.1075955637807695
R2_Train: 0.7037222469074746
R2_Test: 0.7067121408725128


In [183]:
len(X_train.columns)

60

In [184]:
model = RandomForestRegressor(random_state=10)

parameters = {"n_estimators":[150,200],
            "max_depth":[20,21,22],
            "min_samples_leaf": [1,2,3],
            "max_features": [58,59,60]
}

# "criterion": ['absolute_error'],
# "min_samples_split": [2,5,7],
# "max_features": [6]

dtr_gs = GridSearchCV(model, parameters, cv=5, scoring="neg_mean_absolute_error", n_jobs=-1)

In [185]:
dtr_gs.fit(X_train, y_train)

In [186]:
dtr_gs.best_params_

{'max_depth': 21,
 'max_features': 60,
 'min_samples_leaf': 1,
 'n_estimators': 150}

'max_depth': 21,
 'max_features': 60,
 'min_samples_leaf': 1,
 'n_estimators': 150

In [187]:
best_params = {}
[best_params.update({f"regressor__{key}": [value]}) for key, value in dtr_gs.best_params_.items()]
parameter["rfr"] = best_params

In [188]:
feature_importances = dtr_gs.best_estimator_.feature_importances_
feature_importances_dict = dict(zip(X.columns, feature_importances))
feature_importances_sort = sorted(feature_importances_dict.items(), key=lambda x:x[1], reverse=True)

In [189]:
feature_importances_sort

[('tdg', 0.6250947915871793),
 ('pending', 0.0907002346519572),
 ('type_score', 0.0450274136510725),
 ('read', 0.03166637440404028),
 ('favorite', 0.026034749353240554),
 ('chapters_count', 0.022339008995303996),
 ('Vida Escolar_score', 0.019962191786438262),
 ('book_status_score', 0.01941602598877113),
 ('have', 0.014885559061074725),
 ('following', 0.014839904821373786),
 ('demography_score', 0.014217291892502093),
 ('abandoned', 0.010971386893193293),
 ('Acción_score', 0.006400333274423379),
 ('Aventura_score', 0.003399567040925213),
 ('Demonios_score', 0.003262877534876931),
 ('Sobrenatural_score', 0.003121806256740565),
 ('Fantasia_score', 0.0026797664029933935),
 ('Recuentos de la vida_score', 0.0023816256122150417),
 ('Boys Love_score', 0.0023763107528578563),
 ('Crimen_score', 0.0022716657157703835),
 ('Deporte_score', 0.002262467994271685),
 ('Romance_score', 0.0021328929363714014),
 ('Apocalíptico_score', 0.0021247150239417753),
 ('Comedia_score', 0.002055856124020214),
 ('Su

In [190]:
predictions = dtr_gs.predict(X_test)

In [191]:
MAE = mean_absolute_error(y_test, predictions)
MAPE = mean_absolute_percentage_error(y_test, predictions)
MSE = mean_squared_error(y_test, predictions)
RMSE = np.sqrt(mean_squared_error(y_test, predictions))
R2_Train = dtr_gs.best_estimator_.score(X_train, y_train)
R2_Test = dtr_gs.best_estimator_.score(X_test, y_test)

In [192]:
score["rfr"] = {"MAE": float(MAE), "MAPE": float(MAPE), "MSE": float(MSE), "RMSE": float(RMSE), "R2_Train": float(R2_Train), "R2_Test": float(R2_Test)}

In [193]:
print("MAE:", MAE)
print("MAPE:", MAPE)
print("MSE:", MSE)
print("RMSE:", RMSE)
print("R2_Train:", R2_Train)
print("R2_Test:", R2_Test)

MAE: 0.751342174053227
MAPE: 1314926994145115.2
MSE: 1.9080463851664147
RMSE: 1.3813205222418201
R2_Train: 0.9797271006145702
R2_Test: 0.8740180105661777


MAE: 0.7529307423506498
MAPE: 1312641837967386.2
MSE: 1.9046829397667226
RMSE: 1.380102510600833
R2_Train: 0.9788168360190393
R2_Test: 0.8742400877368898

In [194]:
model = ElasticNet()

parameters = {"alpha": [0.1, 0.5, 1.0],
            "l1_ratio": [0.3, 0.5, 0.7, 0.9],
            "max_iter": [1000, 2000]
}

dtr_gs = GridSearchCV(model, parameters, cv=5, scoring="neg_mean_absolute_error", n_jobs=-1)

In [195]:
dtr_gs.fit(X_train, y_train)

In [196]:
best_params = {}
[best_params.update({f"regressor__{key}": [value]}) for key, value in dtr_gs.best_params_.items()]
parameter["en"] = best_params

In [197]:
predictions = dtr_gs.predict(X_test)

In [198]:
MAE = mean_absolute_error(y_test, predictions)
MAPE = mean_absolute_percentage_error(y_test, predictions)
MSE = mean_squared_error(y_test, predictions)
RMSE = np.sqrt(mean_squared_error(y_test, predictions))
R2_Train = dtr_gs.best_estimator_.score(X_train, y_train)
R2_Test = dtr_gs.best_estimator_.score(X_test, y_test)

In [199]:
score["en"] = {"MAE": float(MAE), "MAPE": float(MAPE), "MSE": float(MSE), "RMSE": float(RMSE), "R2_Train": float(R2_Train), "R2_Test": float(R2_Test)}

In [200]:
print("MAE:", MAE)
print("MAPE:", MAPE)
print("MSE:", MSE)
print("RMSE:", RMSE)
print("R2_Train:", R2_Train)
print("R2_Test:", R2_Test)

MAE: 1.9736310535663755
MAPE: 4220003495667564.5
MSE: 5.984538432553491
RMSE: 2.4463316276730533
R2_Train: 0.602765646466361
R2_Test: 0.6048607290485233


In [201]:
model = SVR()

parameters = {
            "kernel": ["rbf", "poly"],
            "C": [0.5, 1, 2],
            "epsilon": [0.01 ,0.1, 0.3],
}
# ["rbf", "linear", "poly"]
# "C": [0.1],
# "epsilon": [0.01],

dtr_gs = GridSearchCV(model, parameters, cv=5, scoring="neg_mean_absolute_error", n_jobs=-1)

In [202]:
dtr_gs.fit(X_train_scal, y_train)

In [203]:
best_params = {}
[best_params.update({f"regressor__{key}": [value]}) for key, value in dtr_gs.best_params_.items()]
parameter["svr"] = best_params

In [204]:
dtr_gs.best_params_

{'C': 2, 'epsilon': 0.01, 'kernel': 'rbf'}

In [205]:
predictions = dtr_gs.predict(X_test_scal)

In [206]:
MAE = mean_absolute_error(y_test, predictions)
MAPE = mean_absolute_percentage_error(y_test, predictions)
MSE = mean_squared_error(y_test, predictions)
RMSE = np.sqrt(mean_squared_error(y_test, predictions))
R2_Train = dtr_gs.best_estimator_.score(X_train_scal, y_train)
R2_Test = dtr_gs.best_estimator_.score(X_test_scal, y_test)

In [207]:
score["svr"] = {"MAE": float(MAE), "MAPE": float(MAPE), "MSE": float(MSE), "RMSE": float(RMSE), "R2_Train": float(R2_Train), "R2_Test": float(R2_Test)}

In [208]:
print("MAE:", MAE)
print("MAPE:", MAPE)
print("MSE:", MSE)
print("RMSE:", RMSE)
print("R2_Train:", R2_Train)
print("R2_Test:", R2_Test)

MAE: 1.2523361553789056
MAPE: 3449866583858207.0
MSE: 4.464253488724695
RMSE: 2.1128780108479273
R2_Train: 0.7632441876465479
R2_Test: 0.7052401135429595


In [209]:
score

{'plr': {'MAE': 1.6264203088429108,
  'MAPE': 3241180710587341.5,
  'MSE': 4.441959060468379,
  'RMSE': 2.1075955637807695,
  'R2_Train': 0.7037222469074746,
  'R2_Test': 0.7067121408725128},
 'rfr': {'MAE': 0.751342174053227,
  'MAPE': 1314926994145115.2,
  'MSE': 1.9080463851664147,
  'RMSE': 1.3813205222418201,
  'R2_Train': 0.9797271006145702,
  'R2_Test': 0.8740180105661777},
 'en': {'MAE': 1.9736310535663755,
  'MAPE': 4220003495667564.5,
  'MSE': 5.984538432553491,
  'RMSE': 2.4463316276730533,
  'R2_Train': 0.602765646466361,
  'R2_Test': 0.6048607290485233},
 'svr': {'MAE': 1.2523361553789056,
  'MAPE': 3449866583858207.0,
  'MSE': 4.464253488724695,
  'RMSE': 2.1128780108479273,
  'R2_Train': 0.7632441876465479,
  'R2_Test': 0.7052401135429595}}

In [210]:
parameter

{'plr': {'poli__degree': [2]},
 'rfr': {'regressor__max_depth': [21],
  'regressor__max_features': [60],
  'regressor__min_samples_leaf': [1],
  'regressor__n_estimators': [150]},
 'en': {'regressor__alpha': [0.1],
  'regressor__l1_ratio': [0.9],
  'regressor__max_iter': [1000]},
 'svr': {'regressor__C': [2],
  'regressor__epsilon': [0.01],
  'regressor__kernel': ['rbf']}}

In [211]:
write_yaml("../models/heavy/model_config.yaml", parameter)

YAML file write: OK


True

In [212]:
write_yaml("../models/heavy/score.yaml", score)

YAML file write: OK


True

In [213]:
bee()