In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split 

In [2]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from time import sleep
from winsound import Beep
from pathlib import Path
import yaml

In [3]:
def write_yaml(file_path: str, data: object):
    """
    Write data to a YAML file.

    Args:
        file_path (str): Path of the YAML file.
        data (object): Data to write.

    Returns:
        bool: True if the file was written successfully, False otherwise.
    """
    try:
        with open(Path(file_path), "w") as f:
            yaml.dump(data, f)
        print("YAML file write: OK")
        return True
    except (FileNotFoundError, IOError, yaml.YAMLError) as err:
        print(f"Failed to write YAML file {file_path}: {err}")
        return False

In [4]:
def bee(r = 3, f = 2500, d = 1000, p = 1):
    for n in range(0,r):
        Beep(f, d)
        sleep(p)


In [5]:
parameter = {}
score = {}

In [6]:
heavy_df = pd.read_csv("../data/processed/heavy.csv", index_col="Unnamed: 0")

In [7]:
heavy_df.columns

Index(['score', 'read', 'pending', 'following', 'favorite', 'have',
       'abandoned', 'Magia', 'Mecha', 'Demonios', 'Género Bender',
       'Realidad Virtual', 'Drama', 'Niños', 'Guerra', 'Harem', 'Vampiros',
       'Horror', 'Acción', 'Realidad', 'Traps', 'Militar', 'Crimen',
       'Recuentos de la vida', 'Apocalíptico', 'Psicológico', 'Misterio',
       'Musica', 'Extranjero', 'Samurái', 'Girls Love', 'Telenovela',
       'Policiaco', 'Animación', 'Parodia', 'Deporte', 'Supervivencia',
       'Aventura', 'Oeste', 'Superpoderes', 'Ecchi', 'Tragedia', 'Fantasia',
       'Gore', 'Boys Love', 'Reencarnación', 'Sobrenatural', 'Vida Escolar',
       'Historia', 'Romance', 'Ciencia Ficción', 'Thriller', 'Ciberpunk',
       'Artes Marciales', 'Comedia', 'Familia', 'chapters_count', 'type_score',
       'demography_score', 'book_status_score'],
      dtype='object')

In [8]:
#
#
#

In [9]:
X = heavy_df.drop("score", axis=1)
y = heavy_df["score"]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [11]:
# Creo el scaler con los datos de train
scal = StandardScaler() # Declaro el scaler
scal.fit(X_train) # Lo "entreno". Calculo su media y std para cada feature
X_train_scal = scal.transform(X_train) # Aplico el scaler y sobreescribo los datos de train

# Aplico el mismo scaler con los datos de test
X_test_scal = scal.transform(X_test)

# Si quiero recuperar la anterior escala
#X_train = scal.inverse_transform(X_train)
#print(X_train[0])

In [12]:
poly_feats = PolynomialFeatures(degree = 2)
poly_feats.fit(X)
X_poly = poly_feats.transform(X)

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

In [13]:
parameter["plr"] = {"poli__degree": [2]}

In [14]:
predictions = lin_reg.predict(X_test)

In [15]:
MAE = mean_absolute_error(y_test, predictions)
MAPE = mean_absolute_percentage_error(y_test, predictions)
MSE = mean_squared_error(y_test, predictions)
RMSE = np.sqrt(mean_squared_error(y_test, predictions))
R2_Train = lin_reg.score(X_train, y_train)
R2_Test = lin_reg.score(X_test, y_test)

In [16]:
score["plr"] = {"MAE": float(MAE), "MAPE": float(MAPE), "MSE": float(MSE), "RMSE": float(RMSE), "R2_Train": float(R2_Train), "R2_Test": float(R2_Test)}

In [17]:
print("MAE:", MAE)
print("MAPE:", MAPE)
print("MSE:", MSE)
print("RMSE:", RMSE)
print("R2_Train:", R2_Train)
print("R2_Test:", R2_Test)

MAE: 2.8830125049535185
MAPE: 5922824590671005.0
MSE: 11.812982560887304
RMSE: 3.4370019727790826
R2_Train: 0.20923054619125203
R2_Test: 0.2200278485169882


In [18]:
model = RandomForestRegressor(random_state=10)

parameters = {"n_estimators":[150,200],
            "max_depth":[13],
            "min_samples_leaf": [2,3,4],
            
}

# "criterion": ['absolute_error'],
# "min_samples_split": [2,5,7],
# "max_features": [6]

dtr_gs = GridSearchCV(model, parameters, cv=5, scoring="neg_mean_absolute_error", n_jobs=-1)

In [19]:
dtr_gs.fit(X_train, y_train)

In [21]:
dtr_gs.best_params_

{'max_depth': 13, 'min_samples_leaf': 3, 'n_estimators': 200}

In [22]:
best_params = {}
[best_params.update({f"regressor__{key}": [value]}) for key, value in dtr_gs.best_params_.items()]
parameter["rfr"] = best_params

In [23]:
feature_importances = dtr_gs.best_estimator_.feature_importances_
feature_importances_dict = dict(zip(X.columns, feature_importances))
feature_importances_sort = sorted(feature_importances_dict.items(), key=lambda x:x[1], reverse=True)

In [24]:
feature_importances_sort

[('read', 0.20934720542113103),
 ('pending', 0.14474376268226805),
 ('book_status_score', 0.14069030980788055),
 ('have', 0.10155422422354449),
 ('chapters_count', 0.09463015297110763),
 ('favorite', 0.09049291033120749),
 ('abandoned', 0.051778004760106626),
 ('following', 0.04871281578957063),
 ('demography_score', 0.02933266005585864),
 ('type_score', 0.017621900192642384),
 ('Boys Love', 0.006710973167282624),
 ('Comedia', 0.0053323869426777915),
 ('Vida Escolar', 0.005276387842391123),
 ('Romance', 0.004865077264777741),
 ('Drama', 0.004659267410261138),
 ('Recuentos de la vida', 0.004259645047455224),
 ('Ecchi', 0.003452063042210026),
 ('Harem', 0.003264288905673837),
 ('Fantasia', 0.0032500639506123853),
 ('Acción', 0.0031333691806410937),
 ('Sobrenatural', 0.002793194444122379),
 ('Aventura', 0.0024463525211893948),
 ('Girls Love', 0.001855773730495242),
 ('Tragedia', 0.0018438700911224237),
 ('Psicológico', 0.0017505861655575977),
 ('Ciencia Ficción', 0.001646622086374287),
 (

In [25]:
predictions = dtr_gs.predict(X_test)

In [26]:
MAE = mean_absolute_error(y_test, predictions)
MAPE = mean_absolute_percentage_error(y_test, predictions)
MSE = mean_squared_error(y_test, predictions)
RMSE = np.sqrt(mean_squared_error(y_test, predictions))
R2_Train = dtr_gs.best_estimator_.score(X_train, y_train)
R2_Test = dtr_gs.best_estimator_.score(X_test, y_test)

In [27]:
score["rfr"] = {"MAE": float(MAE), "MAPE": float(MAPE), "MSE": float(MSE), "RMSE": float(RMSE), "R2_Train": float(R2_Train), "R2_Test": float(R2_Test)}

In [28]:
print("MAE:", MAE)
print("MAPE:", MAPE)
print("MSE:", MSE)
print("RMSE:", RMSE)
print("R2_Train:", R2_Train)
print("R2_Test:", R2_Test)

MAE: 1.9505762879326645
MAPE: 3743985841244049.0
MSE: 7.622585499354415
RMSE: 2.7609030224465356
R2_Train: 0.7702008559353023
R2_Test: 0.4967059012277001


R2_Train: 0.7458536102091899
R2_Test: 0.4801538640647933

In [29]:
model = ElasticNet()

parameters = {"alpha": [0.1, 0.5, 1.0],
            "l1_ratio": [0.3, 0.5, 0.7, 0.9],
            "max_iter": [1000, 2000]
}

dtr_gs = GridSearchCV(model, parameters, cv=5, scoring="neg_mean_absolute_error", n_jobs=-1)

In [30]:
dtr_gs.fit(X_train, y_train)

In [31]:
best_params = {}
[best_params.update({f"regressor__{key}": [value]}) for key, value in dtr_gs.best_params_.items()]
parameter["en"] = best_params

In [32]:
predictions = dtr_gs.predict(X_test)

In [33]:
MAE = mean_absolute_error(y_test, predictions)
MAPE = mean_absolute_percentage_error(y_test, predictions)
MSE = mean_squared_error(y_test, predictions)
RMSE = np.sqrt(mean_squared_error(y_test, predictions))
R2_Train = dtr_gs.best_estimator_.score(X_train, y_train)
R2_Test = dtr_gs.best_estimator_.score(X_test, y_test)

In [34]:
score["en"] = {"MAE": float(MAE), "MAPE": float(MAPE), "MSE": float(MSE), "RMSE": float(RMSE), "R2_Train": float(R2_Train), "R2_Test": float(R2_Test)}

In [35]:
print("MAE:", MAE)
print("MAPE:", MAPE)
print("MSE:", MSE)
print("RMSE:", RMSE)
print("R2_Train:", R2_Train)
print("R2_Test:", R2_Test)

MAE: 2.9736353787984493
MAPE: 6233272402758712.0
MSE: 12.091615389253688
RMSE: 3.4773000142716604
R2_Train: 0.1907548178871653
R2_Test: 0.20163064480534565


In [36]:
model = SVR()

parameters = {
            "kernel": ["rbf", "poly"],
            "C": [0.5, 1, 2],
            "epsilon": [0.01 ,0.1, 0.3],
}
# ["rbf", "linear", "poly"]
# "C": [0.1],
# "epsilon": [0.01],

dtr_gs = GridSearchCV(model, parameters, cv=5, scoring="neg_mean_absolute_error", n_jobs=-1)

In [37]:
dtr_gs.fit(X_train_scal, y_train)

In [38]:
best_params = {}
[best_params.update({f"regressor__{key}": [value]}) for key, value in dtr_gs.best_params_.items()]
parameter["svr"] = best_params

In [39]:
dtr_gs.best_params_

{'C': 2, 'epsilon': 0.01, 'kernel': 'rbf'}

In [40]:
predictions = dtr_gs.predict(X_test_scal)

In [41]:
MAE = mean_absolute_error(y_test, predictions)
MAPE = mean_absolute_percentage_error(y_test, predictions)
MSE = mean_squared_error(y_test, predictions)
RMSE = np.sqrt(mean_squared_error(y_test, predictions))
R2_Train = dtr_gs.best_estimator_.score(X_train_scal, y_train)
R2_Test = dtr_gs.best_estimator_.score(X_test_scal, y_test)

In [42]:
score["svr"] = {"MAE": float(MAE), "MAPE": float(MAPE), "MSE": float(MSE), "RMSE": float(RMSE), "R2_Train": float(R2_Train), "R2_Test": float(R2_Test)}

In [43]:
print("MAE:", MAE)
print("MAPE:", MAPE)
print("MSE:", MSE)
print("RMSE:", RMSE)
print("R2_Train:", R2_Train)
print("R2_Test:", R2_Test)

MAE: 2.3649876441186275
MAPE: 5512983597127248.0
MSE: 11.497231143479599
RMSE: 3.390756721364657
R2_Train: 0.3244135830508781
R2_Test: 0.24087586984436404


In [44]:
score

{'plr': {'MAE': 2.8830125049535185,
  'MAPE': 5922824590671005.0,
  'MSE': 11.812982560887304,
  'RMSE': 3.4370019727790826,
  'R2_Train': 0.20923054619125203,
  'R2_Test': 0.2200278485169882},
 'rfr': {'MAE': 1.9505762879326645,
  'MAPE': 3743985841244049.0,
  'MSE': 7.622585499354415,
  'RMSE': 2.7609030224465356,
  'R2_Train': 0.7702008559353023,
  'R2_Test': 0.4967059012277001},
 'en': {'MAE': 2.9736353787984493,
  'MAPE': 6233272402758712.0,
  'MSE': 12.091615389253688,
  'RMSE': 3.4773000142716604,
  'R2_Train': 0.1907548178871653,
  'R2_Test': 0.20163064480534565},
 'svr': {'MAE': 2.3649876441186275,
  'MAPE': 5512983597127248.0,
  'MSE': 11.497231143479599,
  'RMSE': 3.390756721364657,
  'R2_Train': 0.3244135830508781,
  'R2_Test': 0.24087586984436404}}

In [45]:
parameter

{'plr': {'poli__degree': [2]},
 'rfr': {'regressor__max_depth': [13],
  'regressor__min_samples_leaf': [3],
  'regressor__n_estimators': [200]},
 'en': {'regressor__alpha': [0.1],
  'regressor__l1_ratio': [0.3],
  'regressor__max_iter': [1000]},
 'svr': {'regressor__C': [2],
  'regressor__epsilon': [0.01],
  'regressor__kernel': ['rbf']}}

In [46]:
write_yaml("../models/heavy/model_config.yaml", parameter)

YAML file write: OK


True

In [47]:
write_yaml("../models/heavy/score.yaml", score)

YAML file write: OK


True

In [48]:
bee()