In [47]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split 

In [48]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from time import sleep
from winsound import Beep
from pathlib import Path
import yaml

In [49]:
def write_yaml(file_path: str, data: object):
    """
    Write data to a YAML file.

    Args:
        file_path (str): Path of the YAML file.
        data (object): Data to write.

    Returns:
        bool: True if the file was written successfully, False otherwise.
    """
    try:
        with open(Path(file_path), "w") as f:
            yaml.dump(data, f)
        print("YAML file write: OK")
        return True
    except (FileNotFoundError, IOError, yaml.YAMLError) as err:
        print(f"Failed to write YAML file {file_path}: {err}")
        return False

In [50]:
def bee(r = 3, f = 2500, d = 1000, p = 1):
    for n in range(0,r):
        Beep(f, d)
        sleep(p)


In [51]:
parameter = {}
score = {}

In [52]:
medium_df = pd.read_csv("../data/processed/medium.csv", index_col="Unnamed: 0")

In [53]:
medium_df.columns

Index(['demography_score', 'type_score', 'score', 'read', 'book_status_score',
       'pending', 'following', 'favorite', 'have', 'abandoned',
       'chapters_count'],
      dtype='object')

In [54]:
#
#
#

In [55]:
X = medium_df.drop("score", axis=1)
y = medium_df["score"]

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [57]:
# Creo el scaler con los datos de train
scal = StandardScaler() # Declaro el scaler
scal.fit(X_train) # Lo "entreno". Calculo su media y std para cada feature
X_train_scal = scal.transform(X_train) # Aplico el scaler y sobreescribo los datos de train

# Aplico el mismo scaler con los datos de test
X_test_scal = scal.transform(X_test)

# Si quiero recuperar la anterior escala
#X_train = scal.inverse_transform(X_train)
#print(X_train[0])

In [58]:
poly_feats = PolynomialFeatures(degree = 2)
poly_feats.fit(X)
X_poly = poly_feats.transform(X)

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

In [59]:
parameter["plr"] = {"poli__degree": [2]}

In [60]:
predictions = lin_reg.predict(X_test)

In [61]:
MAE = mean_absolute_error(y_test, predictions)
MAPE = mean_absolute_percentage_error(y_test, predictions)
MSE = mean_squared_error(y_test, predictions)
RMSE = np.sqrt(mean_squared_error(y_test, predictions))
R2_Train = lin_reg.score(X_train, y_train)
R2_Test = lin_reg.score(X_test, y_test)

In [62]:
score["plr"] = {"MAE": float(MAE), "MAPE": float(MAPE), "MSE": float(MSE), "RMSE": float(RMSE), "R2_Train": float(R2_Train), "R2_Test": float(R2_Test)}

In [63]:
print("MAE:", MAE)
print("MAPE:", MAPE)
print("MSE:", MSE)
print("RMSE:", RMSE)
print("R2_Train:", R2_Train)
print("R2_Test:", R2_Test)

MAE: 2.912053957349369
MAPE: 6026549252084822.0
MSE: 11.974818219210182
RMSE: 3.460465029329177
R2_Train: 0.1926207635809818
R2_Test: 0.20934237548270973


In [64]:
model = RandomForestRegressor(random_state=10)

parameters = {"n_estimators":[150,200],
            "max_depth":[13],
            "min_samples_leaf": [2,3,4],
            "max_features": [6]
}

# "criterion": ['absolute_error'],
# "min_samples_split": [2,5,7],
# "max_features": [6]

dtr_gs = GridSearchCV(model, parameters, cv=5, scoring="neg_mean_absolute_error", n_jobs=-1)

In [65]:
dtr_gs.fit(X_train, y_train)

In [66]:
best_params = {}
[best_params.update({f"regressor__{key}": [value]}) for key, value in dtr_gs.best_params_.items()]
parameter["rfr"] = best_params

In [67]:
feature_importances = dtr_gs.best_estimator_.feature_importances_
feature_importances_dict = dict(zip(X.columns, feature_importances))
feature_importances_sort = sorted(feature_importances_dict.items(), key=lambda x:x[1], reverse=True)

In [68]:
feature_importances_sort

[('read', 0.1838175867972499),
 ('pending', 0.16955054982043563),
 ('book_status_score', 0.1290499559546714),
 ('have', 0.11554290308878244),
 ('favorite', 0.10834207780132785),
 ('chapters_count', 0.09797045095438345),
 ('following', 0.08145287748138036),
 ('abandoned', 0.06628500905118506),
 ('demography_score', 0.030480852859098597),
 ('type_score', 0.017507736191485334)]

In [69]:
predictions = dtr_gs.predict(X_test)

In [70]:
MAE = mean_absolute_error(y_test, predictions)
MAPE = mean_absolute_percentage_error(y_test, predictions)
MSE = mean_squared_error(y_test, predictions)
RMSE = np.sqrt(mean_squared_error(y_test, predictions))
R2_Train = dtr_gs.best_estimator_.score(X_train, y_train)
R2_Test = dtr_gs.best_estimator_.score(X_test, y_test)

In [71]:
score["rfr"] = {"MAE": float(MAE), "MAPE": float(MAPE), "MSE": float(MSE), "RMSE": float(RMSE), "R2_Train": float(R2_Train), "R2_Test": float(R2_Test)}

In [72]:
print("MAE:", MAE)
print("MAPE:", MAPE)
print("MSE:", MSE)
print("RMSE:", RMSE)
print("R2_Train:", R2_Train)
print("R2_Test:", R2_Test)

MAE: 1.9382431358685903
MAPE: 3723503654012027.0
MSE: 7.562777014095037
RMSE: 2.750050365737878
R2_Train: 0.7827805194588969
R2_Test: 0.5006548576139689


In [73]:
model = ElasticNet()

parameters = {"alpha": [0.1, 0.5, 1.0],
            "l1_ratio": [0.3, 0.5, 0.7, 0.9],
            "max_iter": [1000, 2000]
}

dtr_gs = GridSearchCV(model, parameters, cv=5, scoring="neg_mean_absolute_error", n_jobs=-1)

In [74]:
dtr_gs.fit(X_train, y_train)

In [75]:
best_params = {}
[best_params.update({f"regressor__{key}": [value]}) for key, value in dtr_gs.best_params_.items()]
parameter["en"] = best_params

In [76]:
predictions = dtr_gs.predict(X_test)

In [77]:
MAE = mean_absolute_error(y_test, predictions)
MAPE = mean_absolute_percentage_error(y_test, predictions)
MSE = mean_squared_error(y_test, predictions)
RMSE = np.sqrt(mean_squared_error(y_test, predictions))
R2_Train = dtr_gs.best_estimator_.score(X_train, y_train)
R2_Test = dtr_gs.best_estimator_.score(X_test, y_test)

In [78]:
score["en"] = {"MAE": float(MAE), "MAPE": float(MAPE), "MSE": float(MSE), "RMSE": float(RMSE), "R2_Train": float(R2_Train), "R2_Test": float(R2_Test)}

In [79]:
print("MAE:", MAE)
print("MAPE:", MAPE)
print("MSE:", MSE)
print("RMSE:", RMSE)
print("R2_Train:", R2_Train)
print("R2_Test:", R2_Test)

MAE: 2.977799507775082
MAPE: 6243905598264410.0
MSE: 12.11966151984771
RMSE: 3.481330423824735
R2_Train: 0.18657840789451774
R2_Test: 0.19977885160177133


In [80]:
model = SVR()

parameters = {
            "kernel": ["rbf", "poly"],
            "C": [0.5, 1, 2],
            "epsilon": [0.01 ,0.1, 0.3],
}
# ["rbf", "linear", "poly"]
# "C": [0.1],
# "epsilon": [0.01],

dtr_gs = GridSearchCV(model, parameters, cv=5, scoring="neg_mean_absolute_error", n_jobs=-1)

In [81]:
dtr_gs.fit(X_train_scal, y_train)

In [82]:
best_params = {}
[best_params.update({f"regressor__{key}": [value]}) for key, value in dtr_gs.best_params_.items()]
parameter["svr"] = best_params

In [83]:
dtr_gs.best_params_

{'C': 2, 'epsilon': 0.01, 'kernel': 'rbf'}

In [84]:
predictions = dtr_gs.predict(X_test_scal)

In [85]:
MAE = mean_absolute_error(y_test, predictions)
MAPE = mean_absolute_percentage_error(y_test, predictions)
MSE = mean_squared_error(y_test, predictions)
RMSE = np.sqrt(mean_squared_error(y_test, predictions))
R2_Train = dtr_gs.best_estimator_.score(X_train_scal, y_train)
R2_Test = dtr_gs.best_estimator_.score(X_test_scal, y_test)

In [86]:
score["svr"] = {"MAE": float(MAE), "MAPE": float(MAPE), "MSE": float(MSE), "RMSE": float(RMSE), "R2_Train": float(R2_Train), "R2_Test": float(R2_Test)}

In [87]:
print("MAE:", MAE)
print("MAPE:", MAPE)
print("MSE:", MSE)
print("RMSE:", RMSE)
print("R2_Train:", R2_Train)
print("R2_Test:", R2_Test)

MAE: 2.02579360490439
MAPE: 4172736768388032.5
MSE: 10.813288962091443
RMSE: 3.288356574657232
R2_Train: 0.2715105289271481
R2_Test: 0.286034309041047


R2_Train: 0.1265946515779437
R2_Test: 0.10453273455299872

In [88]:
score

{'plr': {'MAE': 2.912053957349369,
  'MAPE': 6026549252084822.0,
  'MSE': 11.974818219210182,
  'RMSE': 3.460465029329177,
  'R2_Train': 0.1926207635809818,
  'R2_Test': 0.20934237548270973},
 'rfr': {'MAE': 1.9382431358685903,
  'MAPE': 3723503654012027.0,
  'MSE': 7.562777014095037,
  'RMSE': 2.750050365737878,
  'R2_Train': 0.7827805194588969,
  'R2_Test': 0.5006548576139689},
 'en': {'MAE': 2.977799507775082,
  'MAPE': 6243905598264410.0,
  'MSE': 12.11966151984771,
  'RMSE': 3.481330423824735,
  'R2_Train': 0.18657840789451774,
  'R2_Test': 0.19977885160177133},
 'svr': {'MAE': 2.02579360490439,
  'MAPE': 4172736768388032.5,
  'MSE': 10.813288962091443,
  'RMSE': 3.288356574657232,
  'R2_Train': 0.2715105289271481,
  'R2_Test': 0.286034309041047}}

In [89]:
parameter

{'plr': {'poli__degree': [2]},
 'rfr': {'regressor__max_depth': [13],
  'regressor__max_features': [6],
  'regressor__min_samples_leaf': [2],
  'regressor__n_estimators': [200]},
 'en': {'regressor__alpha': [0.1],
  'regressor__l1_ratio': [0.3],
  'regressor__max_iter': [1000]},
 'svr': {'regressor__C': [2],
  'regressor__epsilon': [0.01],
  'regressor__kernel': ['rbf']}}

In [90]:
write_yaml("../models/medium/model_config.yaml", parameter)

YAML file write: OK


True

In [91]:
write_yaml("../models/medium/score.yaml", score)

YAML file write: OK


True

In [92]:
bee()