In [47]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split 

In [48]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from time import sleep
from winsound import Beep
from pathlib import Path
import yaml

In [49]:
def write_yaml(file_path: str, data: object):
    """
    Write data to a YAML file.

    Args:
        file_path (str): Path of the YAML file.
        data (object): Data to write.

    Returns:
        bool: True if the file was written successfully, False otherwise.
    """
    try:
        with open(Path(file_path), "w") as f:
            yaml.dump(data, f)
        print("YAML file write: OK")
        return True
    except (FileNotFoundError, IOError, yaml.YAMLError) as err:
        print(f"Failed to write YAML file {file_path}: {err}")
        return False

In [50]:
def bee(r = 3, f = 2500, d = 1000, p = 1):
    for n in range(0,r):
        Beep(f, d)
        sleep(p)


In [51]:
parameter = {}
score = {}

In [52]:
light_df = pd.read_csv("../data/processed/light.csv", index_col="Unnamed: 0")

In [53]:
light_df.columns

Index(['score', 'read', 'pending', 'following', 'favorite', 'have',
       'abandoned'],
      dtype='object')

In [54]:
#
#
#

In [55]:
X = light_df.drop("score", axis=1)
y = light_df["score"]

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [57]:
# Creo el scaler con los datos de train
scal = StandardScaler() # Declaro el scaler
scal.fit(X_train) # Lo "entreno". Calculo su media y std para cada feature
X_train_scal = scal.transform(X_train) # Aplico el scaler y sobreescribo los datos de train

# Aplico el mismo scaler con los datos de test
X_test_scal = scal.transform(X_test)

# Si quiero recuperar la anterior escala
#X_train = scal.inverse_transform(X_train)
#print(X_train[0])

In [58]:
poly_feats = PolynomialFeatures(degree = 2)
poly_feats.fit(X)
X_poly = poly_feats.transform(X)

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

In [59]:
parameter["plr"] = {"poli__degree": 2}

In [60]:
predictions = lin_reg.predict(X_test)

In [61]:
MAE = mean_absolute_error(y_test, predictions)
MAPE = mean_absolute_percentage_error(y_test, predictions)
MSE = mean_squared_error(y_test, predictions)
RMSE = np.sqrt(mean_squared_error(y_test, predictions))
R2_Train = lin_reg.score(X_train, y_train)
R2_Test = lin_reg.score(X_test, y_test)

In [62]:
score["plr"] = {"MAE": float(MAE), "MAPE": float(MAPE), "MSE": float(MSE), "RMSE": float(RMSE), "R2_Train": float(R2_Train), "R2_Test": float(R2_Test)}

In [63]:
print("MAE:", MAE)
print("MAPE:", MAPE)
print("MSE:", MSE)
print("RMSE:", RMSE)
print("R2_Train:", R2_Train)
print("R2_Test:", R2_Test)

MAE: 3.246706943595827
MAPE: 7004095373389747.0
MSE: 13.799166071534625
RMSE: 3.7147228795072484
R2_Train: 0.09087004581369929
R2_Test: 0.0888867232292091


R2_Train: 0.09087004581369906
R2_Test: 0.0888867232292091

In [64]:
model = RandomForestRegressor(random_state=10)

parameters = {"n_estimators":[150,200],
            "max_depth":[13],
            "min_samples_leaf": [2,3,4],
            "max_features": [6]
}

# "criterion": ['absolute_error'],
# "min_samples_split": [2,5,7],

dtr_gs = GridSearchCV(model, parameters, cv=5, scoring="neg_mean_absolute_error", n_jobs=-1)

In [65]:
dtr_gs.fit(X_train, y_train)

In [66]:
best_params = {}
[best_params.update({f"regressor__{key}": value}) for key, value in dtr_gs.best_params_.items()]
parameter["rfr"] = best_params

In [67]:
feature_importances = dtr_gs.best_estimator_.feature_importances_
feature_importances_dict = dict(zip(X.columns, feature_importances))
feature_importances_sort = sorted(feature_importances_dict.items(), key=lambda x:x[1], reverse=True)

In [68]:
feature_importances_sort

[('read', 0.3206341519710045),
 ('pending', 0.2104954006991117),
 ('have', 0.17143599168985102),
 ('favorite', 0.15412935629268562),
 ('abandoned', 0.07765032216647938),
 ('following', 0.06565477718086776)]

In [23]:
predictions = dtr_gs.predict(X_test)

In [24]:
MAE = mean_absolute_error(y_test, predictions)
MAPE = mean_absolute_percentage_error(y_test, predictions)
MSE = mean_squared_error(y_test, predictions)
RMSE = np.sqrt(mean_squared_error(y_test, predictions))
R2_Train = dtr_gs.best_estimator_.score(X_train, y_train)
R2_Test = dtr_gs.best_estimator_.score(X_test, y_test)

In [25]:
score["rfr"] = {"MAE": float(MAE), "MAPE": float(MAPE), "MSE": float(MSE), "RMSE": float(RMSE), "R2_Train": float(R2_Train), "R2_Test": float(R2_Test)}

In [26]:
print("MAE:", MAE)
print("MAPE:", MAPE)
print("MSE:", MSE)
print("RMSE:", RMSE)
print("R2_Train:", R2_Train)
print("R2_Test:", R2_Test)

MAE: 1.9697344492988944
MAPE: 3765549091930910.0
MSE: 7.873361350932763
RMSE: 2.8059510599675046
R2_Train: 0.7458270335035304
R2_Test: 0.48014800151969805


R2_Train: 0.7458536102091899
R2_Test: 0.4801538640647933

In [27]:
model = ElasticNet()

parameters = {"alpha": [0.1, 0.5, 1.0],
            "l1_ratio": [0.3, 0.5, 0.7, 0.9],
            "max_iter": [1000, 2000]
}

dtr_gs = GridSearchCV(model, parameters, cv=5, scoring="neg_mean_absolute_error", n_jobs=-1)

In [28]:
dtr_gs.fit(X_train, y_train)

In [29]:
best_params = {}
[best_params.update({f"regressor__{key}": value}) for key, value in dtr_gs.best_params_.items()]
parameter["en"] = best_params

In [30]:
predictions = dtr_gs.predict(X_test)

In [31]:
MAE = mean_absolute_error(y_test, predictions)
MAPE = mean_absolute_percentage_error(y_test, predictions)
MSE = mean_squared_error(y_test, predictions)
RMSE = np.sqrt(mean_squared_error(y_test, predictions))
R2_Train = dtr_gs.best_estimator_.score(X_train, y_train)
R2_Test = dtr_gs.best_estimator_.score(X_test, y_test)

In [32]:
score["en"] = {"MAE": float(MAE), "MAPE": float(MAPE), "MSE": float(MSE), "RMSE": float(RMSE), "R2_Train": float(R2_Train), "R2_Test": float(R2_Test)}

In [33]:
print("MAE:", MAE)
print("MAPE:", MAPE)
print("MSE:", MSE)
print("RMSE:", RMSE)
print("R2_Train:", R2_Train)
print("R2_Test:", R2_Test)

MAE: 3.246595346289047
MAPE: 7004934840557768.0
MSE: 13.798854211874849
RMSE: 3.71468090310256
R2_Train: 0.09086831420192742
R2_Test: 0.08890731429065712


R2_Train: 0.07813377866282745
R2_Test: 0.07713350615140602

In [34]:
model = SVR()

parameters = {
            "kernel": ["rbf", "poly"],
            "C": [0.5, 1, 2],
            "epsilon": [0.01 ,0.1, 0.3],
}
# ["rbf", "linear", "poly"]
# "C": [0.1],
# "epsilon": [0.01],

dtr_gs = GridSearchCV(model, parameters, cv=5, scoring="neg_mean_absolute_error", n_jobs=-1)

In [35]:
dtr_gs.fit(X_train_scal, y_train)

In [36]:
best_params = {}
[best_params.update({f"regressor__{key}": value}) for key, value in dtr_gs.best_params_.items()]
parameter["svr"] = best_params

In [37]:
dtr_gs.best_params_

{'C': 2, 'epsilon': 0.3, 'kernel': 'rbf'}

In [38]:
predictions = dtr_gs.predict(X_test_scal)

In [39]:
MAE = mean_absolute_error(y_test, predictions)
MAPE = mean_absolute_percentage_error(y_test, predictions)
MSE = mean_squared_error(y_test, predictions)
RMSE = np.sqrt(mean_squared_error(y_test, predictions))
R2_Train = dtr_gs.best_estimator_.score(X_train_scal, y_train)
R2_Test = dtr_gs.best_estimator_.score(X_test_scal, y_test)

In [40]:
score["svr"] = {"MAE": float(MAE), "MAPE": float(MAPE), "MSE": float(MSE), "RMSE": float(RMSE), "R2_Train": float(R2_Train), "R2_Test": float(R2_Test)}

In [41]:
print("MAE:", MAE)
print("MAPE:", MAPE)
print("MSE:", MSE)
print("RMSE:", RMSE)
print("R2_Train:", R2_Train)
print("R2_Test:", R2_Test)

MAE: 2.6165454518650915
MAPE: 7929876991934449.0
MSE: 13.562201125332443
RMSE: 3.6826893875715943
R2_Train: 0.1265946515779437
R2_Test: 0.10453273455299872


R2_Train: 0.1265946515779437
R2_Test: 0.10453273455299872

In [42]:
score

{'plr': {'MAE': 3.246706943595827,
  'MAPE': 7004095373389747.0,
  'MSE': 13.799166071534625,
  'RMSE': 3.7147228795072484,
  'R2_Train': 0.09087004581369929,
  'R2_Test': 0.0888867232292091},
 'rfr': {'MAE': 1.9697344492988944,
  'MAPE': 3765549091930910.0,
  'MSE': 7.873361350932763,
  'RMSE': 2.8059510599675046,
  'R2_Train': 0.7458270335035304,
  'R2_Test': 0.48014800151969805},
 'en': {'MAE': 3.246595346289047,
  'MAPE': 7004934840557768.0,
  'MSE': 13.798854211874849,
  'RMSE': 3.71468090310256,
  'R2_Train': 0.09086831420192742,
  'R2_Test': 0.08890731429065712},
 'svr': {'MAE': 2.6165454518650915,
  'MAPE': 7929876991934449.0,
  'MSE': 13.562201125332443,
  'RMSE': 3.6826893875715943,
  'R2_Train': 0.1265946515779437,
  'R2_Test': 0.10453273455299872}}

In [43]:
parameter

{'plr': {'poli__degree': 2},
 'rfr': {'regressor__max_depth': 13,
  'regressor__max_features': 6,
  'regressor__min_samples_leaf': 3,
  'regressor__n_estimators': 200},
 'en': {'regressor__alpha': 1.0,
  'regressor__l1_ratio': 0.9,
  'regressor__max_iter': 1000},
 'svr': {'regressor__C': 2,
  'regressor__epsilon': 0.3,
  'regressor__kernel': 'rbf'}}

In [44]:
write_yaml("../models/light/model_config.yaml", parameter)

YAML file write: OK


True

In [45]:
write_yaml("../models/light/score.yaml", score)

YAML file write: OK


True

In [46]:
bee()