# In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

# Load data

def load_data(path: str = '/data/data.csv'):
    """
    Read a CSV file from disk and hand it back as a pandas data frame.

    :param       path (optional): str, location of the CSV file to read;
                 defaults to '/data/data.csv'

    :return      pd.DataFrame holding the parsed contents of the file
    """
    # The path is rendered to text before being passed on to pandas.
    csv_location = f'{path}'

    return pd.read_csv(csv_location)


# Create target variable and predictor variables

def create_target_and_predictors(data: pd.DataFrame = None, target: str = 'estimated_stock_percentage'):
    """
    Split a data frame into predictor variables (X) and the target variable (y)
    for training the random forest model.

    The input data frame is not modified; X is a new frame without the target column.

    :param       data: pd.DataFrame, a data frame containing data for training the model.
                 Although the signature defaults to None, a data frame must be supplied.

    :param       target: str (optional), the name of the column to predict

    :return      X: pd.DataFrame, every column of `data` except `target`
                 y: pd.Series, the `target` column

    :raises      ValueError: if `data` is None or `target` is not one of its columns
    """

    # Fail with a clear message instead of an AttributeError on `None.columns`.
    if data is None:
        raise ValueError('No data provided: `data` must be a pandas DataFrame')

    # ValueError is a subclass of Exception, so callers catching Exception still work.
    if target not in data.columns:
        raise ValueError(f'Target: {target} is not present in the data')

    X = data.drop(columns=[target])
    y = data[target]

    return X, y

# Train models with hyperparameter tuning and cross-validation to determine the best hyperparameters

def train_model_cv_hyperparameter_tuning(X: pd.DataFrame = None, y: pd.Series = None):
    """
    Tune a Random Forest Regressor with grid search and 5-fold cross-validation,
    then train the final model with the best hyperparameters found.

    The grid covers n_estimators in [100, 200, 300] and max_depth in [3, 5, 7].
    The best mean cross-validation score and the winning hyperparameters are
    printed, and the winning values are passed to `train_rf`, which fits,
    evaluates and saves the model to be deployed.

    :param       X: pd.DataFrame, predictor variables
    :param       y: pd.Series, target variable

    :return      rf_model, the Random Forest regressor model returned by `train_rf`

    """

    rf = RandomForestRegressor()
    cv = KFold(n_splits=5)
    param_grid = {'n_estimators': [100, 200, 300], 'max_depth': [3, 5, 7]}

    grid_search = GridSearchCV(rf, param_grid=param_grid, cv=cv)
    grid_search.fit(X, y)

    print('MEAN CROSS-VALIDATION SCORE: ', grid_search.best_score_)
    print('BEST HYPERPARAMETERS: ', grid_search.best_params_)
    print('Finished with hyperparamer tuning with cross-validation. Now on to training the model with the best hyperparameters.')
    best_params = grid_search.best_params_
    n_estimators = best_params['n_estimators']
    max_depth = best_params['max_depth']

    # The trainer defined in this file is `train_rf`; the previous call to
    # `train_model` referenced a name that does not exist and raised NameError.
    rf_model = train_rf(X, y, n_estimators, max_depth)

    return rf_model


# Train the model with hyperparameters producing the best performance based on cross-validation results

def train_rf(X, y, n_estimators, max_depth, model_path=None):
    """
    Train a Random Forest Regressor with the given hyperparameters, report its
    mean absolute error on a held-out split, and save the fitted model as a pickle file.

    n_estimators and max_depth are expected to come from the grid search in
    train_model_cv_hyperparameter_tuning, where this function is run.

    The data is split 75% / 25% (random_state=12). The model is fitted on the
    75% training part only, and that fitted model is what gets pickled and returned.

    :param       X: pd.DataFrame, predictor variables
    :param       y: pd.Series, target variable
    :param       n_estimators, a random forest model hyperparameter
    :param       max_depth, a random forest model hyperparameter
    :param       model_path (optional): str, where to write the pickled model.
                 When omitted, the module-level name `rf_model_file` is used.
    :return      rf_model, the fitted model

    """

    rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=12)
    rf_model = rf.fit(X_train, y_train)

    y_pred = rf_model.predict(X_test)
    mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

    print('MEAN ABSOLUTE ERROR OF TRAINED BEST MODEL: ', mae)

    # NOTE(review): `rf_model_file` is not defined in the visible part of this
    # file; it must exist at module level unless `model_path` is passed.
    output_path = rf_model_file if model_path is None else model_path

    # Use a context manager so the file handle is flushed and closed even if
    # pickling fails, instead of leaking the handle returned by a bare open().
    with open(output_path, 'wb') as model_file:
        pickle.dump(rf_model, model_file)

    return rf_model

