# Modeling

## Modules

In [1]:
# Set Global Variables
BASE_PATH='C:\\Users\\jonmc\\Documents\\git\\jonmccallum-okc-datascientist\\'

# Import Modules
import sys
import warnings
import datetime
from math import sqrt
from datetime import datetime

sys.path.insert(0, BASE_PATH + 'notebook\\resources\\')
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

from process import *
from models import *

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn import pipeline
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score

from joblib import dump

# Analysis Configs

#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_columns', None)
#pd.set_option('display.width', None)
#pd.set_option('display.max_colwidth', None)

In [2]:
today = datetime.today().strftime('%Y-%m-%d_%I-%M_%p')

## Functions

In [3]:
# Function for saving off pipline object (model)
def saveModel(model, prefix):
    
    modelPath = BASE_PATH + '\\model\\'
    dump(model, modelPath + prefix + '.pipe')

    
# Function for logging experiments    
def writeExperiment(date, summary):
    
    print('Logging experiment...')
    text_file = open(BASE_PATH + f'notebook\\main\\modeling\\experiment_logs\\{today}_experiment.txt', 'a')
    n = text_file.write(summary)
    text_file.close()
    print('DONE!!!\n')

    
def mean_absolute_percentage_error(y_true, y_pred):
    
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    
    
# Function for grid searching over a list of models with different parameters
def hypertuneGridSearch(scaler, experimentData, modelDefinitions, numIterations, crossValidation, scoringMethod, RANDOM_STATE):
    
    bestModelsList = []
    
    for item in range(len(modelDefinitions)):
        
        modelDef = modelDefinitions[item]
        
        print(f'Beginning grid search for {modelDef[0]} model definition...')
        
        models = RandomizedSearchCV(estimator=modelDef[1]['estimator']
                                    , param_distributions=modelDef[1]['modelParams']
                                    , n_iter=numIterations
                                    , cv=crossValidation
                                    , scoring=scoringMethod
                                    , verbose=2
                                    , random_state=RANDOM_STATE
                                    , n_jobs=-1
        
                 )
        
        models.fit(experimentData['feature_train'], experimentData['target_train'])
        
        modelBest = models.best_estimator_
        modelBestParams = models.best_params_
        modelBestScore = models.best_score_
        
        bestModelsList.append(modelBest)
                
        trainMAPE = mean_absolute_percentage_error(experimentData['target_train'], modelBest.predict(experimentData['feature_train']))
        testMAPE = mean_absolute_percentage_error(experimentData['target_test'], modelBest.predict(experimentData['feature_test']))
        rSquared = r2_score(experimentData['target_test'], modelBest.predict(experimentData['feature_test']))
        
        experimentResults = ('-- MODEL RESULTS -- \n'
                             f'Tuned Model: {modelDef[0]}\n'
                             f'Best Parameters: {modelBestParams}\n'
                             f'Best Cross Validation Score: {modelBestScore}\n'
                             f'MAPE on Train Set: {trainMAPE}\n'
                             f'MAPE on Test Set: {testMAPE}\n'
                             f'R2 on Test Set: {rSquared}\n\n')
        
        print(experimentResults)
        
        # Log experiment results
        writeExperiment(today, experimentResults)
        
        modelPipe = make_pipeline(scaler, modelBest)
        modelPipe.fit(experimentData['original_features'], experimentData['original_target'])
        saveModel(modelPipe, modelPipe.steps[1][0])
        
        return bestModelsList
        
        
        

## Read Prepped Sample

In [4]:
print(getDir())

regressData = readCSV(BASE_PATH + 'data\\processed\\challenge1_processed.csv')

C:\Users\jonmc\Documents\git\jonmccallum-okc-datascientist\notebook\main\modeling


## Split Data For Training / Testing & Target / Features

In [5]:
features = regressData.drop(columns=['price'], axis=1)

target = regressData[['price']]

In [6]:
# Set holdout set
SPLIT_SIZE = 0.25

X_train, X_test, y_train, y_test = train_test_split(
                                                     features
                                                     , target
                                                     , random_state=RANDOM_STATE
                                                     , test_size=SPLIT_SIZE

                                    )


X_train_chkpt = X_train
X_test_chkpt = X_test

experimentData = {'original_features': features
                  , 'original_target': target
                  , 'feature_train': scaleDataFrame(X_train)
                  , 'feature_test': scaleDataFrame(X_test)
                  , 'target_train': y_train.to_numpy()
                  , 'target_test': y_test.to_numpy()}


scaler = StandardScaler(with_mean=True, with_std=True)

## Create Regression Model
- Using this as a guiding light for grid search models.

In [7]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(experimentData['feature_train'], experimentData['target_train'])

pred = reg.predict(experimentData['feature_test'])
predDF = pd.DataFrame(pred, columns=["Prediction"])

predDF = pd.concat([predDF , y_test])
predDF.head(5)


trainMAPE = mean_absolute_percentage_error(experimentData['target_train']
                                           , reg.predict(experimentData['feature_train']))

testMAPE = mean_absolute_percentage_error(experimentData['target_test']
                                          , reg.predict(experimentData['feature_test']))

rSquared = r2_score(experimentData['target_test']
                    , reg.predict(experimentData['feature_test']))

experimentResults = ('-- MODEL RESULTS -- \n'
                     f'Tuned Model: linear-regression\n'
                     f'MAPE on Train Set: {trainMAPE}\n'
                     f'MAPE on Test Set: {testMAPE}\n'
                     f'R2 on Test Set: {rSquared}\n\n')

print(experimentResults)

-- MODEL RESULTS -- 
Tuned Model: linear-regression
MAPE on Train Set: 60.736924754969735
MAPE on Test Set: 62.84170578802427
R2 on Test Set: 0.08940417634034514




## Run Tuning

In [11]:
bestModels = hypertuneGridSearch(scaler
                                 , experimentData
                                 , modelDefinitions
                                 , 500
                                 , 3
                                 , 'neg_root_mean_squared_error'
                                 , RANDOM_STATE)

Beginning grid search for elastic_net model definition...
Fitting 3 folds for each of 500 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.0s


-- MODEL RESULTS -- 
Tuned Model: elastic_net
Best Parameters: {'max_iter': 5000, 'l1_ratio': 0.001, 'alpha': 0.5}
Best Cross Validation Score: -278.58345853372293
MAPE on Train Set: 63.14739905210415
MAPE on Test Set: 64.29219490715899
R2 on Test Set: 0.05944764739444541


Logging experiment...
DONE!!!



[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed:    8.5s finished
