#### Importing Libraries and Reading Processed Data

In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SelectKBest
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn import preprocessing
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
import re
import warnings
warnings.filterwarnings("ignore")
import time
import os
import joblib

Load Config

In [None]:
import yaml

# Parse the YAML configuration into `config`; later cells look up directory
# and file names in it. The path is relative to the current working directory.
with open('../config/config.yaml') as config_stream:
    config = yaml.safe_load(config_stream)

## Load the cleaned Dataset

Get root dir

In [None]:
# Derive ROOT_PATH: the directory two levels above the current working directory.
cwd_path = os.getcwd()

# Split the working directory into its components, discard the last two, and
# join the remainder back together. The explicit os.sep after the first
# component restores the root separator that the split removed.
path_list = cwd_path.split(os.sep)
leading_part, *remaining_parts = path_list
ROOT_PATH = os.path.join(leading_part, os.sep, *remaining_parts[:-2])

Read the clean data

In [None]:
# Build the location of the processed data file from ROOT_PATH and the
# directory / file names stored in the config.
Process_Data_file = os.path.join(ROOT_PATH, config['processed_data_dir'], config['processed_data_file'])
print(Process_Data_file)

# Load the processed data into the `Data` DataFrame.
try:
    Data = pd.read_csv(Process_Data_file)
except FileNotFoundError:
    # Report the missing path, then re-raise. Swallowing the error would leave
    # `Data` undefined and make the following cells fail with an unrelated
    # NameError instead of pointing at the real problem.
    print("file {} does not exist".format(Process_Data_file))
    raise



In [None]:
# Display the DataFrame that was read from the processed data file.
Data

#### Train test split

Separating independent features and target feature

In [None]:
# Feature matrix: every column of Data except the target column.
X = Data.drop(columns=['SalesInThousandDollars'])

In [None]:
# Display the feature matrix (Data without the 'SalesInThousandDollars' column).
X

In [None]:
# Target vector: the 'SalesInThousandDollars' column of Data.
y = Data.loc[:, 'SalesInThousandDollars']

In [None]:
# Display the target values taken from the 'SalesInThousandDollars' column.
y

Splitting the data into train and test sets with a 75% / 25% ratio

In [None]:
# Hold out the final 25% of the rows as the test set. Because shuffle=False
# the rows are split in their existing order, so random_state has no
# influence on which rows land in each partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    shuffle=False,
)

In [None]:
# Print the shape of each partition, in the order:
# train features, train target, test features, test target.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

In [None]:
# Show the dtype of every training feature column.
X_train.dtypes

#### Model Building

XGBoost Model

In [None]:
# Baseline: an XGBoost regressor with only the random seed set explicitly.
regressor = XGBRegressor(random_state=42)
regressor.fit(X_train, y_train)

# fit() hands back the estimator itself, so `model` is the fitted regressor.
model = regressor
y_pred = model.predict(X_test)

In [None]:
# Baseline error on the test set, printed as a percentage.
baseline_mape = mean_absolute_percentage_error(y_test, y_pred)
print(baseline_mape * 100)

Hyper-parameter tuning of the XGBoost model with cross-validation

In [None]:
# Show the parameter settings of `model`.
model.get_params()

In [None]:
# Candidate values for the grid search; each key is an XGBRegressor parameter.
# The grid expands to 864 parameter combinations.
# NOTE(review): scale_pos_weight is documented as a class-balance setting —
# confirm it is meaningful for this regression objective.
param_grid = {
    'n_jobs': [16],
    'learning_rate': [0.1, 0.5],
    'objective': ['reg:squarederror'],
    'max_depth': [5, 10, 15],
    'n_estimators': [100, 200, 500],
    'subsample': [0.2, 0.8, 1.0],
    'gamma': [0.05, 0.5],
    'scale_pos_weight': [0, 1],
    'reg_alpha': [0, 0.5],
    'reg_lambda': [1, 0],
}

In [None]:
# Fresh, unfitted estimator for the search (this rebinds `model`).
model = XGBRegressor(verbosity=1, random_state=42)

# Exhaustive search over param_grid, scored by negated MAPE. No `cv` argument
# is given, so the library's default cross-validation scheme applies.
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='neg_mean_absolute_percentage_error',
)

In [None]:
%%time
best_model = grid_search.fit(X_train, y_train)
print('Optimum parameters', best_model.best_params_)

In [None]:
# Hyper-parameters for the final model. They are hard-coded here, not read
# from the search result.
# NOTE(review): presumably copied from the grid-search output — confirm they
# still match `best_params_` whenever the data or the grid changes.
tuned_params = {
    'gamma': 0.5,
    'learning_rate': 0.1,
    'max_depth': 5,
    'n_estimators': 100,
    'n_jobs': 16,
    'objective': 'reg:squarederror',
    'reg_alpha': 0.5,
    'reg_lambda': 1,
    'scale_pos_weight': 0,
    'subsample': 1.0,
    'random_state': 42,
}
ModelXGBregressor = XGBRegressor(**tuned_params)

In [None]:
# Train the tuned regressor; fit() returns the estimator itself, so
# `TunedXGBmodel` is the fitted ModelXGBregressor.
ModelXGBregressor.fit(X_train, y_train)
TunedXGBmodel = ModelXGBregressor

# Predictions of the tuned model on the held-out test features.
tuned_xgboost_pred = TunedXGBmodel.predict(X_test)

In [None]:
# Test-set error of the tuned model, printed as a percentage.
tuned_mape = mean_absolute_percentage_error(y_test, tuned_xgboost_pred)
print(tuned_mape * 100)

#### Saving the model

In [None]:
# Output path for the trained model, built from ROOT_PATH plus the models
# directory and file name stored in the config.
training_model = os.path.join(
    ROOT_PATH,
    config['models_dir'],
    config['training_model'],
)

In [None]:
# Serialize the tuned model to the `training_model` path with pickle.
import pickle

# Open the file in a context manager so the handle is flushed and closed as
# soon as the dump finishes, even if pickling raises.
with open(training_model, 'wb') as model_file:
    pickle.dump(TunedXGBmodel, model_file)