In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

In [None]:
# Importing data frames.
# All three inputs live in the same project folder, so the folder is spelled
# out once and the file names are appended to it.
DATA_DIR = "C:\\Users\\mawal\\OneDrive - Binghamton University\\Desktop\\My Papers\\Papers\\Forecasting_2020\\"

df = pd.read_csv(DATA_DIR + "Pres_2.csv")  # Forecasting variables
df_berry = pd.read_csv(DATA_DIR + "stateideology_v2018_2.csv")  # Berry's ideology scores
df_pop = pd.read_csv(DATA_DIR + "population_data.csv")  # State population data

In [None]:
# Fold the Berry ideology scores and then the population table into the
# electoral data. Outer joins keep rows that appear in only one table.
for extra_table in (df_berry, df_pop):
    df = df.merge(extra_table, how='outer')

In [None]:
# Adding regional, moralistic, and progressive history controls.
# .copy() makes df_new an independent frame. Without it df_new is a filtered
# slice of df, and the column assignments below trigger pandas'
# SettingWithCopyWarning (and are not guaranteed to behave consistently).
df_new = df[df['year'] > 1950].copy()

# Every indicator starts at 0. The list order is the order in which the
# columns are appended to the frame.
control_columns = [
    # four-region indicators
    'region_ne', 'region_mw', 'region_s', 'region_w',
    # nine-region indicators
    'solidsouth', 'external', 'mountain', 'pacific', 'newengland',
    'middleatlantic', 'eastnorth', 'westnorth', 'border',
    # remaining controls
    'prog', 'trad', 'moral', 'ind',
]
for control in control_columns:
    df_new[control] = 0

# Sorted array of the distinct state names present after the year filter.
states = np.unique(df_new['statename'])


In [None]:
# Four-region indicators: each indicator column is set to 1 for the rows whose
# statename is in that column's list.
region_members = {
    'region_s': [
        'Alabama', 'Arkansas', 'Delaware', 'Dist. of Col.', 'Florida',
        'Georgia', 'Kentucky', 'Louisiana', 'Maryland', 'Mississippi',
        'Missouri', 'North Carolina', 'Oklahoma', 'South Carolina',
        'Tennessee', 'Texas', 'Virginia', 'West Virginia',
    ],
    'region_ne': [
        'Connecticut', 'Maine', 'Massachusetts', 'New Hampshire',
        'New Jersey', 'New York', 'Pennsylvania', 'Rhode Island', 'Vermont',
    ],
    'region_mw': [
        'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Michigan', 'Minnesota',
        'Nebraska', 'North Dakota', 'Ohio', 'South Dakota', 'Wisconsin',
    ],
    'region_w': [
        'Alaska', 'Arizona', 'California', 'Colorado', 'Hawaii', 'Montana',
        'Idaho', 'Nevada', 'New Mexico', 'Oregon', 'Utah', 'Washington',
        'Wyoming',
    ],
}

for region_column, member_states in region_members.items():
    in_region = df_new['statename'].isin(member_states)
    df_new.loc[in_region, region_column] = 1

In [None]:
#Extending regional controls to 9 regions instead of 4.
# df_new.loc[df_new['statename'] == 'Alabama', 'solidsouth'] = 1
# df_new.loc[df_new['statename'] == 'Arkansas', 'solidsouth'] = 1
# df_new.loc[df_new['statename'] == 'Delaware', 'middleatlantic'] = 1
# df_new.loc[df_new['statename'] == 'Dist. of Col.', 'border'] = 1
# df_new.loc[df_new['statename'] == 'Florida', 'solidsouth'] = 1
# df_new.loc[df_new['statename'] == 'Georgia', 'solidsouth'] = 1
# df_new.loc[df_new['statename'] == 'Kentucky', 'border'] = 1
# df_new.loc[df_new['statename'] == 'Louisiana', 'solidsouth'] = 1
# df_new.loc[df_new['statename'] == 'Maryland', 'border'] = 1
# df_new.loc[df_new['statename'] == 'Mississippi', 'solidsouth'] = 1
# df_new.loc[df_new['statename'] == 'Missouri', 'westnorth'] = 1
# df_new.loc[df_new['statename'] == 'North Carolina', 'solidsouth'] = 1
# df_new.loc[df_new['statename'] == 'Oklahoma', 'border'] = 1
# df_new.loc[df_new['statename'] == 'South Carolina', 'solidsouth'] = 1
# df_new.loc[df_new['statename'] == 'Tennessee', 'border'] = 1
# df_new.loc[df_new['statename'] == 'Texas', 'solidsouth'] = 1
# df_new.loc[df_new['statename'] == 'Virginia', 'solidsouth'] = 1
# df_new.loc[df_new['statename'] == 'West Virginia', 'border'] = 1

# df_new.loc[df_new['statename'] == 'Connecticut', 'newengland'] = 1
# df_new.loc[df_new['statename'] == 'Maine', 'newengland'] = 1
# df_new.loc[df_new['statename'] == 'Massachusetts', 'newengland'] = 1
# df_new.loc[df_new['statename'] == 'New Hampshire', 'newengland'] = 1
# df_new.loc[df_new['statename'] == 'New Jersey', 'middleatlantic'] = 1
# df_new.loc[df_new['statename'] == 'New York', 'middleatlantic'] = 1
# df_new.loc[df_new['statename'] == 'Pennsylvania', 'middleatlantic'] = 1
# df_new.loc[df_new['statename'] == 'Rhode Island', 'newengland'] = 1
# df_new.loc[df_new['statename'] == 'Vermont', 'newengland'] = 1

# df_new.loc[df_new['statename'] == 'Illinois', 'eastnorth'] = 1
# df_new.loc[df_new['statename'] == 'Indiana', 'eastnorth'] = 1
# df_new.loc[df_new['statename'] == 'Iowa', 'westnorth'] = 1
# df_new.loc[df_new['statename'] == 'Kansas', 'westnorth'] = 1
# df_new.loc[df_new['statename'] == 'Michigan', 'eastnorth'] = 1
# df_new.loc[df_new['statename'] == 'Minnesota', 'westnorth'] = 1
# df_new.loc[df_new['statename'] == 'Nebraska', 'westnorth'] = 1
# df_new.loc[df_new['statename'] == 'North Dakota', 'westnorth'] = 1
# df_new.loc[df_new['statename'] == 'Ohio', 'eastnorth'] = 1
# df_new.loc[df_new['statename'] == 'South Dakota', 'westnorth'] = 1
# df_new.loc[df_new['statename'] == 'Wisconsin', 'eastnorth'] = 1

# df_new.loc[df_new['statename'] == 'Alaska', 'external'] = 1
# df_new.loc[df_new['statename'] == 'Arizona', 'mountain'] = 1
# df_new.loc[df_new['statename'] == 'California', 'pacific'] = 1
# df_new.loc[df_new['statename'] == 'Colorado', 'mountain'] = 1
# df_new.loc[df_new['statename'] == 'Hawaii', 'external'] = 1
# df_new.loc[df_new['statename'] == 'Montana', 'mountain'] = 1
# df_new.loc[df_new['statename'] == 'Idaho', 'mountain'] = 1
# df_new.loc[df_new['statename'] == 'Nevada', 'mountain'] = 1
# df_new.loc[df_new['statename'] == 'New Mexico', 'mountain'] = 1
# df_new.loc[df_new['statename'] == 'Oregon', 'pacific'] = 1
# df_new.loc[df_new['statename'] == 'Utah', 'mountain'] = 1
# df_new.loc[df_new['statename'] == 'Washington', 'pacific'] = 1
# df_new.loc[df_new['statename'] == 'Wyoming', 'mountain'] = 1

In [None]:
# Renumber the rows 0 .. n-1 (the year filter left gaps in the index),
# then display the new index.
row_count = len(df_new)
df_new.index = np.arange(row_count)
df_new.index

In [None]:
# Display the (rows, columns) size of the working frame.
df_new.shape

In [None]:
# #If we want to add state fixed effects
# for i in range(len(df)):
#     if df2.loc[i]["statename"]== 'Alabama':
#         df2.iloc[[i],[31]] = 1
#     elif df2.loc[i]["statename"]== 'Alaska':
#         df2.iloc[[i],[32]] = 1
#     elif df2.loc[i]["statename"]== 'Arizona':
#         df2.iloc[[i],[33]] = 1
#     elif df2.loc[i]["statename"]== 'Arkansas':
#         df2.iloc[[i],[34]] = 1
#     elif df2.loc[i]["statename"]== 'California':
#         df2.iloc[[i],[35]] = 1
#     elif df2.loc[i]["statename"]== 'Colorado':
#         df2.iloc[[i],[36]] = 1
#     elif df2.loc[i]["statename"]== 'Connecticut':
#         df2.iloc[[i],[37]] = 1
#     elif df2.loc[i]["statename"]== 'Delaware':
#         df2.iloc[[i],[38]] = 1
#     elif df2.loc[i]["statename"]== 'Florida':
#         df2.iloc[[i],[39]] = 1
#     elif df2.loc[i]["statename"]== 'Georgia':
#         df2.iloc[[i],[40]] = 1
#     elif df2.loc[i]["statename"]== 'Hawaii':
#         df2.iloc[[i],[41]] = 1
#     elif df2.loc[i]["statename"]== 'Idaho':
#         df2.iloc[[i],[42]] = 1
#     elif df2.loc[i]["statename"]== 'Illinois':
#         df2.iloc[[i],[43]] = 1
#     elif df2.loc[i]["statename"]== 'Indiana':
#         df2.iloc[[i],[44]] = 1
#     elif df2.loc[i]["statename"]== 'Iowa':
#         df2.iloc[[i],[45]] = 1
#     elif df2.loc[i]["statename"]== 'Kansas':
#         df2.iloc[[i],[46]] = 1
#     elif df2.loc[i]["statename"]== 'Kentucky':
#         df2.iloc[[i],[47]] = 1
#     elif df2.loc[i]["statename"]== 'Louisiana':
#         df2.iloc[[i],[48]] = 1
#     elif df2.loc[i]["statename"]== 'Maine':
#         df2.iloc[[i],[49]] = 1
#     elif df2.loc[i]["statename"]== 'Maryland':
#         df2.iloc[[i],[50]] = 1
#     elif df2.loc[i]["statename"]== 'Massachusetts':
#         df2.iloc[[i],[51]] = 1
#     elif df2.loc[i]["statename"]== 'Michigan':
#         df2.iloc[[i],[52]] = 1
#     elif df2.loc[i]["statename"]== 'Minnesota':
#         df2.iloc[[i],[53]] = 1
#     elif df2.loc[i]["statename"]== 'Mississippi':
#         df2.iloc[[i],[54]] = 1
#     elif df2.loc[i]["statename"]== 'Missouri':
#         df2.iloc[[i],[55]] = 1
#     elif df2.loc[i]["statename"]== 'Montana':
#         df2.iloc[[i],[56]] = 1
#     elif df2.loc[i]["statename"]== 'Nebraska':
#         df2.iloc[[i],[57]] = 1
#     elif df2.loc[i]["statename"]== 'Nevada':
#         df2.iloc[[i],[58]] = 1
#     elif df2.loc[i]["statename"]== 'New Hampshire':
#         df2.iloc[[i],[59]] = 1
#     elif df2.loc[i]["statename"]== 'New Jersey':
#         df2.iloc[[i],[60]] = 1
#     elif df2.loc[i]["statename"]== 'New Mexico':
#         df2.iloc[[i],[61]] = 1
#     elif df2.loc[i]["statename"]== 'New York':
#         df2.iloc[[i],[62]] = 1
#     elif df2.loc[i]["statename"]== 'North Carolina':
#         df2.iloc[[i],[63]] = 1
#     elif df2.loc[i]["statename"]== 'North Dakota':
#         df2.iloc[[i],[64]] = 1
#     elif df2.loc[i]["statename"]== 'Ohio':
#         df2.iloc[[i],[65]] = 1
#     elif df2.loc[i]["statename"]== 'Oklahoma':
#         df2.iloc[[i],[66]] = 1
#     elif df2.loc[i]["statename"]== 'Oregon':
#         df2.iloc[[i],[67]] = 1
#     elif df2.loc[i]["statename"]== 'Pennsylvania':
#         df2.iloc[[i],[68]] = 1
#     elif df2.loc[i]["statename"]== 'Rhode Island':
#         df2.iloc[[i],[69]] = 1
#     elif df2.loc[i]["statename"]== 'South Carolina':
#         df2.iloc[[i],[70]] = 1
#     elif df2.loc[i]["statename"]== 'South Dakota':
#         df2.iloc[[i],[71]] = 1
#     elif df2.loc[i]["statename"]== 'Tennessee':
#         df2.iloc[[i],[72]] = 1
#     elif df2.loc[i]["statename"]== 'Texas':
#         df2.iloc[[i],[73]] = 1
#     elif df2.loc[i]["statename"]== 'Utah':
#         df2.iloc[[i],[74]] = 1
#     elif df2.loc[i]["statename"]== 'Vermont':
#         df2.iloc[[i],[75]] = 1
#     elif df2.loc[i]["statename"]== 'Virginia':
#         df2.iloc[[i],[76]] = 1
#     elif df2.loc[i]["statename"]== 'Washington':
#         df2.iloc[[i],[77]] = 1
#     elif df2.loc[i]["statename"]== 'West Virginia':
#         df2.iloc[[i],[78]] = 1
#     elif df2.loc[i]["statename"]== 'Wisconsin':
#         df2.iloc[[i],[79]] = 1
#     elif df2.loc[i]["statename"]== 'Wyoming':
#         df2.iloc[[i],[80]] = 1
#     elif df2.loc[i]["statename"]== 'Dist. of Col.':
#         df2.iloc[[i],[81]] = 1
#     else: 
#         df2.iloc[[i],[82]] = 1

In [None]:
#Test to make sure state fixed effects worked
# for i in range(1051):
#     if df2.loc[i]["statename"]== 'California':
#         print(df2.loc[i]["CA"])

In [None]:
#Another test to make sure state fixed effects worked
# print(df2[df2['test'] == 1])

In [None]:
# Creating lagged variables for the Democratic vote percentage.
# shift() lags by row position inside each state, so the rows are put in
# (statename, year) order first; otherwise the "previous" row is whatever row
# happened to precede it after the merges. The shifted Series keeps the
# original index labels, so assigning it back aligns each value with its row.
dem_per_by_state = (
    df_new.sort_values(['statename', 'year'], kind='mergesort')
    .groupby('statename')['dem_per']
)
df_new['lag_dem_per'] = dem_per_by_state.shift(1)
df_new['lag2_dem_per'] = dem_per_by_state.shift(2)

In [None]:
# Counter variable for year: rows sharing a year get the same group number,
# with the groups numbered in sorted year order starting at 0.
year_groups = df_new.groupby('year')
df_new['counter'] = year_groups.ngroup()

In [None]:
# Interaction term: the year counter for Southern rows, 0 for all others.
df_new['int_south_year'] = df_new['counter'].mul(df_new['region_s'])

In [None]:
# Optional step: cap dem_primary to the 30-70 range. Values above 70 become
# 70, values below 30 become 30, everything else is left as is.
df_new['dem_primary'] = df_new['dem_primary'].clip(lower=30, upper=70)

In [None]:
# Lagging the Berry state ideology variables (citi6016 and inst6017_nom).
# As with any positional lag, the rows are put in (statename, year) order
# first so that shift(1) really picks up the preceding row in time for that
# state; the result aligns back to df_new through the index.
ideology_by_state = (
    df_new.sort_values(['statename', 'year'], kind='mergesort')
    .groupby('statename')
)
df_new['lag_citi'] = ideology_by_state['citi6016'].shift(1)
df_new['lag_nom'] = ideology_by_state['inst6017_nom'].shift(1)

In [None]:
# Dump the engineered frame to total.csv (without the index column) so the
# columns can be inspected and pruned by hand.
total_csv = r"C:\\Users\\mawal\\OneDrive - Binghamton University\\Desktop\\My Papers\\Papers\\Forecasting_2020\\total.csv"
df_new.to_csv(total_csv, index=False)

In [None]:
# Reload total.csv so that any column edits made in Excel are picked up.
total_csv = r"C:\\Users\\mawal\\OneDrive - Binghamton University\\Desktop\\My Papers\\Papers\\Forecasting_2020\\total.csv"
df_new = pd.read_csv(total_csv)
#df_new = pd.read_csv(r"C:\\Users\\mawal\\OneDrive - Binghamton University\\Desktop\\My Papers\\Papers\\Forecasting_2020\\test.csv")


In [None]:
#Creating our predictor matrix (X) and our response variable (y)
# cols = ['year', 'third_party', 'dem_inc', 'rep_inc', 'dem_inc_party', 'unem_dec_t_1', 'unem_per_change_t_2_t_1', 'inflation_t_1', 'last_recession', 'greatwar_years']
#X = df2[cols]
# X = df_new.iloc[:,[4, 9,10,12, 13, 14, 15, 16, 17, 18, 19, 23, 25,26,28,29,30,31,35,38,39,40,41,42,43,44,45,46,47,48,49,50,51]]

# Keep only rows that are complete and finite. dropna() removes rows with
# missing values; the mask then removes rows holding +/- infinity.
# `axis=1` is passed by keyword: pandas 2.x no longer accepts it positionally.
X = df_new.dropna()
has_infinite = X.isin([np.inf, -np.inf]).any(axis=1)
X = X[~has_infinite]

# drop=True discards the old row labels. Without it reset_index() would insert
# them as an extra 'index' column, which would then be used as a predictor.
X = X.reset_index(drop=True)

y = X['dem_per']  # This is the response variable
X = X.drop('dem_per', axis=1)  # Remove the response variable from our predictors.
# X2 = df2.iloc[:, 31:81]

# X = pd.concat([X, X2],  axis=1)

# print(np.any(np.isnan(X)))
# print(np.all(np.isfinite(X)))

In [None]:
# Write the working frame, the predictors and the response to CSV files
# (index column omitted in every file).
exports = [
    (df_new, r"C:\\Users\\mawal\\OneDrive - Binghamton University\\Desktop\\My Papers\\Papers\\Forecasting_2020\\test2.csv"),
    (X, r"C:\\Users\\mawal\\OneDrive - Binghamton University\\Desktop\\My Papers\\Papers\\Forecasting_2020\\X.csv"),
    (y, r"C:\\Users\\mawal\\OneDrive - Binghamton University\\Desktop\\My Papers\\Papers\\Forecasting_2020\\y.csv"),
]
for table, csv_path in exports:
    table.to_csv(csv_path, index=False)

In [None]:
# y2 = y > 50
# y2 = y2 * 1 #converting the booleans to integers
# y2

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
split_parts = train_test_split(X, y, test_size=0.30, random_state=333)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Converting the response variable to flat 1-D numpy arrays for the estimators.
# np.asarray(...).ravel() works whether the response is a Series or a
# single-column frame. The previous element-by-element loop indexed `i[0]`,
# which raises on the scalar elements of a Series, and it produced Python
# lists, which have no .reshape() for the accuracy cells further down.
y_train = np.asarray(y_train).ravel()
y_test = np.asarray(y_test).ravel()
y = np.asarray(y).ravel()

In [None]:
#Random Grid Search with Random Forest
# Number of trees in random forest: 200, 400, ..., 2000
n_estimators = list(range(200, 2001, 200))
# Number of features to consider at every split.
# 1.0 means "all features"; it replaces 'auto', which meant the same thing for
# regressors and is no longer accepted by scikit-learn >= 1.3.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree: 10, 20, ..., 110
max_depth = list(range(10, 111, 10))
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor(random_state = 42)
# Random search of parameters, using 5 fold cross validation,
# search across 100 different combinations, and use all available cores.
# random_state fixes which 100 combinations are drawn, so a re-run searches
# the same candidates (the GBM search below is seeded the same way).
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state = 42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

#Best parameters
rf_random.best_params_

In [None]:
# Exhaustive alternative to the randomized search above: GridSearchCV
# evaluates every combination in random_grid with 5 fold cross validation,
# using all available cores.
rf = RandomForestRegressor(random_state=42)
rf_random = GridSearchCV(
    estimator=rf,
    param_grid=random_grid,
    cv=5,
    verbose=2,
    n_jobs=-1,
)
rf_random.fit(X_train, y_train)

# Best parameter combination found
rf_random.best_params_

In [None]:
#After Random Search do Grid Search based on the results
# Create the parameter grid based on the results of random search
param_grid = {
    'bootstrap': [False, True],
    'max_depth': [90, 100, 110, 120],
    # 1.0 = all features; replaces 'auto', which scikit-learn >= 1.3 rejects.
    'max_features': [1.0, 'sqrt'],
    'min_samples_leaf': [1, 2],
    # An integer min_samples_split must be at least 2, so the former
    # candidate value 1 (which can never fit) is left out.
    'min_samples_split': [2, 3],
    'n_estimators': [1600, 1800, 2000]
}

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                          cv = 5, n_jobs = -1, verbose = 2)
#Training with the Grid Search
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

In [None]:
# Second, narrower random-forest grid: only max_depth and n_estimators vary.
param_grid = {
    'bootstrap': [False],
    'max_depth': [70, 80, 90],
    'max_features': ['sqrt'],
    'min_samples_leaf': [1],
    'min_samples_split': [2],
    'n_estimators': [2000, 2100, 2200],
}

# 5 fold cross-validated grid search on all cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Best combination and its mean cross-validated score
print(grid_search.best_params_, grid_search.best_score_, sep='\n')

In [None]:
# Third random-forest grid: shallower max_depth values and a finer
# n_estimators step.
param_grid = {
    'bootstrap': [False],
    'max_depth': [10, 20, 30, 40, 50, 60, 70],
    'max_features': ['sqrt'],
    'min_samples_leaf': [1],
    'min_samples_split': [2],
    'n_estimators': [2050, 2100, 2150],
}

# 5 fold cross-validated grid search on all cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Best combination and its mean cross-validated score
print(grid_search.best_params_, grid_search.best_score_, sep='\n')

In [None]:
#Cross-validated RMSE for the tuned Random Forest
# random_state is set, as for the other forests in this notebook, so the
# per-split feature sampling (max_features='sqrt') and therefore the reported
# score are reproducible.
rforest = RandomForestRegressor(bootstrap = False, max_depth= 30,max_features= 'sqrt', min_samples_leaf= 1,min_samples_split= 2, n_estimators= 2100, random_state = 42)
rforest.fit(X_train, y_train)
# The scorer returns the negated RMSE for each of the 5 folds.
scores = cross_val_score(rforest, X_train, y_train, cv=5, scoring = 'neg_root_mean_squared_error')
np.mean(scores)

In [None]:
#Gradient Boosting Machine
# Hyperparameter tuning: https://medium.com/all-things-ai/in-depth-parameter-tuning-for-gradient-boosting-3363992e9bae

#Random Grid Search with Gradient Boosting
learning_rate = [0.01, .05, .1, .5, 1]
n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]
# Tree depth must be an integer. np.linspace would yield floats (1.0 ... 32.0),
# which the estimator does not accept as max_depth, so use plain ints 1..32.
max_depth = list(range(1, 33))
# Fractions of the training samples: 0.1, 0.2, ..., 1.0
min_samples_split = np.linspace(0.1, 1.0, 10, endpoint=True)
# Fractions of the training samples: 0.1, 0.2, ..., 0.5
min_samples_leaf = np.linspace(0.1, 0.5, 5, endpoint=True)
# Feature counts from 1 up to (number of predictors - 1)
max_features = list(range(1,X_train.shape[1]))

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'learning_rate': learning_rate}

In [None]:
# Randomized hyperparameter search for the gradient boosting model:
# 100 sampled combinations from random_grid, 5 fold cross validation,
# all available cores, seeded for repeatability.
gbm = GradientBoostingRegressor(random_state=42)
gbm_random = RandomizedSearchCV(
    estimator=gbm,
    param_distributions=random_grid,
    n_iter=100,
    random_state=42,
    cv=5,
    verbose=2,
    n_jobs=-1,
)
gbm_random.fit(X_train, y_train)

# Best parameter combination found
gbm_random.best_params_

In [None]:
# First gradient-boosting grid, built around the randomized-search result.
param_grid = {
    'learning_rate': [.05, .1, .15],
    'max_depth': [20, 23, 27, 30, 33],
    'max_features': [12, 13, 14, 15, 16],
    'min_samples_leaf': [0.05, .1, .15],
    'min_samples_split': [.3, .4, .5],
    'n_estimators': [80, 90, 100, 110, 120],
}

# 5 fold cross-validated grid search on all cores
grid_search = GridSearchCV(
    estimator=gbm,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Best combination and its mean cross-validated score
print(grid_search.best_params_, grid_search.best_score_, sep='\n')

In [None]:
# Second gradient-boosting grid: lower max_depth and smaller
# min_samples_leaf / min_samples_split fractions.
param_grid = {
    'learning_rate': [.05, .1, .15],
    'max_depth': [16, 18, 20, 22, 24],
    'max_features': [13, 14],
    'min_samples_leaf': [.025, .05, .075],
    'min_samples_split': [.2, .3, .4],
    'n_estimators': [100, 110, 120],
}

# 5 fold cross-validated grid search on all cores
grid_search = GridSearchCV(
    estimator=gbm,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Best combination and its mean cross-validated score
print(grid_search.best_params_, grid_search.best_score_, sep='\n')

In [None]:
# Third gradient-boosting grid: everything pinned except max_depth and
# n_estimators.
param_grid = {
    'learning_rate': [.1],
    'max_depth': [12, 14, 16],
    'max_features': [14],
    'min_samples_leaf': [.05],
    'min_samples_split': [.3],
    'n_estimators': [105, 110],
}

# 5 fold cross-validated grid search on all cores
grid_search = GridSearchCV(
    estimator=gbm,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Best combination and its mean cross-validated score
print(grid_search.best_params_, grid_search.best_score_, sep='\n')

In [None]:
# Fourth gradient-boosting grid: only max_depth varies (2 through 12).
param_grid = {
    'learning_rate': [.1],
    'max_depth': [2, 4, 6, 8, 10, 12],
    'max_features': [14],
    'min_samples_leaf': [.05],
    'min_samples_split': [.3],
    'n_estimators': [110],
}

# 5 fold cross-validated grid search on all cores
grid_search = GridSearchCV(
    estimator=gbm,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Best combination and its mean cross-validated score
print(grid_search.best_params_, grid_search.best_score_, sep='\n')

In [None]:
# Cross-validated score of the tuned GBM on the training split. The scorer
# returns the negated RMSE for each of the 5 folds; the cell shows their mean.
gbm = GradientBoostingRegressor(
    learning_rate=.1,
    max_depth=6,
    max_features=14,
    min_samples_leaf=0.05,
    min_samples_split=0.3,
    n_estimators=110,
    random_state=42,
)
gbm.fit(X_train, y_train)
scores = cross_val_score(
    gbm, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error'
)
np.mean(scores)

In [None]:
# Fit the tuned GBM on the training split, predict the held-out rows, and
# show the model's score on the test split.
gbm = GradientBoostingRegressor(
    learning_rate=.1,
    max_depth=6,
    max_features=14,
    min_samples_leaf=0.05,
    min_samples_split=0.3,
    n_estimators=110,
    random_state=42,
)
gbm.fit(X_train, y_train)
y_pred = gbm.predict(X_test)
gbm.score(X_test, y_test)

In [None]:
#Creating an array to calculate the testing accuracy.
# reshape(-1, 1) makes column vectors of whatever length the test split has,
# instead of hard-coding 180 rows; np.asarray also accepts a plain list.
y_test = np.asarray(y_test).reshape(-1, 1)
y_pred = np.asarray(y_pred).reshape(-1, 1)
results = np.concatenate((y_test, y_pred), axis = 1) #Combining the two arrays [[Y,Y_hat],[Y,Y_hat],...]

#Count every instance where the actual and the predicted value are both above
#or both below 50 percent. A value of exactly 50 on either side never counts.
actual_share = results[:, 0]
predicted_share = results[:, 1]
same_side = ((actual_share > 50) & (predicted_share > 50)) | ((actual_share < 50) & (predicted_share < 50))
results2 = int(same_side.sum())

#Calculating the accuracy
results2/len(results)

In [None]:
# Refit the tuned GBM on all rows, predict those same rows, and show the
# in-sample score.
gbm = GradientBoostingRegressor(
    learning_rate=.1,
    max_depth=6,
    max_features=14,
    min_samples_leaf=0.05,
    min_samples_split=0.3,
    n_estimators=110,
    random_state=42,
)
gbm.fit(X, y)
y_pred = gbm.predict(X)
gbm.score(X, y)

In [None]:
#Creating an array to calculate the accuracy of the final model on all of the data.
# reshape(-1, 1) makes column vectors of whatever length the data has, instead
# of hard-coding 600 rows; np.asarray also accepts a plain list.
y = np.asarray(y).reshape(-1, 1)
y_pred = np.asarray(y_pred).reshape(-1, 1)
results = np.concatenate((y, y_pred), axis = 1) #Combining the two arrays [[Y,Y_hat],[Y,Y_hat],...]

#Count every instance where the actual and the predicted value are both above
#or both below 50 percent. A value of exactly 50 on either side never counts.
actual_share = results[:, 0]
predicted_share = results[:, 1]
same_side = ((actual_share > 50) & (predicted_share > 50)) | ((actual_share < 50) & (predicted_share < 50))
results2 = int(same_side.sum())

#Calculating the accuracy
results2/len(results)

In [None]:
# Display the [actual, predicted] pairs assembled in the accuracy cell above.
results