# Random Forest Without Hyperparameter Tuning

In [None]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np
from sklearn import metrics 
import csv
import pickle as pk
def train_test_split(item):
    training=pd.read_csv('../data/test_training_data/'+item+'/final_training_data.csv',sep=':')
    y_train=training.filter(regex=item)
    X_train=training.drop(y_train, axis=1)
    test=pd.read_csv('../data/test_training_data/'+item+'/final_test_data.csv',sep=':')
    y_test=test.filter(regex=item)
    X_test=test.drop(y_test, axis=1)
    return X_train,X_test, y_train,y_test

def print_save_metrics(y_test,y_pred,item):
    r2_score=metrics.r2_score(y_test,y_pred)
    MAE=metrics.mean_absolute_error(y_test, y_pred)
    MSE=metrics.mean_squared_error(y_test, y_pred)
    RMSE=np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    with open('../data/random_forest/RandomForest_metrics_'+item+'.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["r2_score","Mean Absolute Error (MAE)",'Mean Squared Error (MSE)','Root Mean Squared Error (RMSE)'])
        writer.writerow([r2_score,MAE,MSE,RMSE])
    print("r2_score:"+item,r2_score)
    print('Mean Absolute Error (MAE):'+item, MAE)
    print('Mean Squared Error (MSE):'+item, MSE)
    print('Root Mean Squared Error (RMSE):'+item, RMSE)

def init_column_maps():
    column_maps = []
    with open("../parameters/column_map.txt") as f:
        parameters = f.readlines()
        for p in parameters:
            p = p.replace("\n", "")
            p = tuple(p.split(":"))
            column_maps.append(p)
    return column_maps

def feature_importance(important_list):
    lst_column_map=init_column_maps()
    lst_final=[]
    # important_list=sorted(list(zip(regressor.feature_importances_,X_test.columns)),key =lambda x: x[0] ,reverse=True)[:10]
    for import_item in important_list:
        import_item_rem=import_item[1].replace('_imputed','')
        for column_item in lst_column_map:
            if import_item_rem == column_item[0]:
                import_renamed=column_item[2]
                l = list(import_item)
                l[1] = import_renamed
                lst_final.append(tuple(l))
                break
    return lst_final
def random_forest():
    output_variable= ['School_Code','OP1','OP2','OP6','OP3','OP4','OP5','OP7','OP8','OP9','OP10','OP11','OP12','OP13','OP14']
    output_variable.remove('School_Code')
    for item in output_variable:
        X_train,X_test, y_train,y_test=train_test_split(item)
        regressor = RandomForestRegressor()
        regressor.fit(X_train,y_train)
        y_pred=regressor.predict(X_test)
        with open('../data/random_forest/'+item+'.pkl', 'wb') as pickle_file:
            pk.dump(y_pred, pickle_file)
        
        important_list=sorted(list(zip(regressor.feature_importances_,X_test.columns)),key =lambda x: x[0] ,reverse=True)[:10]
        import_lst=feature_importance(important_list)
        file = open('../data/random_forest/feature_importance_'+item+'.csv', 'w+', newline ='')
        with file:    
            write = csv.writer(file)
            write.writerows(import_lst)
        print_save_metrics(y_test,y_pred,item)
random_forest() 

# Random Forest Hyperparameter Tuning

In [23]:
import numpy as np
import pandas as pd
from sklearn import metrics 
import csv
import pickle as pk
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
def print_save_metrics_tuned_manually(y_test,y_pred,item):
    r2_score=metrics.r2_score(y_test,y_pred)
    MAE=metrics.mean_absolute_error(y_test, y_pred)
    MSE=metrics.mean_squared_error(y_test, y_pred)
    RMSE=np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    with open('../data/random_forest/RandomForest_metrics_tunedmanually'+item+'.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["r2_score","Mean Absolute Error (MAE)",'Mean Squared Error (MSE)','Root Mean Squared Error (RMSE)'])
        writer.writerow([r2_score,MAE,MSE,RMSE])
    print("r2_score:"+item,r2_score)
    print('Mean Absolute Error (MAE):'+item, MAE)
    print('Mean Squared Error (MSE):'+item, MSE)
    print('Root Mean Squared Error (RMSE):'+item, RMSE)
def init_column_maps():
    column_maps = []
    with open("../parameters/column_map.txt") as f:
        parameters = f.readlines()
        for p in parameters:
            p = p.replace("\n", "")
            p = tuple(p.split(":"))
            column_maps.append(p)
    return column_maps

def feature_importance(important_list):
    lst_column_map=init_column_maps()
    lst_final=[]
    # important_list=sorted(list(zip(regressor.feature_importances_,X_test.columns)),key =lambda x: x[0] ,reverse=True)[:10]
    for import_item in important_list:
        import_item_rem=import_item[1].replace('_imputed','')
        for column_item in lst_column_map:
            if import_item_rem == column_item[0]:
                import_renamed=column_item[2]
                l = list(import_item)
                l[1] = import_renamed
                lst_final.append(tuple(l))
                break
    return lst_final
def train_test_split(item):
    training=pd.read_csv('../data/test_training_data/'+item+'/final_training_data.csv',sep=':')
    y_train=training.filter(regex=item)
    X_train=training.drop(y_train, axis=1)
    test=pd.read_csv('../data/test_training_data/'+item+'/final_test_data.csv',sep=':')
    y_test=test.filter(regex=item)
    X_test=test.drop(y_test, axis=1)
    return X_train,X_test, y_train,y_test

n_estimators=[int(x) for x in np.linspace(start = 80, stop = 200, num = 5)]
max_depth=[int(x) for x in np.linspace(5, 15, num = 3)]
min_samples_split=[1,5,10]
min_samples_leaf=[1,5,10]
# bootstrap=[True,False]
param_grid={'n_estimators' : n_estimators,
            # 'max_depth' : max_depth,
            'min_samples_leaf': min_samples_leaf
            # 'bootstrap':bootstrap
            }
print(param_grid)
item='OP9'
X_train,X_test, y_train,y_test=train_test_split(item)
model = RandomForestRegressor()
rf_Grid=GridSearchCV(estimator=model,param_grid=param_grid,verbose=2,cv=3,n_jobs=1)
rf_Grid.fit(X_train,y_train)
print(rf_Grid.best_params_)
rf_regressor=rf_Grid.best_estimator_
y_predict=rf_regressor.predict(X_test)
# r2_score=metrics.r2_score(y_test,y_predict)
with open('../data/random_forest/tuned_manually'+item+'.pkl', 'wb') as pickle_file:
    pk.dump(y_predict, pickle_file)
important_list=sorted(list(zip(rf_regressor.feature_importances_,X_test.columns)),key =lambda x: x[0] ,reverse=True)[:10]
import_lst=feature_importance(important_list)
file = open('../data/random_forest/feature_importance_tuned_manually'+item+'.csv', 'w+', newline ='')
with file:    
    write = csv.writer(file)
    write.writerows(import_lst)
print_save_metrics_tuned_manually(y_test,y_predict,item)

ed. Please change the shape of y to (n_samples,), for example using ravel().
  estimator.fit(X_train, y_train, **fit_params)
[CV] .............. min_samples_leaf=1, n_estimators=80, total=   0.3s
[CV] min_samples_leaf=1, n_estimators=80 .............................
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)
[CV] .............. min_samples_leaf=1, n_estimators=80, total=   0.4s
[CV] min_samples_leaf=1, n_estimators=80 .............................
  estimator.fit(X_train, y_train, **fit_params)
[CV] .............. min_samples_leaf=1, n_estimators=80, total=   0.4s
[CV] min_samples_leaf=1, n_estimators=110 ............................
  estimator.fit(X_train, y_train, **fit_params)
[CV] ............. min_samples_leaf=1, n_estimators=110, total=   0.4s
[CV] min_samples_leaf=1, n_estimators=110 ............................
  estimator.fit(X_train, y_train, **fit_params)
[CV] ............. min_samples_leaf

# Random Forest With PCA

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import pickle as pk
import csv

def train_test_split_PCA(item):
    
    item=item.replace('%','')
    item=item.replace(' ','')
    item=item.strip()
    training=pd.read_csv('../data/'+item+'/imputed_training_data.csv')
    training=training.drop(['Town'], axis=1)
    y_train=training.filter(regex=item)
    X_train=training.drop(y_train, axis=1)
    test=pd.read_csv('../data/'+item+'/imputed_test_data.csv')
    test=test.drop(['Town'], axis=1)
    y_test=test.filter(regex=item)
    X_test=test.drop(y_test, axis=1)
    return X_train,X_test, y_train,y_test 


def print_save_metrics(y_test,y_pred):
    r2_score=metrics.r2_score(y_test,y_pred)
    MAE=metrics.mean_absolute_error(y_test, y_pred)
    MSE=metrics.mean_squared_error(y_test, y_pred)
    RMSE=np.sqrt(metrics.mean_squared_error(y_test, y_pred))

    with open('../data/random_forest/RandomForest_metrics_PCA_'+item+'.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["r2_score","Mean Absolute Error (MAE)",'Mean Squared Error (MSE)','Root Mean Squared Error (RMSE)'])
        writer.writerow([r2_score,MAE,MSE,RMSE])
        

    print("r2_score:"+item,r2_score)
    print('Mean Absolute Error (MAE):'+item, MAE)
    print('Mean Squared Error (MSE):'+item, MSE)
    print('Root Mean Squared Error (RMSE):'+item, RMSE)

output=output_variable.drop(['School Code'], axis=1)
for item in output:
    X_train,X_test, y_train,y_test=train_test_split_PCA(item)
    processed_input=pd.read_csv('../data/scaled_processed_input'+item+'.csv')
    with open('../data/scaled_pca'+item+'.pkl', 'rb') as pickle_file:
        pca = pk.load(pickle_file)
        scaled_data_train = pca.transform(X_train)
        X_train = pd.DataFrame(data = scaled_data_train)
       
        scaled_data_test = pca.transform(X_test)
        X_test = pd.DataFrame(data = scaled_data_test)

        regressor = RandomForestRegressor(n_estimators=150)
        regressor.fit(X_train,y_train)
        y_pred=regressor.predict(X_test)
        with open('../data/random_forest/PCA_'+item+'.pkl', 'wb') as pickle_file:
            pk.dump(y_pred, pickle_file)
        important_list=sorted(list(zip(regressor.feature_importances_,X_test.columns)),key =lambda x: x[0] ,reverse=True)[:10]
        file = open('../data/random_forest/feature_importance_PCA_'+item+'.csv', 'w+', newline ='')
        with file:    
            write = csv.writer(file)
            write.writerows(important_list)
        print_save_metrics(y_test,y_pred)
        