In [46]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from ast import literal_eval
from numpy import nan
import xgboost
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from fancyimpute import KNN
import missingno as msno
from copy import deepcopy
import impyute.imputation.cs.mice as mice_imputation
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score, explained_variance_score, median_absolute_error
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold

pd.set_option('display.max_columns', 120)
pd.set_option('display.max_rows', 120)

In [47]:
def mape(y_actual, y_pred):
    y_actual = y_actual['lifespan'].tolist()
    y_pred = [i[0] if type(i) == list else i for i in y_pred.tolist()]
    return (np.sum([np.abs((i-j)/i) for i,j in zip(y_actual, y_pred)])*100)/len(y_actual)

def regression_report(y_true, y_pred):
    ev = explained_variance_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    try:
        msle = mean_squared_log_error(y_true, y_pred)
    except:
        msle = "error"
    medal = median_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mape_score = mape(y_true, y_pred)
    response = {
        "Explained Variance": ev,
        "Mean Squared Error": mse,
        "Mean Absolute Error": mae,
        "Mean Squared Log Error": msle,
        "Median Absolute Error": medal,
        "MAPE": mape_score,
        "r2_score": r2,
        "RMSE": mse**0.5
    }
    return response

In [48]:
df = pd.read_csv("output/NAG_DETAILS_v8.csv")

In [49]:
delete = ["nag_name", "ideology(s)",'objective(s)']
df.drop(delete, axis=1, inplace=True)

In [50]:
X = df.drop(['nagcode_1', 'lifespan'], axis=1)
y = df[['lifespan']]
# X_complete = mice_imputation(X.values)
X_complete = KNN(k=4).fit_transform(X.values)

X = pd.DataFrame(X_complete, columns=X.columns)

Imputing row 1/459 with 0 missing, elapsed time: 0.089
Imputing row 101/459 with 0 missing, elapsed time: 0.091
Imputing row 201/459 with 24 missing, elapsed time: 0.093
Imputing row 301/459 with 0 missing, elapsed time: 0.097
Imputing row 401/459 with 0 missing, elapsed time: 0.100


In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=32)

In [52]:
regressor = XGBRegressor()

In [53]:
params={
 "learning_rate"    : [0.0001, 0.005, 0.01, 0.025, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.5 ] ,
 "max_depth"        : [ 3, 4, 5, 6, 7, 8, 10],
 "min_child_weight" : [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ],
 "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4, 0.5, 0.6, 0.7 ],
 "colsample_bytree" : [ 0.1, 0.3, 0.4, 0.5 , 0.7, 0.8 ],
  "n_estimators": [100, 150, 200, 250, 300, 350, 400],
  "sub_sample": [1, 0.8]  
}

In [54]:
my_scorer = make_scorer(mape, greater_is_better=False)

In [55]:

random_search=RandomizedSearchCV(regressor,param_distributions=params,
                           n_iter=5,
                           scoring='neg_mean_squared_error',
                           n_jobs=6,
                           cv=300,
                           verbose=3)

In [56]:
import time
start = time.time()
random_search.fit(X_train,y_train)
end = time.time()
print(end-start)

Fitting 300 folds for each of 5 candidates, totalling 1500 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    1.3s
[Parallel(n_jobs=6)]: Done 116 tasks      | elapsed:    3.5s
[Parallel(n_jobs=6)]: Done 276 tasks      | elapsed:    7.3s
[Parallel(n_jobs=6)]: Done 500 tasks      | elapsed:   45.9s
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:  1.2min
[Parallel(n_jobs=6)]: Done 1140 tasks      | elapsed:  3.4min


250.23839664459229


[Parallel(n_jobs=6)]: Done 1500 out of 1500 | elapsed:  4.2min finished


In [57]:
random_search.best_params_

{'colsample_bytree': 0.5,
 'gamma': 0.2,
 'learning_rate': 0.25,
 'max_depth': 3,
 'min_child_weight': 9,
 'n_estimators': 100,
 'sub_sample': 1}

In [58]:
model = XGBRegressor(**{'colsample_bytree': 0.5,
 'gamma': 0.2,
 'learning_rate': 0.25,
 'max_depth': 3,
 'min_child_weight': 9,
 'n_estimators': 100,
 'sub_sample': 1})
model.fit(X_train, y_train) 

Y_pred=model.predict(X_test)
Y_pred_train=model.predict(X_train)

regression_report(y_train, Y_pred_train),regression_report(y_test, Y_pred)

({'Explained Variance': 0.9888478627061943,
  'MAPE': 23.517789303097484,
  'Mean Absolute Error': 0.9458890024582761,
  'Mean Squared Error': 1.6426317339661558,
  'Mean Squared Log Error': 'error',
  'Median Absolute Error': 0.7377566993236542,
  'RMSE': 1.2816519550822507,
  'r2_score': 0.9888478578820806},
 {'Explained Variance': 0.7599811656505073,
  'MAPE': 86.34369262683049,
  'Mean Absolute Error': 4.126031720443912,
  'Mean Squared Error': 40.25715697158024,
  'Mean Squared Log Error': 'error',
  'Median Absolute Error': 2.964641571044922,
  'RMSE': 6.344852793531166,
  'r2_score': 0.7599702889575255})

In [37]:
import json
f = open("output/weight.json", "rb")
weights = json.load(f)
f.close()

In [38]:
weights[2]

{'colsample_bytree': 0.5,
 'gamma': 0.1,
 'learning_rate': 0.025,
 'max_depth': 5,
 'min_child_weight': 10,
 'n_estimators': 350}

In [323]:
def make_weight_string(weight_dict, config_headers):
    temp=[]
    for i in config_headers:
        temp.append("{}: {}".format(i, weight_dict.get(i)))
    return ",\n".join(temp)

def make_report_string(weight_dict, config_headers, alias):
    temp=[]
    if 'sub_sample' not in weight_dict:
        weight_dict.update({'sub_sample':1})
    for i in config_headers:
        temp.append("{}: {}".format(alias.get(i,i), weight_dict.get(i)))
    return ",\n".join(temp)
        

In [325]:
'''
Model-III: Using Xgboost for regression.


'''
import csv
fw = open("output/configurations.csv", "w")
writer = csv.writer(fw)
headers = ["Configuration Name", "Xgboost Hyperparameters", "Regression Report Training", "Regression Report Testing"]
config_headers = ["max_depth", "colsample_bytree", "gamma", "n_estimators", "learning_rate", "min_child_weight", "sub_sample"]
report_headers = ["Explained Variance","Mean Squared Error","Mean Absolute Error","Median Absolute Error","MAPE",  "r2_score", "RMSE"]
alias = {'RMSE': 'Root Mean Squared Error', 'MAPE': 'Mean Absolute Percentage Error', 'r2_score': 'R^2 Score'}
writer.writerow(headers)
for index, weight in enumerate(weights):
    col1 = "Configuration-{}".format(index+1)
    col2 = make_weight_string(weight, config_headers)
    model = XGBRegressor(**weight)
    model.fit(X_train, y_train) 

    Y_pred=model.predict(X_test)
    Y_pred_train=model.predict(X_train)

    train_report = regression_report(y_train, Y_pred_train)
    test_report = regression_report(y_test, Y_pred)
    col3 = make_report_string(train_report, report_headers, alias)
    col4 = make_report_string(test_report, report_headers, alias)
    writer.writerow([col1, col2, col3, col4])

fw.close()