# Data modeling #

In [2]:
#Libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import tensorflow
warnings.filterwarnings('ignore')
#Feature selection
from sklearn.feature_selection import RFE
import time

#Linear regression
from sklearn.linear_model import LinearRegression
#Ridge
from sklearn.linear_model import Ridge, Lasso
#Decision trees
from sklearn.tree import DecisionTreeRegressor
#Random forest
from sklearn.ensemble import RandomForestRegressor
#Support Vector Machine (SVR)
from sklearn.svm import SVR
#Neural Nets
import keras
from keras.models import Sequential
from keras.layers import Dense,Dropout
from scikeras.wrappers import KerasRegressor
from keras.optimizers import SGD, Adam
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

#XGboost
import xgboost as xgb

#Performance metrics
from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_error, r2_score,accuracy_score

#Cross validation and train-test split
from sklearn.model_selection import train_test_split,cross_val_score, cross_val_predict, KFold

#Grid Search
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

In [3]:
# Disable column-width truncation when pandas renders a frame
# (presumably so long cell values such as hyper-parameter dicts stay readable).
pd.set_option('display.max_colwidth', None)  # For Pandas version < 1.0.0, use -1 instead of None


In [4]:
# Read the cleaned dataset and keep every column except the first one
# (presumably a saved index column -- TODO confirm against the CSV).
df = pd.read_csv('cleanedData.csv').iloc[:, 1:]


In [6]:
# Shared random seed: passed to TensorFlow's global seeding call and to the
# Random Forest / XGBoost estimators defined further down.
seed = 1

In [17]:
# Number of columns in the loaded frame.
df.shape[1]

103

In [21]:
def create_model(meta=None):
    """Build the (uncompiled) network used by the ``KerasRegressor`` wrapper.

    Architecture: one hidden ``Dense`` layer with 5 ``tanh`` units followed by
    a single linear output unit.

    Parameters
    ----------
    meta : dict, optional
        Fit-time metadata. scikeras supplies it automatically when the build
        function declares a ``meta`` parameter; its ``"n_features_in_"`` entry
        is used as the input width, so the network adapts to however many
        columns the training matrix has. When ``meta`` is missing (e.g. the
        function is called by hand) the width falls back to 28, the value
        that used to be hard-coded here.

    Returns
    -------
    keras.models.Sequential
        The assembled model; loss and optimizer are attached by the wrapper.
    """
    default_width = 28  # legacy fixed input width
    n_features = meta.get("n_features_in_", default_width) if meta else default_width

    # create model
    model = Sequential()
    model.add(Dense(5, input_shape=(n_features,), activation='tanh'))
    model.add(Dense(1, activation='linear'))
    return model

# Seed TensorFlow's global random generator with the shared notebook seed.
tensorflow.random.set_seed(seed)


In [28]:
#Models
#
# Each entry describes one candidate regressor:
#   name                        - label used in progress messages and the results table
#   object                      - unfitted estimator handed to the grid search
#   paramsGrid                  - hyper-parameter grid (dict, or list of dicts) for the search
#   requires_feature_selection  - whether the correlation filter is applied before fitting
#   requires_scalling /
#   needs_outliers_handling     - descriptive flags stored alongside the model

lr = {"name":"Linear Regression",
      "object": LinearRegression(),
      "paramsGrid": {},
      "requires_feature_selection": True,
      "requires_scalling":True,
      "needs_outliers_handling":True
}

rf = {"name":"Random Forest",
      "object": RandomForestRegressor(criterion='squared_error',
                               min_samples_leaf=3,
                               bootstrap=True,
                               oob_score=False,
                               random_state=seed,
                               verbose=0),
      # Only integer values are valid here: max_depth must be an int, and a float
      # min_samples_split is interpreted as a fraction in (0, 1]. The former
      # 3.5 / 4.5 candidates made every fit that used them fail (scored as NaN),
      # so they are dropped; the remaining candidates are unchanged.
      "paramsGrid": {'max_depth': [3,4], 'min_samples_split': [4,5],'n_estimators': [300,500,800]},
      "requires_feature_selection": True,
      "requires_scalling":True,
      "needs_outliers_handling":True
}

ridge = {"name":"Ridge",
      "object": Ridge(),
      # Duplicate candidate 29 removed (it was being fitted twice).
      "paramsGrid": {'alpha':[10,20,29,30,31,32,33,35,40,45,50]},
      "requires_feature_selection": True,
      "requires_scalling":True,
      "needs_outliers_handling":True
}

lasso = {"name":"Lasso",
      "object": Lasso(max_iter=10000),
      "paramsGrid": {'alpha':[1e-4,1e-3,1e-2,1,5,10,20,30,35]},
      "requires_feature_selection": False,
      "requires_scalling":True,
      "needs_outliers_handling":True
}

svr = {
    "name": "Support Vector Regressor",
    "object": SVR(),
    # Two sub-grids: gamma is only searched together with the RBF kernel.
    "paramsGrid": [{'kernel': ['rbf'], 'gamma': [.0009,.001,.005],'C': [1500,2900,4000]},{'kernel': ['linear'], 'C': [1, 10, 100, 1000]}],
    "requires_feature_selection": True,
    "requires_scalling": True,
    "needs_outliers_handling": True
}


boost = {"name":"XGBoost",
      "object": xgb.XGBRegressor(objective='reg:squarederror', seed=seed),
      "paramsGrid": {'colsample_bytree': [0.1,0.3,0.5], 'n_estimators':[10,15,17,20], 'max_depth': [2,3,4,5]}, 
      "requires_feature_selection": True,
      "requires_scalling":True,
      "needs_outliers_handling":True
}

# The "optimizer__" prefix routes these values to the optimizer of the wrapped Keras model.
nngrid = dict(optimizer__learning_rate=[0.1, 0.05, 0.01], optimizer__momentum=[0.8, 0.6, 0.4])
nn = { "name": "Neural Net",
    "object": KerasRegressor(model=create_model, loss='mean_squared_error',
                        optimizer=tensorflow.keras.optimizers.legacy.SGD,
                          epochs=100, batch_size=10, verbose=0),
    "paramsGrid": nngrid,
    "requires_feature_selection": True,
    "requires_scalling": True,
    "needs_outliers_handling": True
}

# Order in which the models are trained and reported.
models = [lr,ridge,lasso,rf,svr,boost,nn]


In [24]:
def modeling(df,modelDict,seed=1,threshold_for_selection=.3,test_size=.3,cv_splits=5,target='ViolentCrimesPerPop'):
    """Tune one model with a cross-validated grid search and score it on a hold-out set.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the predictors and the ``target`` column.
    modelDict : dict
        Model description. The keys read here are ``"name"``, ``"object"``
        (unfitted estimator), ``"paramsGrid"`` and
        ``"requires_feature_selection"``. The ``"requires_scalling"`` and
        ``"needs_outliers_handling"`` flags are NOT consulted by this
        function: no scaling or outlier treatment is applied.
    seed : int
        Random state for the train/test split and the K-fold shuffling.
    threshold_for_selection : float
        When feature selection is enabled, only columns whose correlation
        with the target is strictly greater than this value are kept.
    test_size : float
        Hold-out proportion passed to ``train_test_split``.
    cv_splits : int
        Number of folds used by the grid search.
    target : str
        Name of the column to predict.

    Returns
    -------
    list
        ``[name, mse, mae, r2, best_params, best_estimator]`` where the three
        metrics are computed on the hold-out set.

    Raises
    ------
    ValueError
        If feature selection is enabled and no predictor passes the threshold.
    """
    #Feature selection
    if modelDict["requires_feature_selection"]:
        # NOTE(review): the correlations are computed on the full frame, i.e.
        # before the train/test split, so hold-out rows influence which
        # features are kept. Only positive correlations pass the filter;
        # strongly negative predictors are discarded.
        corr = df.corr()[target].sort_values(ascending=False)
        selected = corr[corr > threshold_for_selection].index.drop(target, errors="ignore")
        if selected.empty:
            raise ValueError(
                f"No feature has a correlation with '{target}' above {threshold_for_selection}"
            )
        #Creates a new dataframe with the selected columns (plus the target)
        df = df[[*selected, target]]

    X = df.drop(columns=target)
    Y = df[target]

    #Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=test_size,random_state=seed)

    #Kfold object
    kfold = KFold(n_splits=cv_splits, random_state=seed, shuffle=True)

    #Grid Search
    # With an empty grid there is a single candidate (the estimator as
    # configured); it is still cross-validated and then refit on X_train.
    gridSearch = GridSearchCV(estimator=modelDict["object"],param_grid=modelDict["paramsGrid"],cv=kfold,scoring="r2")
    gridSearch.fit(X_train, y_train)

    #Getting best model
    bestModel = gridSearch.best_estimator_

    #Predictions
    y_pred = bestModel.predict(X_test)

    return [
        modelDict['name'],
        mean_squared_error(y_test, y_pred),
        mean_absolute_error(y_test, y_pred),
        r2_score(y_test, y_pred),
        gridSearch.best_params_,
        bestModel,
    ]

In [29]:
#Main
# Train every registered model, timing each run, and gather one result row per model.

rows = []
for model in models:
    print(f'Making {model["name"]} model...')
    # perf_counter is monotonic, so the measured duration cannot be skewed by clock adjustments.
    start = time.perf_counter()
    row = modeling(df,modelDict=model)
    delta = round(time.perf_counter() - start, 3)

    rows.append([*row, delta])

# Build the frame once instead of growing it row by row.
results = pd.DataFrame(rows, columns=['model','mse', 'mae', 'r2','hiperparameters','modelObject','duration'])
times = results["duration"].tolist()

# Display everything except the fitted estimator objects.
results[[col for col in results.columns if col != "modelObject"]]

Making Linear Regression model...
Making Ridge model...
Making Lasso model...
Making Random Forest model...
Making Support Vector Regressor model...
Making XGBoost model...
Making Neural Net model...


Unnamed: 0,model,mse,mae,r2,hiperparameters,duration
0,Linear Regression,0.019465,0.09389,0.625821,{},0.02
1,Ridge,0.019681,0.094854,0.621684,{'alpha': 10},0.111
2,Lasso,0.01949,0.093793,0.625353,{'alpha': 0.0001},0.113
3,Random Forest,0.020273,0.097474,0.610289,"{'max_depth': 4, 'min_samples_split': 4, 'n_estimators': 500}",73.19
4,Support Vector Regressor,0.01946,0.09281,0.625919,"{'C': 2900, 'gamma': 0.001, 'kernel': 'rbf'}",23.656
5,XGBoost,0.019073,0.091634,0.633357,"{'colsample_bytree': 0.5, 'max_depth': 3, 'n_estimators': 20}",1.996
6,Neural Net,0.019925,0.098403,0.616984,"{'optimizer__learning_rate': 0.01, 'optimizer__momentum': 0.4}",134.446
