# Data modeling #

In [94]:
#Libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
#Feature selection
from sklearn.feature_selection import RFE

#Lineal regression
from sklearn.linear_model import LinearRegression
#Ridge
from sklearn.linear_model import Ridge, Lasso
#Decission trees
from sklearn.tree import DecisionTreeRegressor
#Random forest
from sklearn.ensemble import RandomForestRegressor
#Support Vector Machine (SVR)
from sklearn.svm import SVR
#Neural Nets
import keras
from keras.models import Sequential
from keras.layers import Dense,Dropout
from scikeras.wrappers import KerasRegressor
from keras.optimizers import SGD, Adam
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

#XGboost
import xgboost as xgb

#Performance metrics
from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_error, r2_score,accuracy_score

#Cross validation and train-test split
from sklearn.model_selection import train_test_split,cross_val_score, cross_val_predict, KFold

#Grid Search
from sklearn.model_selection import GridSearchCV

In [39]:
df = pd.read_csv('cleanedData.csv')
df = df[df.columns[1:]]
df.head(5)

Unnamed: 0,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,agePct12t29,agePct16t24,agePct65up,...,PctSameHouse85,PctSameCity85,PctSameState85,RacialMatchCommPol,LandArea,PopDens,PctUsePubTrans,LemasGangUnitDeploy,LemasPctOfficDrugUn,ViolentCrimesPerPop
0,0.19,0.33,0.02,0.9,0.12,0.17,0.34,0.47,0.29,0.32,...,0.5,0.51,0.64,0.94,0.12,0.26,0.2,0.5,0.32,0.2
1,0.0,0.16,0.12,0.74,0.45,0.07,0.26,0.59,0.35,0.27,...,0.34,0.6,0.52,0.0,0.02,0.12,0.45,0.0,0.0,0.67
2,0.0,0.42,0.49,0.56,0.17,0.04,0.39,0.47,0.28,0.32,...,0.54,0.67,0.56,0.0,0.01,0.21,0.02,0.0,0.0,0.43
3,0.04,0.77,1.0,0.08,0.12,0.1,0.51,0.5,0.34,0.21,...,0.73,0.64,0.65,0.0,0.02,0.39,0.28,0.0,0.0,0.12
4,0.01,0.55,0.02,0.95,0.09,0.05,0.38,0.38,0.23,0.36,...,0.64,0.61,0.53,0.0,0.04,0.09,0.02,0.0,0.0,0.03


In [92]:
#Models

lr = {"name":"Linear Regression",
      "object": LinearRegression(),
      "paramsGrid": {},
      "requires_feature_selection": True,
      "requires_scalling":True
}

randomForest = {"name":"Random Forest",
      "object": RandomForestRegressor(),
      "paramsGrid": {},
      "requires_feature_selection": True,
      "requires_scalling":True
}

ridge = {"name":"Ridge",
      "object": Ridge(),
      "paramsGrid": {'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20,30,35,40,45,50,55,100]},
      "requires_feature_selection": True,
      "requires_scalling":True
}

lasso = {"name":"Lasso",
      "object": Lasso(max_iter=10000),
      "paramsGrid": {'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20,30,35,40,45,50,55,100]},
      "requires_feature_selection": False,
      "requires_scalling":True
}

SVR = {"name":"Support Vector Regressor",
      "object": LinearRegression(),
      "paramsGrid": {},
      "requires_feature_selection": True,
      "requires_scalling":True
}

nn = {"name":"Neural Net",
      "object": LinearRegression(),
      "paramsGrid": {},
      "requires_feature_selection": True,
      "requires_scalling":True
}

xgboost = {"name":"XGBoost",
      "object": LinearRegression(),
      "paramsGrid": {},
      "requires_feature_selection": True,
      "requires_scalling":True
}


models = [lr,ridge,lasso]

In [98]:
def modeling(df,modelDict,seed=1,threshold_for_selection=.3,test_size=.3,cv_splits=5):
    
    #Feature selection
    if modelDict["requires_feature_selection"] == True:
        corr = df.corr()["ViolentCrimesPerPop"].sort_values(ascending=False)
        #Creates a new dataframe with the selected columns
        df = df[corr[corr > threshold_for_selection].index] 
    
    #X and Y
    X = df.drop('ViolentCrimesPerPop', axis=1)
    Y = df['ViolentCrimesPerPop']

   

    #Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=test_size,random_state=seed)


    #Scalling
    if modelDict["requires_scalling"] == True:
        sc = StandardScaler()
        X_train, X_test = sc.fit_transform(X_train), sc.fit_transform(X_test)
    

    #Kfold object
    kfold = KFold(n_splits=cv_splits, random_state=seed, shuffle=True)

    #Grid Search
    gridSearch = GridSearchCV(estimator=modelDict["object"],param_grid=modelDict["paramsGrid"],cv=kfold,scoring="r2") #If grid search is empty it doesn't do CV, just kfolds
    gridSearch.fit(X_train, y_train)

    #Getting best model
    bestModel = gridSearch.best_estimator_

    #Predictions 
    y_pred = bestModel.predict(X_test)

    #Scoring
    scores = {'name': modelDict['name'],
              'mse':mean_squared_error(y_test, y_pred),
              'mae':mean_absolute_error(y_test, y_pred),
              'r2':r2_score(y_test, y_pred),}
              #'accuracy':accuracy_score(y_test, y_pred)}
    

    return [modelDict['name'],mean_squared_error(y_test, y_pred),mean_absolute_error(y_test, y_pred),r2_score(y_test, y_pred),gridSearch.best_params_]

In [99]:
#Main

results = pd.DataFrame(columns=['model','mse', 'mae', 'r2','hiperparameters'])

for model in models:

    row = modeling(df,modelDict=model)
    results.loc[len(results)] = row

results

Unnamed: 0,model,mse,mae,r2,hiperparameters
0,Linear Regression,0.019494,0.094724,0.625266,{}
1,Ridge,0.019496,0.094703,0.625226,{'alpha': 30}
2,Lasso,0.018796,0.093348,0.638681,{'alpha': 0.001}
