# Data modeling #

In [29]:
#Libraries
import numpy as np 
import pandas as pd 
import tensorflow
import time

#Models  
from sklearn.svm import SVR
import xgboost as xgb
from scikeras.wrappers import KerasRegressor
from keras.models import Sequential
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import LinearRegression
import keras
from keras.models import Sequential
from keras.layers import Dense,Dropout


#Performance metrics
from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_error, r2_score,accuracy_score

#Cross validation, train-test split and kfold
from sklearn.model_selection import train_test_split,cross_val_score, cross_val_predict, KFold

#Grid Search
from sklearn.model_selection import GridSearchCV

In [30]:
pd.set_option('display.max_colwidth', None)  # For Pandas version < 1.0.0, use -1 instead of None


In [31]:
df = pd.read_csv('cleanedData.csv')
df = df[df.columns[1:]]


In [32]:
seed = 1

In [33]:
def create_model():
    
    # create model
    model = Sequential()
    model.add(Dense(5, input_shape=(28,), activation='tanh'))
    model.add(Dense(1, activation='linear'))
    return model

tensorflow.random.set_seed(seed)


In [34]:
#Models

lr = {"name":"Linear Regression",
      "object": LinearRegression(),
      "paramsGrid": {},
      "requires_feature_selection": True,
      "requires_scalling":True,
      "needs_outliers_handling":True
}

rf = {"name":"Random Forest",
      "object": RandomForestRegressor(criterion='squared_error',
                               min_samples_leaf=3,
                               bootstrap=True,
                               oob_score=False,
                               random_state=seed,
                               verbose=0),
      "paramsGrid": {'max_depth': [3,4,5], 'min_samples_split': [3,4,5],'n_estimators': [300,500,800]},
      "requires_feature_selection": True,
      "requires_scalling":True,
      "needs_outliers_handling":True
}

ridge = {"name":"Ridge",
      "object": Ridge(),
      "paramsGrid": {'alpha':[10,20,29,29,30,31,32,33,35,40,45,50]},
      "requires_feature_selection": True,
      "requires_scalling":True,
      "needs_outliers_handling":True
}

lasso = {"name":"Lasso",
      "object": Lasso(max_iter=10000),
      "paramsGrid": {'alpha':[1e-4,1e-3,1e-2,1,5,10,20,30,35]},
      "requires_feature_selection": False,
      "requires_scalling":True,
      "needs_outliers_handling":True
}

svr = {
    "name": "Support Vector Regressor",
    "object": SVR(),
    "paramsGrid": [{'kernel': ['rbf'], 'gamma': [.0009,.001,.005],'C': [1500,2900,4000]},{'kernel': ['linear'], 'C': [1, 10, 100, 1000]}],
    "requires_feature_selection": True,
    "requires_scalling": True,
    "needs_outliers_handling": True
}


boost = {"name":"XGBoost",
      "object": xgb.XGBRegressor(objective='reg:squarederror', seed=seed),
      "paramsGrid": {'colsample_bytree': [0.1,0.3,0.5], 'n_estimators':[10,15,17,20], 'max_depth': [2,3,4,5]}, 
      "requires_feature_selection": True,
      "requires_scalling":True,
      "needs_outliers_handling":True
}

nngrid = dict(optimizer__learning_rate=[0.1, 0.05, 0.01], optimizer__momentum=[0.8, 0.6, 0.4])
nn = { "name": "Neural Net",
    "object": KerasRegressor(model=create_model, loss='mean_squared_error',
                        optimizer=tensorflow.keras.optimizers.legacy.SGD,
                          epochs=100, batch_size=10, verbose=0),
    "paramsGrid": nngrid,
    "requires_feature_selection": True,
    "requires_scalling": True,
    "needs_outliers_handling": True
}

models = [nn,rf,lr,ridge,lasso,svr,boost]



In [35]:
def modeling(df,modelDict,seed=1,threshold_for_selection=.3,test_size=.3,cv_splits=5):
    
    #Feature selection
    if modelDict["requires_feature_selection"] == True:
        corr = df.corr()["ViolentCrimesPerPop"].sort_values(ascending=False)
        #Creates a new dataframe with the selected columns
        df = df[corr[corr > threshold_for_selection].index]

    

    X = df.drop('ViolentCrimesPerPop', axis=1)
    Y = df['ViolentCrimesPerPop']

   

    #Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=test_size,random_state=seed)

    #Kfold object
    kfold = KFold(n_splits=cv_splits, random_state=seed, shuffle=True)

    #Grid Search
    gridSearch = GridSearchCV(estimator=modelDict["object"],param_grid=modelDict["paramsGrid"],cv=kfold,scoring="r2") #If grid search is empty it doesn't do CV, just kfolds
    gridSearch.fit(X_train, y_train)

    #Getting best model
    bestModel = gridSearch.best_estimator_

    #Predictions 
    y_pred = bestModel.predict(X_test)
    

    return [modelDict['name'],mean_squared_error(y_test, y_pred),mean_absolute_error(y_test, y_pred),r2_score(y_test, y_pred),gridSearch.best_params_,bestModel]

In [36]:
#Main

results = pd.DataFrame(columns=['model','mse', 'mae', 'r2','hiperparameters','modelObject'])
times = []
for model in models:
    print(f'Making {model["name"]} model...')
    start = time.time()
    row = modeling(df,modelDict=model)

    end = time.time()
    delta = round(end-start,3)
    times.append(delta)

    results.loc[len(results)] = row

results["duration"] = times

results[[col for col in results.columns if col != "modelObject"]].sort_values(by='r2')

Making Neural Net model...
Making Random Forest model...
Making Linear Regression model...
Making Ridge model...
Making Lasso model...
Making Support Vector Regressor model...
Making XGBoost model...


Unnamed: 0,model,mse,mae,r2,hiperparameters,duration
3,Ridge,0.019681,0.094854,0.621684,{'alpha': 10},0.134
1,Random Forest,0.019552,0.094677,0.624164,"{'max_depth': 5, 'min_samples_split': 3, 'n_estimators': 500}",183.763
0,Neural Net,0.019533,0.096468,0.624519,"{'optimizer__learning_rate': 0.01, 'optimizer__momentum': 0.8}",132.578
2,Linear Regression,0.019465,0.09389,0.625821,{},0.036
5,Support Vector Regressor,0.01946,0.09281,0.625919,"{'C': 2900, 'gamma': 0.001, 'kernel': 'rbf'}",23.946
6,XGBoost,0.019073,0.091634,0.633357,"{'colsample_bytree': 0.5, 'max_depth': 3, 'n_estimators': 20}",2.113
4,Lasso,0.0188,0.093173,0.638601,{'alpha': 0.0001},0.409


### Lasso is the best model, so we'll predict with completely new random data ###

In [37]:
X = df.drop('ViolentCrimesPerPop', axis=1)

In [38]:
np.random.seed(seed)
newX = pd.DataFrame(np.random.rand(X.shape[0]//100, X.shape[1]))
newX.head(4)



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,92,93,94,95,96,97,98,99,100,101
0,0.417022,0.720324,0.000114,0.302333,0.146756,0.092339,0.18626,0.345561,0.396767,0.538817,...,0.449912,0.57839,0.408137,0.237027,0.90338,0.573679,0.00287,0.617145,0.326645,0.527058
1,0.885942,0.35727,0.908535,0.62336,0.015821,0.929437,0.690897,0.997323,0.172341,0.137136,...,0.907816,0.931972,0.013952,0.234362,0.616778,0.949016,0.950176,0.556653,0.915606,0.641566
2,0.390008,0.485991,0.60431,0.549548,0.926181,0.918733,0.394876,0.963263,0.173956,0.12633,...,0.377924,0.079626,0.982817,0.181613,0.811859,0.874962,0.688413,0.569494,0.160971,0.46688
3,0.345172,0.22504,0.592512,0.31227,0.916306,0.909636,0.257118,0.110891,0.192963,0.499584,...,0.511141,0.540952,0.959434,0.803961,0.032323,0.709387,0.465001,0.947549,0.221433,0.267072


In [39]:
#best model
model = results.loc[results.model=="Lasso","modelObject"].values[0]

This would be the crime per capita with the given new X

In [40]:
model.predict(newX)



array([0.37315167, 0.53604424, 0.35497707, 0.44013775, 0.45916804,
       0.59802373, 0.68405031, 0.78958676, 0.39139749, 0.74999735,
       0.39723091, 0.5382108 , 0.6505681 , 0.65831054, 0.69424443,
       0.79345546, 0.8160518 , 0.55155305, 0.27494148])