# Data modeling #

In [1]:
#Libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
#Feature selection
from sklearn.feature_selection import RFE

#Lineal regression
from sklearn.linear_model import LinearRegression
#Ridge
from sklearn.linear_model import Ridge, Lasso
#Decission trees
from sklearn.tree import DecisionTreeRegressor
#Random forest
from sklearn.ensemble import RandomForestRegressor
#Support Vector Machine (SVR)
from sklearn.svm import SVR
#Neural Nets
import keras
from keras.models import Sequential
from keras.layers import Dense,Dropout
from scikeras.wrappers import KerasRegressor
from keras.optimizers import SGD, Adam
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

#XGboost
import xgboost as xgb

#Performance metrics
from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_error, r2_score,accuracy_score

#Cross validation and train-test split
from sklearn.model_selection import train_test_split,cross_val_score, cross_val_predict, KFold

#Grid Search
from sklearn.model_selection import GridSearchCV

In [12]:
df = pd.read_csv('cleanedData.csv')
df = df[df.columns[1:]]


# Outliers

In [13]:
df.shape

(1994, 103)

In [14]:
outliersPercentage = {}

for column in df.select_dtypes(include=[np.number]):
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Using .loc to find outliers
        outliers = df.loc[(df[column] < lower_bound) | (df[column] > upper_bound), column]
        outliersPercentage[column] = round(outliers.size/df.size,2)

pd.DataFrame(outliersPercentage)

ValueError: If using all scalar values, you must pass an index

In [3]:
seed = 1

In [4]:
#Models

lr = {"name":"Linear Regression",
      "object": LinearRegression(),
      "paramsGrid": {},
      "requires_feature_selection": True,
      "requires_scalling":True,
      "needs_outliers_handling":True
}

rf = {"name":"Random Forest",
      "object": RandomForestRegressor(criterion='squared_error',
                               min_samples_leaf=2,
                               bootstrap=True,
                               oob_score=False,
                               random_state=seed,
                               verbose=0),
      "paramsGrid": {'max_depth': range(1, 11), 'min_samples_split': range(10, 60, 10),'n_estimators': range(1,100,10)},
      "requires_feature_selection": True,
      "requires_scalling":True,
      "needs_outliers_handling":True
}

ridge = {"name":"Ridge",
      "object": Ridge(),
      "paramsGrid": {'alpha':[10,20,29,29,30,31,32,33,35,40,45,50]},
      "requires_feature_selection": True,
      "requires_scalling":True,
      "needs_outliers_handling":True
}

lasso = {"name":"Lasso",
      "object": Lasso(max_iter=10000),
      "paramsGrid": {'alpha':[1e-4,1e-3,1e-2,1,5,10,20,30,35]},
      "requires_feature_selection": False,
      "requires_scalling":True,
      "needs_outliers_handling":True
}

SVR = {"name":"Support Vector Regressor",
      "object": LinearRegression(),
      "paramsGrid": {},
      "requires_feature_selection": True,
      "requires_scalling":True,
      "needs_outliers_handling":True
}

nn = {"name":"Neural Net",
      "object": LinearRegression(),
      "paramsGrid": {},
      "requires_feature_selection": True,
      "requires_scalling":True,
      "needs_outliers_handling":True
}

xgb = {"name":"XGBoost",
      "object": LinearRegression(),
      "paramsGrid": {},
      "requires_feature_selection": True,
      "requires_scalling":True,
      "needs_outliers_handling":True
}


models = [lr,rf]

In [5]:
def modeling(df,modelDict,seed=1,threshold_for_selection=.3,test_size=.3,cv_splits=5):
    
    #Feature selection
    if modelDict["requires_feature_selection"] == True:
        corr = df.corr()["ViolentCrimesPerPop"].sort_values(ascending=False)
        #Creates a new dataframe with the selected columns
        df = df[corr[corr > threshold_for_selection].index] 
    
    #X and Y
    X = df.drop('ViolentCrimesPerPop', axis=1)
    Y = df['ViolentCrimesPerPop']

   

    #Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=test_size,random_state=seed)


    #Scalling
    if modelDict["requires_scalling"] == True:
        sc = StandardScaler()
        X_train, X_test = sc.fit_transform(X_train), sc.fit_transform(X_test)
    

    #Kfold object
    kfold = KFold(n_splits=cv_splits, random_state=seed, shuffle=True)

    #Grid Search
    gridSearch = GridSearchCV(estimator=modelDict["object"],param_grid=modelDict["paramsGrid"],cv=kfold,scoring="r2") #If grid search is empty it doesn't do CV, just kfolds
    gridSearch.fit(X_train, y_train)

    #Getting best model
    bestModel = gridSearch.best_estimator_

    #Predictions 
    y_pred = bestModel.predict(X_test)

    #Scoring
    scores = {'name': modelDict['name'],
              'mse':mean_squared_error(y_test, y_pred),
              'mae':mean_absolute_error(y_test, y_pred),
              'r2':r2_score(y_test, y_pred),}
              #'accuracy':accuracy_score(y_test, y_pred)}
    

    return [modelDict['name'],mean_squared_error(y_test, y_pred),mean_absolute_error(y_test, y_pred),r2_score(y_test, y_pred),gridSearch.best_params_]

In [6]:
#Main

results = pd.DataFrame(columns=['model','mse', 'mae', 'r2','hiperparameters'])

for model in models:

    row = modeling(df,modelDict=model)
    results.loc[len(results)] = row

results

KeyboardInterrupt: 