# Deep Learning Pipeline Template (KNN & SVM)

In [1]:
import os
# NOTE(review): hard-coded absolute path to one developer's checkout. The relative
# data_path defined further down presumably relies on this working directory,
# so this line has to be adapted on every other machine.
os.chdir('/Users/mikelgallo/repos2/DeepL_test')

In [2]:
# imports
#Data Manipulation
import pandas as pd
# numpy must be aliased as np: aliasing it as pd would rebind pd to numpy,
# breaking pd.read_csv / pd.DataFrame and leaving np undefined.
import numpy as np

#Model fitting, performance, balancing
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors 
from sklearn.model_selection import train_test_split
from Functions.helper_functions import *
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler

In [3]:
#Defining the data folder (relative to the current working directory)
#Absolute alternative, kept for reference:
#data_path = '/Users/mikelgallo/repos2/DeepL_test/Session 1_ Nearest Neighbors/'
data_path = './Session 1_ Nearest Neighbors/data/'

In [10]:
# Load housing.csv from data_path into a DataFrame
# (read the documentation about the data to understand each column)
df = pd.read_csv(data_path+'housing.csv')

In [14]:
#High-level view: (rows, columns) followed by the first five rows
print(df.shape)
df.head()

(506, 14)


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


## PREPROCESSING

### STEP 1 - Split columns

### Numeric columns (Continuous vs Discrete) Rough estimation

In [46]:
#STEP 1 - Split columns into Continuous, Discrete, Categorical
def numeric_cols(dataset):
    """Return the names of the numeric columns of ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    list
        Column labels with a numeric dtype, in their original order.
    """
    # "number" is pandas' string spelling of np.number, so this helper does not
    # depend on a numpy alias being present in the notebook namespace.
    return dataset.select_dtypes(include="number").columns.tolist()
#STEP 2 - Rough Estimate of Continuous and Discrete columns
def cont_or_disc(dataset,num):
    """Summarise every numeric column and label it 'Discrete' or 'Continuous'.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to analyse; non-numeric columns are ignored.
    num : int
        Threshold on the number of distinct values: a column with strictly
        fewer than ``num`` distinct values is labelled 'Discrete', otherwise
        'Continuous'.

    Returns
    -------
    pandas.DataFrame
        One row per numeric column (indexed by column name) with the columns
        'count' (non-null values), 'unique_vals', 'type' and 'dtype'.
    """
    # Same column selection as numeric_cols(dataset), done inline.
    df_numeric = dataset.select_dtypes(include="number")
    # Local names deliberately avoid shadowing the builtins `dict` and `type`.
    summary = {}
    for col in df_numeric.columns:
        series = df_numeric[col]
        # dropna=False keeps NaN as a distinct value, matching len(series.unique()).
        unique_vals = series.nunique(dropna=False)
        kind = 'Discrete' if unique_vals < num else 'Continuous'
        summary[col] = [series.count(), unique_vals, kind, series.dtype]
    return pd.DataFrame.from_dict(
        summary, orient='index', columns=['count', 'unique_vals', 'type', 'dtype']
    )
#STEP 3 - Return column names for discrete and continuous
def numeric_col_split(df,num):
    """Split the numeric columns of ``df`` into discrete and continuous names.

    The labels come from ``cont_or_disc(df, num)``.

    Returns
    -------
    tuple(list, list)
        ``(disc_cols, cont_cols)``, each in the order reported by
        ``cont_or_disc``.
    """
    labels = cont_or_disc(df,num)['type']
    disc_cols = labels[labels == 'Discrete'].index.tolist()
    cont_cols = labels[labels == 'Continuous'].index.tolist()
    return disc_cols, cont_cols

In [48]:
# Review Numeric Columns: columns with fewer than 10 distinct values are labelled Discrete
cont_or_disc(df,10)

Unnamed: 0,count,unique_vals,type,dtype
crim,506,504,Continuous,float64
zn,506,26,Continuous,float64
indus,506,76,Continuous,float64
chas,506,2,Discrete,int64
nox,506,81,Continuous,float64
rm,506,446,Continuous,float64
age,506,356,Continuous,float64
dis,506,412,Continuous,float64
rad,506,9,Discrete,int64
tax,506,66,Continuous,int64


In [49]:
# Split Discrete and Continuous variables (same threshold of 10 distinct values as the review above)
disc_cols, cont_cols = numeric_col_split(df,10)
print('Discrete columns: ',disc_cols)
print('Continuous columns: ', cont_cols)

Discrete columns:  ['chas', 'rad']
Continuous columns:  ['crim', 'zn', 'indus', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio', 'b', 'lstat', 'medv']


In [50]:
#Review Discrete variables: summary statistics rounded to whole numbers
df[disc_cols].describe().round(0)

Unnamed: 0,chas,rad
count,506.0,506.0
mean,0.0,10.0
std,0.0,9.0
min,0.0,1.0
25%,0.0,4.0
50%,0.0,5.0
75%,0.0,24.0
max,1.0,24.0


In [54]:
#Dependent variable: frequency of each 'chas' value
print(df['chas'].value_counts())
#Categorical variable: frequency of each 'rad' value
print(df['rad'].value_counts())

chas
0    471
1     35
Name: count, dtype: int64
rad
24    132
5     115
4     110
3      38
6      26
2      24
8      24
1      20
7      17
Name: count, dtype: int64


#### Categorical columns

In [55]:
#Non-numeric (categorical) columns = every column that numeric_cols() does not report
numerics = numeric_cols(df)
non_numeric = [col for col in df.columns if col not in numerics]

In [56]:
# Display the non-numeric column names (an empty list means every column is numeric)
non_numeric

[]

#### Create a list to store the prediction results

In [64]:
#Store tuples with a description and the score of each model
results = []

#For each model we need to create different copies of the train Dataframe

### STEP 2 - Imputing/Removing Missing Values


In [61]:
# Fraction of missing values per column (0.0 = no missing values)
df.isna().mean()

crim       0.0
zn         0.0
indus      0.0
chas       0.0
nox        0.0
rm         0.0
age        0.0
dis        0.0
rad        0.0
tax        0.0
ptratio    0.0
b          0.0
lstat      0.0
medv       0.0
dtype: float64

###### Removing missing values

In [None]:
# To drop all columns with X% of missing values:
#df=df.dropna(axis=1,thresh=0.6*df.shape[0])
# To drop a specific column then: 
#df = df.drop(['column'],axis=1)

#If we want to remove rows with at least 1 missing value
#df = df.dropna()
# E.g. -> X_train0 = X_train.dropna(axis=0,how = "any") # axis=0 removes "any" rows with missing data


#If we want to remove columns with at least 1 missing value
#df = df.dropna(axis = 1)

###### Imputing missing values

In [None]:
####define a list of columns to impute
#missing_cols = df.columns
####Iterate over the list of missing columns to impute both train and test
#for col  in missing_cols:
    #mean = df[col].mean()
    #df_train[col].fillna(mean,inplace=True)
    #df_test[col].fillna(mean,inplace=True)


#Stratified imputation
#my_data[my_col].fillna(my_data.groupby(my_category)[my_col].transform("mean"), inplace=True)


### STEP 3 - Encoding / Creating Dummies and FEATURE ENGINEERING

In [1]:
# One-hot encode the selected categorical columns
def encode(df, columns_hot):
    """Return a copy of ``df`` with ``columns_hot`` replaced by dummy columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns_hot : list
        Names of the columns to expand into indicator columns.

    Returns
    -------
    pandas.DataFrame
        Frame where each listed column is replaced by float-typed indicator
        columns; all other columns are left as they are.
    """
    return pd.get_dummies(df, columns=columns_hot, dtype=float)

# Execute function for df and df_test
#df = pd.get_dummies(df, columns=cat_feat, drop_first=False)
#E.g. df = encode(df, ['orientation', 'neighborhood'])
#df = encode(df, ['Wilderness_Area','Soil_Type'])
#df

In [2]:
# Reformating if needed:
#df['is_cover_7'] = np.where(df['Cover_Type'] == 7, 1, 0)

## DATASET Preparation

In [None]:
#Spliting data
from sklearn.model_selection import train_test_split
# In case we use directly a Train and Test datatset
#Xtrain = df_train.drop(['dependent'],axis = 1)
#ytrain = df_train['dependent']
#Xtest = df_test #probably it doesnt require to remove any column

# In case we want to split again our train data in order to measure the performance of our models.
#X_train,X_test,y_train,y_test = train_test_split(Xtrain,ytrain,test_size = 0.2,random_state=34)

## Scaling (Full Train)

In [None]:
from sklearn.preprocessing import StandardScaler
# Template for scaling the full train/test pair; uncomment to use.
# The scaler is fitted on the train data only and then applied to both sets.
#scaler2 = StandardScaler()
#scaler2.fit(Xtrain[col_cont])

#Xtrain[col_cont] = scaler2.transform(Xtrain[col_cont])
#Xtest[col_cont] = scaler2.transform(Xtest[col_cont])

### Scaling (Split Train)

In [None]:
from sklearn.preprocessing import StandardScaler
# Defining continuous variables:
#no_cont = ['Index','Soil_Type']
#col_cont= [col for col in columns_cont if col not in no_cont] #Columns cont was defined previously
# NOTE(review): col_cont, X_train and X_test are only assigned in commented-out
# template lines; they must be defined before this cell can run. The list built
# earlier by numeric_col_split is called cont_cols, not columns_cont.

scaler = StandardScaler()
#We will only fit our scaler object with continuous variables (train split only)
scaler.fit(X_train[col_cont])

# Transforming the continuous variables of both splits with the train-fitted scaler;
# the scaled values are written back into X_train / X_test
X_train[col_cont] = scaler.transform(X_train[col_cont])
X_test[col_cont] = scaler.transform(X_test[col_cont])

## Running KNN

In [None]:
# Step 1 - Create an instance of the KNN model
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier

KNN_model  = KNeighborsClassifier(n_neighbors=20, algorithm = 'brute')
# NOTE(review): Xtrain / ytrain / Xtest are only assigned in the commented-out lines
# of the "DATASET Preparation" cell; define them before running this cell.
KNN_model.fit(Xtrain,ytrain)

# Step 2 - Predict the class of every row in Xtest
print(f"Predicted class is ",KNN_model.predict(Xtest))

## Check which other algorithms we should try

## Hypertuning KNN

In [None]:
# Step 1 - Create an instance of the KNN model
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier

KNN_model  = KNeighborsClassifier(algorithm = 'brute')

# Step 2 - Create our grid_search_values (6 neighbour counts x 2 weighting schemes)
grid_values = {'n_neighbors': [5, 15, 25, 30, 35, 50], 'weights': ['uniform','distance']}

# Step 3 - Instantiate our GridSearchCV
# NOTE: despite the "_acc" suffix, candidates are scored with ROC AUC over 20 CV folds
grid_knn_acc = GridSearchCV(KNN_model, param_grid = grid_values, scoring='roc_auc',cv=20)

#Step 4 - Fit the search on the training split
grid_knn_acc.fit(X_train,y_train)

In [None]:
# Report best Number of Neighbors
# NOTE(review): GridSearch_table_plot is not defined in this notebook; presumably it
# comes from the star import of Functions.helper_functions - verify.
GridSearch_table_plot(grid_knn_acc, "n_neighbors", negative=False, display_all_params=False)

In [None]:
#Best Result of the KNN grid search (best_score_ is a ROC AUC, see scoring='roc_auc')
print('best parameters:', grid_knn_acc.best_params_)
print('best score:', grid_knn_acc.best_score_)

In [None]:
#Calculating Performance Scores
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
#InSample (Train): predictions on the same rows the grid search was fitted on
insample_y_predict = grid_knn_acc.predict(X_train)

print('the accuracy score for best estimator: ', accuracy_score(y_train,insample_y_predict))
print('the precision score for best estimator: ', precision_score(y_train,insample_y_predict))
print('the recall score for best estimator: ', recall_score(y_train,insample_y_predict))
print('the f1_score score for best estimator: ', f1_score(y_train,insample_y_predict))

In [None]:
#Out of Sample (Test): same four metrics as the in-sample cell, computed on X_test / y_test
y_predict = grid_knn_acc.predict(X_test)

print('the accuracy score for best estimator: ', accuracy_score(y_test,y_predict))
print('the precision score for best estimator: ', precision_score(y_test,y_predict))
print('the recall score for best estimator: ', recall_score(y_test,y_predict))
print('the f1_score score for best estimator: ', f1_score(y_test,y_predict))

### Running SVM

In [None]:
# Create a Support Vector Machine classifier with a linear kernel
# SVC is not imported anywhere else in the notebook, so bring it (and the metric) in here.
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

# probability=True is kept so that the predict_proba alternative below stays available
classifier = SVC(kernel='linear', probability=True)

# Train the classifier on the training set.
# The scaling step writes the scaled values back into X_train, so X_train is the
# scaled matrix; no separate X_train_scaled variable exists in this notebook.
classifier.fit(X_train, y_train)

# Obtain decision values on the training set; decision_function returns a
# continuous score per row, which is what roc_auc_score needs
y_train_scores = classifier.decision_function(X_train)
# Alternatively, use the positive-class probability (requires probability=True)
# y_train_probs = classifier.predict_proba(X_train)[:, 1]

# Calculate AUC score on the training set
auc_score_train = roc_auc_score(y_train, y_train_scores)

# Print or use the AUC score as needed
print("AUC score on the training set:", auc_score_train)

### Hypertuning SVM

In [None]:
# SVR / SVC are not imported anywhere else in the notebook, so bring them in here.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC, SVR

#Target is continuous
# NOTE(review): X and y are not defined in this notebook; set them to the
# feature matrix and target in use before running.
MySvr = SVR()
grid_values = {'C':[0.01, 0.1, 1, 10], 'epsilon':[0.1, 0.5, 0.75], 'kernel':['linear', 'poly', 'rbf']}
grid_svr = GridSearchCV(MySvr, param_grid=grid_values, scoring='r2', cv=5, n_jobs=7)
grid_model = grid_svr.fit(X, y)

## Classification
my_SVM_model = SVC()
# gamma is ignored by the linear kernel, so it is only searched for the rbf
# kernel; this removes the duplicated linear fits without changing the set of
# distinct models that are evaluated.
grid_values = [
    {'kernel': ['linear'], 'C': [0.1, 0.2, 0.3, 0.5, 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 0.2, 0.3, 0.5, 1, 10], 'gamma': [0.25, 0.5, 0.75, 1, 1.25, 1.5]},
]
# NOTE: this search is only configured here; call grid_svc_acc.fit(...) to run it.
grid_svc_acc = GridSearchCV(my_SVM_model, param_grid = grid_values,scoring = 'roc_auc', cv=5)

In [None]:
#Plotting Results of the SVR grid search by kernel
# NOTE(review): GridSearch_table_plot is not defined in this notebook; presumably it
# comes from the star import of Functions.helper_functions - verify.
GridSearch_table_plot(grid_model, "kernel", negative=False, display_all_params=False)

In [None]:
# report best hyperparameters of the SVR search (read from its best_estimator_)
print('Best Cost parameter : '+ str(grid_model.best_estimator_.C))
print('Best epsilon parameter : '+ str(grid_model.best_estimator_.epsilon))
print('Best kernel parameter : '+ str(grid_model.best_estimator_.kernel))

In [None]:
# predict on the same X the search was fitted on (in-sample predictions)
y_pred = grid_model.predict(X)

In [None]:
# check in-sample performance
# Neither r2_score nor the np alias is imported elsewhere in the notebook.
import numpy as np
from sklearn.metrics import r2_score

# NOTE(review): np.exp is applied to both vectors, which presumably undoes a log
# transform of the target made upstream - confirm before trusting the score.
print('In-sample R2 Score : '+ str(r2_score(np.exp(y),np.exp(y_pred))))

## Final Submission

In [None]:
# run chosen model on entire train dataframe
# Lasso is not imported anywhere else in the notebook, so bring it in here.
from sklearn.linear_model import Lasso

# NOTE(review): this cell expects df to contain a 'price' target column and
# df_test / df_test_id to be loaded beforehand; none of them is created in this
# notebook, so adapt the names to the dataset in use.
model = Lasso(random_state=42, alpha = 1.5 )
model.fit(df.drop('price', axis=1), df['price'])

# predict test values
test_predictions = model.predict(df_test)

# create csv file with results to submit (one row per test id, no index column)
test_prediction_submit = pd.DataFrame({"id": df_test_id["id"],  "price": test_predictions})
test_prediction_submit.to_csv("test_submit.csv", index = False)