# Skeleton Code for Machine Learning Models

Includes:
- Logistic Regression
- K-Nearest Neighbors
- Perceptron 
- Random Forest
- Support Vector Machine
- Ridge Regression
- Elastic Net
- K-Fold Cross Validation

In [None]:
#import numpy and pandas

import numpy as np
import pandas as pd

In [None]:
# import machine learning libraries

import sklearn.linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet

import warnings
# Suppress every warning for the rest of the session to keep cell output tidy.
# NOTE: this also hides scikit-learn convergence and deprecation warnings, so
# comment the next line out when debugging a model that behaves unexpectedly.
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset into a DataFrame; put the path to your CSV file between the quotes.
df = pd.read_csv(filepath_or_buffer='')
df.head()  # last expression in the cell, so the notebook renders a preview of the data

In [None]:
# Separate features from the target, then split into training and test sets.
from sklearn.model_selection import train_test_split

# Name of the label column. The target is selected AND removed from the
# features by this same name, so x and y stay consistent wherever the column
# sits in the file. (A positional slice such as iloc[:, :-1] would leak the
# label into x, and drop a real feature, if the target were not the last column.)
target_column = 'Response'

x = df.drop(columns=target_column)   # every column except the target
y = df[target_column].to_numpy()     # 1-D array of labels

# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Make sure to remove or replace all NaN values before fitting any model!

# Logistic Regression

Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [None]:
# Fit a logistic-regression classifier and report its accuracy on both splits.
logreg = LogisticRegression(solver='liblinear', multi_class='ovr', max_iter=1000)  # instantiate
logreg.fit(X_train, y_train)                                                       # fit

acc_logreg_train = logreg.score(X_train, y_train)  # predict + evaluate (training)
acc_logreg_test = logreg.score(X_test, y_test)     # predict + evaluate (test)

# Convert to percent BEFORE rounding. Rounding first and multiplying afterwards
# re-introduces floating-point noise (e.g. 0.57 * 100 evaluates to
# 56.99999999999999) and would not match the formatting of the other models.
print('Logistic Regression Training Labeling Accuracy:', str(round(acc_logreg_train*100,3)),'%')
print('Logistic Regression Test Labeling Accuracy:', str(round(acc_logreg_test*100,3)),'%')

# K-Nearest Neighbors

Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [None]:
# k-nearest-neighbours classifier with k = 3: fit on the training split,
# then measure labeling accuracy on both splits.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)  # instantiate + fit (fit returns the estimator)
acc_knn_train = knn.score(X_train, y_train)                      # accuracy on the training split
acc_knn_test = knn.score(X_test, y_test)                         # accuracy on the held-out split

print(f'KNN Training Labeling Accuracy: {round(acc_knn_train * 100, 3)} %')
print(f'KNN Test Labeling Accuracy: {round(acc_knn_test * 100, 3)} %')

# Perceptron

Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Perceptron.html

In [None]:
# Perceptron: tol=None disables the early-stopping tolerance, so training
# runs for up to max_iter passes over the data.
perceptron = Perceptron(max_iter=10000, tol=None).fit(X_train, y_train)  # instantiate + fit
acc_perceptron_train = perceptron.score(X_train, y_train)                # accuracy on the training split
acc_perceptron_test = perceptron.score(X_test, y_test)                   # accuracy on the held-out split

print(f'Perceptron Training Labeling Accuracy: {round(acc_perceptron_train * 100, 3)} %')
print(f'Perceptron Test Labeling Accuracy: {round(acc_perceptron_test * 100, 3)} %')

# Random Forest

Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
# Random forest of 10,000 trees; class_weight='balanced' re-weights classes
# inversely to their frequency in the training labels.
random_forest = RandomForestClassifier(
    n_estimators=10000,
    class_weight='balanced',
).fit(X_train, y_train)                                # instantiate + fit
acc_rf_train = random_forest.score(X_train, y_train)   # accuracy on the training split
acc_rf_test = random_forest.score(X_test, y_test)      # accuracy on the held-out split

print(f'Random Forest Training Labeling Accuracy: {round(acc_rf_train * 100, 3)} %')
print(f'Random Forest Test Labeling Accuracy: {round(acc_rf_test * 100, 3)} %')

# Support Vector Machine

Documentation: https://scikit-learn.org/stable/modules/svm.html

In [None]:
# Support-vector classifier (SVC defaults apart from an explicit gamma='scale').
clf = svm.SVC(gamma='scale').fit(X_train, y_train)  # instantiate + fit
acc_clf_train = clf.score(X_train, y_train)         # accuracy on the training split
acc_clf_test = clf.score(X_test, y_test)            # accuracy on the held-out split

print(f'SVM Training Labeling Accuracy: {round(acc_clf_train * 100, 3)} %')
print(f'SVM Test Labeling Accuracy: {round(acc_clf_test * 100, 3)} %')

# Ridge Regression

Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html

In [None]:
# Ridge (L2-penalised least-squares) model used as a classifier.
# RidgeClassifier is used instead of Ridge so that .score() returns
# classification accuracy, the same metric reported for the other models.
# (Instantiating LogisticRegression here would only duplicate the
# logistic-regression model under a different name.)
from sklearn.linear_model import RidgeClassifier

ridgereg = RidgeClassifier()                            # instantiate
ridgereg.fit(X_train, y_train)                          # fit

acc_ridgereg_train = ridgereg.score(X_train, y_train)   # predict + evaluate (training)
acc_ridgereg_test = ridgereg.score(X_test, y_test)      # predict + evaluate (test)

# Convert to percent before rounding so no floating-point noise is printed.
print('Ridge Regression Training Labeling Accuracy:', str(round(acc_ridgereg_train*100,3)),'%')
print('Ridge Regression Test Labeling Accuracy:', str(round(acc_ridgereg_test*100,3)),'%')

# Elastic Net

Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html

In [None]:
# Elastic-net-regularised classifier.
# sklearn.linear_model.ElasticNet is a *regressor*: its .score() returns R^2,
# not labeling accuracy, so it cannot be compared with the classifiers in this
# notebook. Logistic regression with an elastic-net penalty keeps the mixed
# L1/L2 regularisation while reporting accuracy.
#   - solver='saga' is the LogisticRegression solver that supports 'elasticnet'
#   - l1_ratio=0.5 is an even L1/L2 mix (the same default mix ElasticNet uses)
# NOTE: saga converges fastest when features are on similar scales; consider
# standardising the features if the fit is slow.
elastic_net = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5,
                                 max_iter=1000, random_state=0)   # instantiate
elastic_net.fit(X_train, y_train)                                 # fit

acc_elastic_net_train = elastic_net.score(X_train, y_train)  # predict + evaluate (training)
acc_elastic_net_test = elastic_net.score(X_test, y_test)     # predict + evaluate (test)

# Convert to percent before rounding so no floating-point noise is printed.
print('Elastic Net Training Labeling Accuracy:', str(round(acc_elastic_net_train*100,3)),'%')
print('Elastic Net Test Labeling Accuracy:', str(round(acc_elastic_net_test*100,3)),'%')

# Comparing Results

In [None]:
# Build and display a table with the test accuracy (in percent) of every model
# fitted above. The two lists are positional: entry i of the accuracy column
# belongs to entry i of the model-name column.

d = {
    'Machine Learning Model': [
        'Logistic Regression',
        'KNN',
        'Perceptron',
        'Random Forest',
        'Support Vector',
        'Ridge Regression',
        'Elastic Net',
    ],
    'Test Accuracy (Percent)': [
        round(test_accuracy * 100, 3)
        for test_accuracy in (
            acc_logreg_test,
            acc_knn_test,
            acc_perceptron_test,
            acc_rf_test,
            acc_clf_test,
            acc_ridgereg_test,
            acc_elastic_net_test,
        )
    ],
}
display(pd.DataFrame(data=d))

# K-Fold Cross Validation for Further Evaluation

Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html

In [None]:
# Cross-validate one model on the full dataset (x, y) rather than on a single
# train/test split. Assign M to any machine learning model used above
# (logreg, knn, perceptron, random_forest, clf, ridgereg, elastic_net).

kf = KFold(n_splits=3, shuffle=True, random_state=5)           # 3 shuffled folds, reproducible
M = LogisticRegression(solver='liblinear', multi_class='ovr')  # replace with the model to evaluate
scores = cross_val_score(estimator=M, X=x, y=y, cv=kf)         # fit + predict + evaluate once per fold
print(f'Average Accuracy: {round(scores.mean() * 100, 3)} %')