# Lawn Ownership Prediction with Support Vector Machines



## 1. Setup

In [94]:
# Common imports
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Fix the seed of NumPy's global random number generator so that repeated runs
# of the notebook draw the same random numbers.
np.random.seed(seed=1)

## 2. Load the data

We will use the lawn data that we cleaned in the last class (the original, not the one that you altered for last week's exercise).

In [95]:
# Uncomment the following snippet of code to debug problems with finding the .csv file path.
# It prints the current working directory, which is where the relative "./data/..." paths are resolved from.
#import os
#print(os.getcwd())

In [96]:
# Read the pre-split feature (X) and target (y) tables for the train and test
# sets. The files are read in the order listed and bound to the four names on
# the left in that same order.
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/lawn_train_X_ownership.csv",
        "./data/lawn_test_X_ownership.csv",
        "./data/lawn_train_y_ownership.csv",
        "./data/lawn_test_y_ownership.csv",
    )
)

## 3. Model the data

First, we will create a dataframe to hold all the results of our models.

In [97]:
# Empty results table: one row is appended per fitted model further down.
performance = pd.DataFrame(
    {column: [] for column in ["model", "Accuracy", "Precision", "Recall", "F1"]}
)

### 3.1 Fit and test Support Vector Machine (SVM) models

In [98]:
from sklearn.svm import SVC

In [99]:
# Support vector classifier with a linear kernel.
# probability=True additionally enables probability estimates (predict_proba)
# on the fitted model.
svm_lin_model = SVC(
    probability=True,
    kernel="linear",
)
# y_train was read as a one-column table; np.ravel flattens it to the 1-D label
# array that fit() expects. Assigning to "_" keeps the cell from echoing the
# estimator's repr.
_ = svm_lin_model.fit(X_train, np.ravel(y_train))

In [100]:
# Evaluate the linear-kernel SVM on the held-out test set.
model_preds = svm_lin_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)

# For a binary problem scikit-learn lays the matrix out as [[TN, FP], [FN, TP]]
# (rows = true class, columns = predicted class), so ravel() yields the four
# counts in exactly this order. Unpacking also fails loudly if the matrix is
# not 2x2 instead of silently reading the wrong cells.
TN, FP, FN, TP = c_matrix.ravel()

# Remove any row this cell added on an earlier run, so re-running the cell
# replaces the "linear svm" entry instead of appending a duplicate. The mask is
# converted to a plain array because the table's index labels are not unique.
performance = performance.loc[(performance["model"] != "linear svm").to_numpy()]

performance = pd.concat([performance, pd.DataFrame({'model': "linear svm",
                                                    'Accuracy': [(TP + TN) / (TP + TN + FP + FN)],
                                                    'Precision': [TP / (TP + FP)],
                                                    'Recall': [TP / (TP + FN)],
                                                    'F1': [2 * TP / (2 * TP + FP + FN)]
                                                    }, index=[0])])

In [101]:
# Support vector classifier with an RBF (Gaussian) kernel.
# C=10 is the regularisation parameter, gamma='scale' lets scikit-learn derive
# the kernel coefficient from the training data, and probability=True
# additionally enables probability estimates (predict_proba).
svm_rbf_model = SVC(
    C=10,
    kernel="rbf",
    gamma='scale',
    probability=True,
)
# np.ravel flattens the one-column label table to the 1-D array fit() expects;
# assigning to "_" keeps the cell from echoing the estimator's repr.
_ = svm_rbf_model.fit(X_train, np.ravel(y_train))

In [102]:
# Evaluate the RBF-kernel SVM on the held-out test set.
model_preds = svm_rbf_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)

# For a binary problem scikit-learn lays the matrix out as [[TN, FP], [FN, TP]]
# (rows = true class, columns = predicted class), so ravel() yields the four
# counts in exactly this order. Unpacking also fails loudly if the matrix is
# not 2x2 instead of silently reading the wrong cells.
TN, FP, FN, TP = c_matrix.ravel()

# Remove any row this cell added on an earlier run, so re-running the cell
# replaces the "rbf svm" entry instead of appending a duplicate. The mask is
# converted to a plain array because the table's index labels are not unique.
performance = performance.loc[(performance["model"] != "rbf svm").to_numpy()]

performance = pd.concat([performance, pd.DataFrame({'model': "rbf svm",
                                                    'Accuracy': [(TP + TN) / (TP + TN + FP + FN)],
                                                    'Precision': [TP / (TP + FP)],
                                                    'Recall': [TP / (TP + FN)],
                                                    'F1': [2 * TP / (2 * TP + FP + FN)]
                                                    }, index=[0])])

In [103]:
# Support vector classifier with a polynomial kernel.
# degree=3 gives a cubic kernel, coef0=1 is the kernel's independent term,
# C=10 is the regularisation parameter, and probability=True additionally
# enables probability estimates (predict_proba).
svm_poly_model = SVC(
    C=10,
    kernel="poly",
    degree=3,
    coef0=1,
    probability=True,
)
# np.ravel flattens the one-column label table to the 1-D array fit() expects;
# assigning to "_" keeps the cell from echoing the estimator's repr.
_ = svm_poly_model.fit(X_train, np.ravel(y_train))

In [104]:
# Evaluate the polynomial-kernel SVM on the held-out test set.
model_preds = svm_poly_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)

# For a binary problem scikit-learn lays the matrix out as [[TN, FP], [FN, TP]]
# (rows = true class, columns = predicted class), so ravel() yields the four
# counts in exactly this order. Unpacking also fails loudly if the matrix is
# not 2x2 instead of silently reading the wrong cells.
TN, FP, FN, TP = c_matrix.ravel()

# Remove any row this cell added on an earlier run, so re-running the cell
# replaces the "poly svm" entry instead of appending a duplicate. The mask is
# converted to a plain array because the table's index labels are not unique.
performance = performance.loc[(performance["model"] != "poly svm").to_numpy()]

performance = pd.concat([performance, pd.DataFrame({'model': "poly svm",
                                                    'Accuracy': [(TP + TN) / (TP + TN + FP + FN)],
                                                    'Precision': [TP / (TP + FP)],
                                                    'Recall': [TP / (TP + FN)],
                                                    'F1': [2 * TP / (2 * TP + FP + FN)]
                                                    }, index=[0])])

## 4. Summary

Sorted by accuracy (ascending, so the best models are listed last):

In [105]:
# Rank the models by accuracy; sort_values uses ascending order by default.
performance.sort_values(by="Accuracy")

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,rbf svm,0.666667,0.75,0.6,0.666667
0,linear svm,0.777778,0.8,0.8,0.8
0,poly svm,0.777778,0.8,0.8,0.8


Sorted by Precision (ascending, so the best models are listed last):

In [106]:
# Rank the models by precision; sort_values uses ascending order by default.
performance.sort_values(by="Precision")

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,rbf svm,0.666667,0.75,0.6,0.666667
0,linear svm,0.777778,0.8,0.8,0.8
0,poly svm,0.777778,0.8,0.8,0.8


Sorted by Recall (ascending, so the best models are listed last):

In [107]:
# Rank the models by recall; sort_values uses ascending order by default.
performance.sort_values(by="Recall")

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,rbf svm,0.666667,0.75,0.6,0.666667
0,linear svm,0.777778,0.8,0.8,0.8
0,poly svm,0.777778,0.8,0.8,0.8


Sorted by F1 (ascending, so the best models are listed last):

In [108]:
# Rank the models by F1 score; sort_values uses ascending order by default.
performance.sort_values(by="F1")

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,rbf svm,0.666667,0.75,0.6,0.666667
0,linear svm,0.777778,0.8,0.8,0.8
0,poly svm,0.777778,0.8,0.8,0.8


### So which model is the 'best' and the one you wish to choose?
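Comparing accuracy, precision, recall, and F1 across all the models, the poly SVM is among the best performers: it ties with the linear SVM on every metric (accuracy 0.78; precision, recall, and F1 0.80), and both outperform the RBF SVM. The poly SVM is the model chosen here.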


In [109]:
import pickle

# Serialise the fitted polynomial-kernel SVM to disk.
# The "with" block guarantees the file handle is flushed and closed even if
# pickling raises; the bare open(...) call used previously never closed it.
# NOTE(review): the target is an absolute, machine-specific path and has a
# .csv extension although the content is a binary pickle. It is left unchanged
# so anything that already reads this file keeps working; consider switching
# to a relative path with a .pkl extension.
with open('C:/Users/mukes/OneDrive/Desktop/classes/DSP/lawn_mover.csv', 'wb') as model_file:
    pickle.dump(svm_poly_model, model_file)