In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
%matplotlib inline

In [2]:
df=pd.read_csv("Boston_catg.csv" )

In [3]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,Low
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,Low
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,High
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,High
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,,High


In [4]:
df.shape

(506, 14)

In [5]:
df.isnull().sum()

CRIM       20
ZN         20
INDUS      20
CHAS       20
NOX         0
RM          0
AGE        20
DIS         0
RAD         0
TAX         0
PTRATIO     0
B           0
LSTAT      20
MEDV        0
dtype: int64

In [6]:
df.dropna(inplace=True)

In [7]:
df.shape

(394, 14)

In [8]:
dataSeries=df.dtypes   
dataSeries

CRIM       float64
ZN         float64
INDUS      float64
CHAS       float64
NOX        float64
RM         float64
AGE        float64
DIS        float64
RAD          int64
TAX          int64
PTRATIO    float64
B          float64
LSTAT      float64
MEDV        object
dtype: object

In [9]:
# Encoding the values of medv column 

from sklearn.preprocessing import LabelEncoder
encode = LabelEncoder()

df.MEDV = encode.fit_transform(df.MEDV)

# displaying the data set to check the encoded values 
df.head()

[0.02731,0.0,0.00,0.0,0.46,3.26,00.0,1.28,0,0,00.0,0,0.00]

[0.02731, 0.0, 0.0, 0.0, 0.46, 3.26, 0.0, 1.28, 0, 0, 0.0, 0, 0.0]

In [10]:
#checking class imbalance
df['MEDV'].value_counts()

1    275
0    119
Name: MEDV, dtype: int64

In [11]:
class_imbalance=275/(275+119)
class_imbalance

0.6979695431472082

In [12]:
# seperate the target and independent variable

X= df.drop(['MEDV'], axis = 1)
y= df['MEDV']

In [13]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 25)

In [14]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [15]:
sc

StandardScaler(copy=True, with_mean=True, with_std=True)

In [16]:
print("Shape of X_train: ",X_train.shape)
print("Shape of X_test: ", X_test.shape)
print("Shape of y_train: ",y_train.shape)
print("Shape of y_test",y_test.shape)

Shape of X_train:  (275, 13)
Shape of X_test:  (119, 13)
Shape of y_train:  (275,)
Shape of y_test (119,)


In [17]:
y_train

224    0
420    1
101    0
86     1
468    1
      ..
484    1
408    1
178    0
410    1
164    1
Name: MEDV, Length: 275, dtype: int32

## Logistic Regression 

In [18]:
# create the object of the model
from sklearn.linear_model import LogisticRegression
model1 = LogisticRegression()

model1.fit(X_train_std,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
    pred1 = model1.predict(X_test_std)

In [20]:
from sklearn.metrics import accuracy_score

print('Predicted Values on Test Data',(pred1))

print('Logistic Regression Train Score: ',model1.score(X_train_std, y_train))
print('Logistic Regression Test Score: ',model1.score(X_test_std, y_test))

print('Accuracy Score on test data :',accuracy_score(y_test,pred1))

Predicted Values on Test Data [1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 0 1 1 1 0 0 1 1 0 1 1 0 1 0 1 1 1 1 1 0 0 1
 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 0 1 0 0 1 1 1 1 1 0 1 1 0 1 0 1 1 0 1 1 1
 0 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 0 0 1 1 1 1 1 0 1 1 1 0 0 1 1 0 1 1 0 0 1
 1 1 1 0 1 0 0 1]
Logistic Regression Train Score:  0.9018181818181819
Logistic Regression Test Score:  0.9243697478991597
Accuracy Score on test data : 0.9243697478991597


Confusion Matrix 
A confusion matrix is an N X N matrix, where N is the number of classes being predicted. 
Confusion Matrix gives us a matrix as output and describes the complete performance of the model.

The correct predictions falls on the diagonal line of the matrix.

4 important terms in Confusion Matrix:

True Positives : The cases in which we predicted YES and the actual output was also YES.

False Negatives(Type 2 error) : The cases in which we predicted NO and the actual output was YES.

False Positives(Type 1 error) : The cases in which we predicted YES and the actual output was NO.

True Negatives : The cases in which we predicted NO and the actual output was NO.

The Confusion matrix in itself is not a performance measure as such,
but almost all of the performance metrics are based on Confusion Matrix and the numbers inside it.

In [21]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test,pred1)
print(confusion_matrix)

[[32  6]
 [ 3 78]]


In [22]:
from sklearn.metrics import confusion_matrix
import numpy as np 
cm=np.array(confusion_matrix(y_test,pred1))
df_confusion=pd.DataFrame(cm,index=["Positive","Negative"], columns=["Positive","Negative"])
print(df_confusion)
print('Accuracy through Logistic regression :',accuracy_score(y_test,pred1))

          Positive  Negative
Positive        32         6
Negative         3        78
Accuracy through Logistic regression : 0.9243697478991597


The result is telling us that we have 32+78 correct predictions and 3+6 incorrect predictions

## Support vector Machine  when kernel is random = rbf

In [23]:
from sklearn import svm

#create a classifier
Model2 = svm.SVC()

#train the model
Model2.fit(X_train_std,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [24]:
#predict the response
pred2 = Model2.predict(X_test_std)

In [25]:
from sklearn import metrics
print('Predicted Values on Test Data',(pred2))

print('SVM Train Score with RBF kernel: ',Model2.score(X_train_std, y_train))
print('SVM Test Score with RBF kernel: ',Model2.score(X_test_std, y_test))

from sklearn.metrics import accuracy_score
#accuracy
print("Model Accuracy through SVM:", metrics.accuracy_score(y_test,y_pred=pred2))

Predicted Values on Test Data [1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 0 1 1 1 0 0 1 1 0 1 1 0 1 0 1 1 1 1 1 0 0 1
 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 0 1 0 0 1 1 1 1 1 0 1 1 0 1 0 1 1 0 1 1 1
 0 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1
 1 1 1 0 1 0 1 1]
SVM Train Score with RBF kernel:  0.92
SVM Test Score with RBF kernel:  0.8991596638655462
Model Accuracy through SVM: 0.8991596638655462


In [26]:
from sklearn.metrics import confusion_matrix

confusion_matrix = confusion_matrix(y_test,pred2)
print(confusion_matrix)

[[29  9]
 [ 3 78]]


In [27]:
from sklearn.metrics import confusion_matrix
import numpy as np 
cm=np.array(confusion_matrix(y_test,pred2))
df_confusion=pd.DataFrame(cm,index=["Positive","Negative"], columns=["Positive","Negative"])
print(df_confusion)
print('Accuracy through SVM RBF :',accuracy_score(y_test,pred2))

          Positive  Negative
Positive        29         9
Negative         3        78
Accuracy through SVM RBF : 0.8991596638655462


The result is telling us that we have 29+78 correct predictions and 3+9 incorrect predictions

### Support vector machine with kernel linear

In [28]:
from sklearn import svm

#create a classifier
Model3 = svm.SVC(kernel="linear")

#train the model
Model3.fit(X_train_std,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [29]:
#predict the response
pred3 = Model3.predict(X_test_std)

In [30]:
from sklearn import metrics
print('Predicted Values on Test Data',(pred3))

print('SVM Train Score with linear kernel : ',Model3.score(X_train_std, y_train))
print('SVM Regression Test Score with linear kernel: ',Model3.score(X_test_std, y_test))

from sklearn.metrics import accuracy_score
#accuracy
print("Model Accuracy through SVM with linear kernel:", metrics.accuracy_score(y_test,y_pred=pred3))

Predicted Values on Test Data [1 1 1 1 1 1 0 1 1 1 1 1 1 0 0 0 1 1 1 0 0 1 1 0 1 1 0 1 0 1 1 1 1 1 0 0 1
 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 0 1 0 0 1 1 1 1 1 0 1 1 0 1 0 1 1 0 1 1 1
 0 1 1 0 1 1 0 1 1 0 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1
 1 1 1 0 1 0 0 1]
SVM Train Score with linear kernel :  0.9054545454545454
SVM Regression Test Score with linear kernel:  0.8991596638655462
Model Accuracy through SVM with linear kernel: 0.8991596638655462


In [31]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test,pred3)
print(confusion_matrix)

[[31  7]
 [ 5 76]]


In [32]:
from sklearn.metrics import confusion_matrix
import numpy as np 
cm=np.array(confusion_matrix(y_test,pred3))
df_confusion=pd.DataFrame(cm,index=["Positive","Negative"], columns=["Positive","Negative"])
print(df_confusion)
print('Accuracy through SVM Linear :',accuracy_score(y_test,pred3))

          Positive  Negative
Positive        31         7
Negative         5        76
Accuracy through SVM Linear : 0.8991596638655462


The result is telling us that we have 31+76  correct predictions and 7+5 incorrect predictions

### Candidate model : Logistic regression

In [33]:
# Crime rate, Pollution, DIS,Rooms, Budget = 20k $
Working_Professional = [0.02731,0.0,0.00,0.0,0.46,0.00,00.0,0.00,0,0,00.0,0,0.00], 

value = model1.predict(Working_Professional)
value

array([1])

In [34]:
# Crime rate, PTRATIO, number of room, budget = 30k $
Rich_couple = [4,0.0,0.00,0.0,0.0,6,0.00,0.00,0,0,25,0,0.00], 

value = model1.predict(Rich_couple)
value

array([1])