# Binary Classifier using Logistic Regression

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
from sklearn import linear_model

In [None]:
# Read the comma-separated data file into a 2-D array (one example per row).
data = np.loadtxt(fname='rawData1.txt', delimiter=',')

In [None]:
#np.random.shuffle(data) # randomize the examples of data

In [None]:
# k = number of rows (examples), l = number of columns of the data
k, l = data.shape

In [None]:
# Report the size of the data set.
# print(...) with a single argument runs unchanged on Python 2 and Python 3.
print("The data has %i examples and %i features." % (k, l))

In [None]:
# Name the columns of the data set.
# print(...) with a single argument runs unchanged on Python 2 and Python 3.
print("The features are 'Evaluation 1', 'Evaluation 2', 'Disease State'.")

In [None]:
# display the first 10 examples of the data
data[:10]

In [None]:
# m = number of training examples: 60% of the k examples, rounded up
m = int(np.ceil(k * 0.60))

In [None]:
# Report how many examples go to each part of the split.
# print(...) with a single argument runs unchanged on Python 2 and Python 3.
print("Training set: %i examples (60%% of the data)." % m)
print("Cross Validation set: %i examples (the other 40%%)." % (k - m))

In [None]:
# training set: the first m rows of data (60% of the examples)
dataTrain = data[:m]

In [None]:
# cross validation set: every row after the first m (the other 40%)
dataCV = data[m:]

In [None]:
# X: training predictors (the first two columns)
X = dataTrain[:, 0:2]

In [None]:
# y: training responses (the third column)
y = dataTrain[:, 2]

In [None]:
# Xcv: cross validation predictors (the first two columns)
Xcv = dataCV[:, 0:2]

In [None]:
# ycv: cross validation responses (the third column)
ycv = dataCV[:, 2]

In [None]:
# Split the training rows by response: 1 = positive, 0 = negative.
pos = dataTrain[y == 1]
neg = dataTrain[y == 0]

In [None]:
# Split the cross validation rows by response: 1 = positive, 0 = negative.
poscv = dataCV[ycv == 1]
negcv = dataCV[ycv == 0]

In [None]:
# Count the positive and negative training examples.
P, N = pos.shape[0], neg.shape[0]
# print(...) with a single argument runs unchanged on Python 2 and Python 3.
print("There are %i positive training examples and %i negative training examples." % (P, N))

# The classes are called skewed when one class makes up at least 80% of the
# m training examples.  float(...) forces true division on Python 2.
if max(float(P)/m, float(N)/m) >= 0.80:
    print("We have skewed classes.")
else:
    print("We don't have skewed classes.")

In [None]:
# Scatter plot of the training examples, one colour per class.

# axis limits: the observed range of each predictor plus some padding
col_min, col_max = X.min(axis=0), X.max(axis=0)
x_min, x_max = col_min[0] - 10, col_max[0] + 10
y_min, y_max = col_min[1] - 25, col_max[1] + 20

plt.figure()
pscatt = plt.scatter(pos[:, 0], pos[:, 1], color='c', marker='o', edgecolors='k')
nscatt = plt.scatter(neg[:, 0], neg[:, 1], color='m', marker='o', edgecolors='k')

plt.title('Scatter Plot of Training Examples')
plt.xlabel('Evaluation 1')
plt.ylabel('Evaluation 2')
plt.legend([pscatt, nscatt], ['Diseased', 'Not Diseased'], loc="lower center", ncol=2)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)

plt.show()
plt.close()

In [None]:
# Summary statistics of the first training predictor, 'Evaluation 1'.
# The values are formatted with %.0f so that they are rounded to the nearest
# integer, as the heading states; %i would truncate toward zero instead.
# print(...) with a single argument runs unchanged on Python 2 and Python 3.
print("Statistics of 'Evaluation 1': (Numbers are rounded for easy reading) \n")
mu1 = np.mean(X[:, 0])
print("The mean is %.0f." % mu1)
s1 = np.std(X[:, 0])  # np.std defaults to ddof=0 (population standard deviation)
print("The standard deviation is %.0f." % s1)
min1, max1 = np.min(X[:, 0]), np.max(X[:, 0])
print("The range is %.0f to %.0f." % (min1, max1))

In [None]:
# Summary statistics of the second training predictor, 'Evaluation 2'.
# The values are formatted with %.0f so that they are rounded to the nearest
# integer, as the heading states; %i would truncate toward zero instead.
# print(...) with a single argument runs unchanged on Python 2 and Python 3.
print("Statistics of 'Evaluation 2': (Numbers are rounded for easy reading) \n")
mu2 = np.mean(X[:, 1])
print("The mean is %.0f." % mu2)
s2 = np.std(X[:, 1])  # np.std defaults to ddof=0 (population standard deviation)
print("The standard deviation is %.0f." % s2)
min2, max2 = np.min(X[:, 1]), np.max(X[:, 1])
print("The range is %.0f to %.0f." % (min2, max2))

In [None]:
# Histograms of the two training predictors (figure 1: 'Evaluation 1',
# figure 2: 'Evaluation 2'), each overlaid with the normal density that has
# the predictor's mean and standard deviation.
#
# `density=True` replaces the `normed=1` keyword, and the normal curve is
# computed with numpy instead of `mlab.normpdf`: both of the old names were
# removed in matplotlib 3.1.
for fig_num, col, label, face, mu, sigma in [
        (1, 0, 'Evaluation 1', 'g', mu1, s1),
        (2, 1, 'Evaluation 2', 'b', mu2, s2)]:
    plt.figure(fig_num)

    # create parameters for histogram (20 bins, normalised to a density)
    n, bins, patches = plt.hist(X[:, col], 20, density=True, facecolor=face, cumulative=False)

    # plot options
    plt.xlabel(label)
    plt.ylabel('Relative Frequency')
    plt.title("Histogram of '%s'" % label)
    plt.axis([30, 100, 0, 0.05])
    plt.grid(True)

    # add a normal curve: the N(mu, sigma^2) density evaluated at the values in bins
    norms = np.exp(-0.5 * ((bins - mu) / sigma) ** 2) / (sigma * np.sqrt(2 * np.pi))
    plt.plot(bins, norms, 'k--', linewidth=1)


plt.show()
plt.close()

In [None]:
# Horizontal, notched boxplots of the two training predictors with the means
# marked; the boxes are filled green and blue respectively.
colors = ['green', 'blue']

plt.figure()
bp = plt.boxplot(
    X,
    notch=True,
    showmeans=True,
    vert=False,
    labels=['Evaluation 1', 'Evaluation 2'],
    patch_artist=True,
)

# give each box its own fill colour
for box, fill in zip(bp['boxes'], colors):
    box.set_facecolor(fill)

plt.title("Boxplots of 'Evaluation 1' and 'Evaluation 2'")
plt.show()
plt.close()

In [None]:
# Logistic regression model; C is the inverse of the regularization strength.
logreg = linear_model.LogisticRegression(C=100.0)

In [None]:
# fit the logistic regression model to the training set
logreg.fit(X, y)

In [None]:
# Training accuracy as a percentage.
# The product is rounded to 6 decimals to strip floating-point noise
# (0.57 * 100 evaluates to 56.99999999999999); without this, %i could print
# one percent too few and a value could land just below a threshold.
ta = round(logreg.score(X, y) * 100, 6)

# print(...) with a single argument runs unchanged on Python 2 and Python 3.
print("The training accuracy is %i%%." % ta)

# verbal rating of the accuracy
if ta >= 90:
    comment1 = "Good."
elif ta >= 85:
    comment1 = "Not bad."
elif ta >= 75:
    comment1 = "A bit low."
else:
    comment1 = "Very low."

print("%s" % comment1)

In [None]:
# Cross validation accuracy as a percentage.
# The product is rounded to 6 decimals to strip floating-point noise
# (0.57 * 100 evaluates to 56.99999999999999); without this, %i could print
# one percent too few and a value could land just below a threshold.
cva = round(logreg.score(Xcv, ycv) * 100, 6)

# print(...) with a single argument runs unchanged on Python 2 and Python 3.
print("The cross validation accuracy is %i%%." % cva)

# verbal rating of the accuracy
if cva >= 90:
    comment2 = "Good."
elif cva >= 85:
    comment2 = "Not bad."
elif cva >= 75:
    comment2 = "A bit low."
else:
    comment2 = "Very low."

print("%s" % comment2)

In [None]:
# Decision-boundary plots: figure 1 shows the training examples, figure 2 the
# cross validation examples, both on top of the model's predicted classes.

# step size in the mesh (make sure that h > 0.1)
h = .3

# Build a grid over the padded range of the training predictors and predict
# the class of every grid point; the predictions colour the background.
x_min, x_max = X[:, 0].min() - 10, X[:, 0].max() + 10
y_min, y_max = X[:, 1].min() - 25, X[:, 1].max() + 20
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = logreg.predict(grid_points).reshape(xx.shape)

# (figure number, positive rows, negative rows, marker, title) for each plot
panels = [
    (1, pos, neg, 'o',
     'Training Examples with Decision Boundary \n Prediction Accuracy: %i%% (%s)' % (ta, comment1)),
    (2, poscv, negcv, 's',
     'Cross Validation Examples with Decision Boundary \n Prediction Accuracy: %i%% (%s)' % (cva, comment2)),
]

for fig_num, positives, negatives, shape, heading in panels:
    plt.figure(fig_num)
    plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)
    pscatt = plt.scatter(positives[:, 0], positives[:, 1], color='c', marker=shape, edgecolors='k')
    nscatt = plt.scatter(negatives[:, 0], negatives[:, 1], color='m', marker=shape, edgecolors='k')
    plt.xlabel('Evaluation 1')
    plt.ylabel('Evaluation 2')
    plt.legend([pscatt, nscatt], ['Diseased', 'Not Diseased'], loc="lower left", ncol=2)
    plt.title(heading)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())

plt.show()
plt.close()