<a href="https://colab.research.google.com/github/nisanuro/CNG562-Assignment-1/blob/master/CNG562_Assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CNG 562 - Assignment #1**

Linear Regression vs Logistic Regression using Iris Dataset\
Comparing:
*   Random 1-Hold Out
*   5-Fold
*   10-Fold
*   Stratified 1-Hold Out

\
Nisa Nur Odabaş\
Kaan Taha Köken

---



In [0]:
import matplotlib.pyplot as plt  
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn import metrics, datasets, preprocessing
from sklearn.metrics import roc_curve, auc
%matplotlib inline

**K-Fold method**

In [0]:
def kFold(foldNumber, X_train, Y_train):
  """Cross-validate logistic and linear regression on unshuffled K-fold splits.

  Prints a header line, then the mean logistic-regression accuracy over the
  folds, then a linear-regression score computed as
  ``1 + mean(neg_mean_squared_error)`` (one minus the fold-averaged MSE; it is
  labelled "Accuracy" in the output but is not a classification accuracy).

  Args:
    foldNumber: Number of folds, forwarded to ``KFold(n_splits=...)``.
    X_train: Feature matrix to cross-validate on.
    Y_train: Targets aligned with ``X_train``.

  Returns:
    None. The results are only printed.
  """
  # One splitter shared by both models so they are scored on the same folds.
  splitter = KFold(n_splits=foldNumber, shuffle=False)

  log_scores = cross_val_score(
      LogisticRegression(solver='liblinear', multi_class='ovr'),
      X_train, Y_train, cv=splitter, scoring='accuracy')
  lin_scores = cross_val_score(
      LinearRegression(),
      X_train, Y_train, cv=splitter, scoring='neg_mean_squared_error')

  print(foldNumber, "Fold", sep="")
  print("Logistic Regression Accuracy: ", log_scores.mean())
  # neg_mean_squared_error is <= 0, so adding it to 1 yields "1 - MSE".
  print("Linear Regression Accuracy: ", 1 + lin_scores.mean())

**Random 1-Hold Out method**

In [0]:
def randomOneHoldout(X_train, Y_train):
  """Evaluate logistic and linear regression on one random 80/20 hold-out split.

  The data is split once with ``test_size=0.2`` and ``random_state=1`` (no
  stratification), both models are fitted on the larger part and scored on the
  held-out part. Results are printed.

  Args:
    X_train: Feature matrix to split.
    Y_train: Targets aligned with ``X_train``.

  Returns:
    None. The results are only printed.
  """
  x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.2, random_state=1)

  logReg = LogisticRegression(solver='liblinear', multi_class='ovr')
  linReg = LinearRegression()

  logReg.fit(x_train, y_train)
  linReg.fit(x_train, y_train)

  y_pred_log = logReg.predict(x_test)
  y_pred_lin = linReg.predict(x_test)

  print("Random One Hold Out")
  # Use real classification accuracy for the classifier: 1 - MSE on class
  # labels only equals accuracy when every mistake is off by exactly one class
  # (a 0-vs-2 confusion would count four times). This also matches kFold,
  # which scores logistic regression with scoring='accuracy'.
  print("Logistic Regression Accuracy: ", metrics.accuracy_score(y_test, y_pred_log))
  # Linear regression outputs continuous values, so it is scored as 1 - MSE
  # (the same quantity kFold prints for it), not as a classification accuracy.
  print("Linear Regression Accuracy: ", 1 - metrics.mean_squared_error(y_test, y_pred_lin))

**Stratified 1-Hold Out method**

In [0]:
def stratifiedOneHoldout(X_train, Y_train):
  """Evaluate logistic and linear regression on one stratified 80/20 hold-out split.

  The data is split once with ``test_size=0.2``, ``random_state=1`` and
  ``stratify=Y_train``, both models are fitted on the larger part and scored
  on the held-out part. Results are printed.

  Args:
    X_train: Feature matrix to split.
    Y_train: Targets aligned with ``X_train``; also used as the stratification key.

  Returns:
    None. The results are only printed.
  """
  x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.2, random_state=1, stratify=Y_train)

  logReg = LogisticRegression(solver='liblinear', multi_class='ovr')
  linReg = LinearRegression()

  logReg.fit(x_train, y_train)
  linReg.fit(x_train, y_train)

  y_pred_log = logReg.predict(x_test)
  y_pred_lin = linReg.predict(x_test)

  print("Stratified")
  # Use real classification accuracy for the classifier: 1 - MSE on class
  # labels only equals accuracy when every mistake is off by exactly one class
  # (a 0-vs-2 confusion would count four times). This also matches kFold,
  # which scores logistic regression with scoring='accuracy'.
  print("Logistic Regression Accuracy: ", metrics.accuracy_score(y_test, y_pred_log))
  # Linear regression outputs continuous values, so it is scored as 1 - MSE
  # (the same quantity kFold prints for it), not as a classification accuracy.
  print("Linear Regression Accuracy: ", 1 - metrics.mean_squared_error(y_test, y_pred_lin))

**Displaying accuracies for all validation methods**

In [0]:
def displayAccuracy(X, Y):
    """Print the scores of every validation scheme for one feature matrix.

    An 80/20 split (``random_state=1``) is taken first and only its training
    part is handed to the validation helpers; the held-out 20% is not used
    inside this function.

    Args:
        X: Feature matrix.
        Y: Targets aligned with ``X``.

    Returns:
        None. Each helper prints its own results.
    """
    X_train, _, Y_train, _ = train_test_split(X, Y, test_size=0.2, random_state=1)

    # K-fold with 5 and then 10 folds, followed by the two hold-out variants.
    for fold_count in (5, 10):
        kFold(fold_count, X_train, Y_train)
    randomOneHoldout(X_train, Y_train)
    stratifiedOneHoldout(X_train, Y_train)

**Round method for linear regression prediction**

In [0]:
def roundPredict(p):
    """Snap continuous predictions onto the class labels 0, 1 and 2.

    Values ``<= 0.5`` map to 0, values ``>= 1.5`` map to 2 and everything in
    between maps to 1. The input itself is not modified; the work is done on
    ``p.copy()``, which is returned.

    Args:
        p: Indexable sequence of numbers that supports ``copy()``, ``len()``
            and item assignment (e.g. a list or a 1-D array).

    Returns:
        The modified copy of ``p``, holding only the values 0, 1 and 2.
    """
    labels = p.copy()
    for idx in range(len(labels)):
        value = labels[idx]
        # Same precedence as the thresholds above: low bucket first, then high.
        labels[idx] = 0 if value <= 0.5 else (2 if value >= 1.5 else 1)
    return labels

**Main**

In [138]:
if __name__ == '__main__':

  # Load the Iris dataset and pull out features and targets.
  iris = datasets.load_iris()
  X, Y = iris.data, iris.target

  # Two preprocessed variants of the features, compared against the raw data.
  l1_norm = preprocessing.normalize(X, norm="l1")  # L1 normalization
  mean_removal = preprocessing.scale(X)            # mean removal

  # Optional sanity check (disabled): mean and standard deviation of the
  # features before and after mean removal.
  #   print(X.mean(axis=0))
  #   print(X.std(axis=0))
  #   print(mean_removal.mean(axis=0))
  #   print(mean_removal.std(axis=0))

  # Report every validation method / regression model for each feature variant.
  print("\nRaw: ")
  displayAccuracy(X, Y)

  print("\nL1 Normalization: ")
  displayAccuracy(l1_norm, Y)

  print("\nMean Removal: ")
  displayAccuracy(mean_removal, Y)


Raw: 
5Fold
Logistic Regression Accuracy:  0.9416666666666668
Linear Regression Accuracy:  0.9505573483173648
10Fold
Logistic Regression Accuracy:  0.9333333333333332
Linear Regression Accuracy:  0.9504757453551504
Random One Hold Out
Logistic Regression Accuracy:  0.9583333333333334
Linear Regression Accuracy:  0.9596940114241354
Stratified
Logistic Regression Accuracy:  0.9583333333333334
Linear Regression Accuracy:  0.9613449341091412

L1 Normalization: 
5Fold
Logistic Regression Accuracy:  0.6916666666666667
Linear Regression Accuracy:  0.9249382972717285
10Fold
Logistic Regression Accuracy:  0.6916666666666667
Linear Regression Accuracy:  0.9228036562601725
Random One Hold Out
Logistic Regression Accuracy:  0.75
Linear Regression Accuracy:  0.9311930338541666
Stratified
Logistic Regression Accuracy:  0.7083333333333333
Linear Regression Accuracy:  0.9382756551106771

Mean Removal: 
5Fold
Logistic Regression Accuracy:  0.8916666666666668
Linear Regression Accuracy:  0.950557348317

# **Final**
**Training and Testing using:**
* **Raw data**
* **Stratified 1-Hold Out**
* **Linear Regression**






**Dividing Train and Test sets**

In [0]:
 # Hold out 30% of the raw data as the final test set, stratified on the labels.
 X_train, X_test, Y_train, Y_test = train_test_split(
     X, Y,
     test_size=0.3,
     random_state=1,
     stratify=Y,
 )

**Training**

In [140]:
  # Carve a stratified 20% validation split out of the training data; the
  # final test set (X_test / Y_test) is not touched in this cell.
  x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.2, random_state=1, stratify=Y_train)

  # This fitted model is reused by the later testing and prediction cells.
  linReg = LinearRegression()
  linReg.fit(x_train, y_train)

  # Continuous regression outputs, then snapped to the class labels 0/1/2.
  y_pred_lin = linReg.predict(x_test)
  rounded_lin = roundPredict(y_pred_lin)

  print("Rounded Stratify One Hold Out - Only train set")
  # The rounded predictions are class labels, so report true classification
  # accuracy. 1 - MSE only matches it when every mistake is off by exactly
  # one class; a 0-vs-2 confusion would be counted four times.
  print("Linear Regression Accuracy: ", metrics.accuracy_score(y_test, rounded_lin))
  print("Linear Regression R^2 score: ", metrics.r2_score(y_test, rounded_lin))

  # Table of raw prediction, true label and rounded label, sorted by raw prediction.
  print("\nY_pred_lin     \t       Y_test\trounded")
  for i, (j, k) in sorted(zip(y_pred_lin, zip(y_test, rounded_lin))):
    print(i , "\t" , j, "\t", k)

Rounded Stratify One Hold Out - Only train set
Linear Regression Accuracy:  0.9047619047619048
Linear Regression R^2 score:  0.8571428571428572

Y_pred_lin     	       Y_test	rounded
-0.13338533425244264 	 0 	 0.0
-0.1083659071882706 	 0 	 0.0
-0.06122542437237788 	 0 	 0.0
-0.05329118074897954 	 0 	 0.0
-0.0429126159997677 	 0 	 0.0
-0.0242791109138423 	 0 	 0.0
0.0009096628014323427 	 0 	 0.0
0.8235121047102323 	 1 	 1.0
0.8780486937147116 	 1 	 1.0
1.161330224826963 	 1 	 1.0
1.1870391393881594 	 1 	 1.0
1.2992859080092798 	 1 	 1.0
1.3299274028519623 	 1 	 1.0
1.3879556985627717 	 1 	 1.0
1.3950051665970737 	 2 	 1.0
1.416967768397909 	 2 	 1.0
1.5535486326267118 	 2 	 2.0
1.6638520161866535 	 2 	 2.0
1.6638520161866535 	 2 	 2.0
1.8198963303604019 	 2 	 2.0
2.0112270682746454 	 2 	 2.0


**Testing**

In [0]:
  # Score the held-out test split with the linReg model fitted in the training cell.
  Y_pred_lin = linReg.predict(X_test)
  # Reuses the name rounded_lin: from here on it holds the rounded TEST predictions,
  # replacing the validation-split values assigned in the training cell.
  rounded_lin = roundPredict(Y_pred_lin)

In [142]:
print("Rounded Stratify One Hold Out - Test set")
# The rounded predictions are class labels, so report true classification
# accuracy. 1 - MSE only matches it when every mistake is off by exactly one
# class; a 0-vs-2 confusion would be counted four times.
print("Linear Regression Accuracy: ", metrics.accuracy_score(Y_test, rounded_lin))
print("Linear Regression R^2 score: ", metrics.r2_score(Y_test, rounded_lin))

# Table of raw prediction, true label and rounded label, sorted by raw prediction.
print("\nY_pred_lin     \t      Y_test\trounded")
for i, (j, k) in sorted(zip(Y_pred_lin, zip(Y_test, rounded_lin))):
  print(i , "\t" , j, "\t", k)

Rounded Stratify One Hold Out - Test set
Linear Regression Accuracy:  0.9777777777777777
Linear Regression R^2 score:  0.9666666666666667

Y_pred_lin     	      Y_test	rounded
-0.1451056726255265 	 0 	 0.0
-0.10091460304061714 	 0 	 0.0
-0.09366984691424335 	 0 	 0.0
-0.06334904878487607 	 0 	 0.0
-0.051639603112009214 	 0 	 0.0
-0.04522063580412056 	 0 	 0.0
-0.037150090859261575 	 0 	 0.0
-0.03300609802608376 	 0 	 0.0
-0.029215847235863235 	 0 	 0.0
-0.001534658324381044 	 0 	 0.0
0.0022514364253043984 	 0 	 0.0
0.03221849251171449 	 0 	 0.0
0.03867050514924514 	 0 	 0.0
0.0894168433635038 	 0 	 0.0
0.23727904451950455 	 0 	 0.0
0.8134542366743358 	 1 	 1.0
0.841651410391205 	 1 	 1.0
0.9431482428602571 	 1 	 1.0
1.107129381276551 	 1 	 1.0
1.1287122995934598 	 1 	 1.0
1.1320305036081524 	 1 	 1.0
1.14436334611876 	 1 	 1.0
1.2178650296226965 	 1 	 1.0
1.2233068580498874 	 1 	 1.0
1.2550989944649051 	 1 	 1.0
1.2823039805603245 	 1 	 1.0
1.2941689323359389 	 1 	 1.0
1.33126243981615

**Predicting [6, 3, 5, 1.5]**

In [0]:
# Predict one hand-written sample (four feature values) with the fitted linReg model.
Y_pred = linReg.predict([[6, 3, 5, 1.5]])
# roundPredict already works on its own copy of its argument, so the extra
# .copy() here is purely defensive; Y_pred keeps the raw continuous prediction.
rounded = roundPredict(Y_pred.copy())

In [144]:
# Show the raw regression output and the class label it rounds to.
print("Prediction: \t\t",Y_pred)
print("Predicted class: \t", rounded)
# NOTE: both "errors" below compare the raw prediction with its own rounded
# value (there is no ground-truth label for this sample), so they measure how
# far the prediction lies from the class label it was rounded to.
print("Mean squared error: \t", metrics.mean_squared_error(rounded, Y_pred))
print("Mean absolute error: \t", metrics.mean_absolute_error(rounded, Y_pred))

Prediction: 		 [1.35496898]
Predicted class: 	 [1.]
Mean squared error: 	 0.1260029785694296
Mean absolute error: 	 0.3549689825455593


**Finding Errors for**

**ROC**\
Using rounded predictions since linear regression has no ROC.

In [145]:
#Prediction of the model
# NOTE: the ROC code below is disabled. It is wrapped in a triple-quoted
# string, so nothing is computed or plotted; because the string is the last
# expression of the cell, the notebook just echoes it as the cell output.
# It cannot simply be un-quoted as written: the body of the first loop is
# entirely commented out (leaving a `for` with no statements), and `n_classes`
# is not defined anywhere in the visible notebook.
# NOTE(review): the commented-out roc_curve call indexes Y_test and Y_pred_lin
# as 2-D arrays ([:, i]); presumably that needs one-vs-rest (binarized) labels
# and per-class scores, which this notebook does not build. Confirm before enabling.
'''fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(3):
    #fpr[i], tpr[i], thresholds = roc_curve(Y_test[:, i], Y_pred_lin[:, i])
    #roc_auc[i] = auc(fpr[i], tpr[i])

for i in range(n_classes):
    plt.figure()
    plt.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f)' % roc_auc[i])
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()'''


'fpr = dict()\ntpr = dict()\nroc_auc = dict()\nfor i in range(3):\n    #fpr[i], tpr[i], thresholds = roc_curve(Y_test[:, i], Y_pred_lin[:, i])\n    #roc_auc[i] = auc(fpr[i], tpr[i])\n\nfor i in range(n_classes):\n    plt.figure()\n    plt.plot(fpr[i], tpr[i], label=\'ROC curve (area = %0.2f)\' % roc_auc[i])\n    plt.plot([0, 1], [0, 1], \'k--\')\n    plt.xlim([0.0, 1.0])\n    plt.ylim([0.0, 1.05])\n    plt.xlabel(\'False Positive Rate\')\n    plt.ylabel(\'True Positive Rate\')\n    plt.title(\'Receiver operating characteristic example\')\n    plt.legend(loc="lower right")\n    plt.show()'