<a href="https://colab.research.google.com/github/nisanuro/CNG562-Assignment-1/blob/master/CNG562_Assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CNG 562 - Assignment #1**

Linear Regression vs Logistic Regression using Iris Dataset\
Comparing:
*   Random 1-Hold Out
*   5-Fold
*   10-Fold
*   Stratified 1-Hold Out

\
Nisa Nur Odabaş\
Kaan Taha Köken

---



In [0]:
import matplotlib.pyplot as plt  
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn import metrics, datasets, preprocessing
from sklearn.metrics import roc_curve, auc
%matplotlib inline

**K-Fold method**

In [0]:
def kFold(foldNumber, X_train, Y_train):
  """Cross-validate both models with k consecutive folds and print the mean scores.

  Args:
    foldNumber: number of folds passed to KFold as n_splits.
    X_train: feature matrix to cross-validate on.
    Y_train: class labels aligned with X_train.

  Prints a "<k>Fold" heading followed by one line per model. Returns None.
  """
  # shuffle=False: folds are taken in the order the samples arrive.
  splitter = KFold(n_splits=foldNumber, shuffle=False)

  classifier = LogisticRegression(solver='liblinear', multi_class='ovr')
  regressor = LinearRegression()

  accuracy_per_fold = cross_val_score(
      classifier, X_train, Y_train, cv=splitter, scoring='accuracy')
  neg_mse_per_fold = cross_val_score(
      regressor, X_train, Y_train, cv=splitter, scoring='neg_mean_squared_error')

  mean_accuracy = accuracy_per_fold.mean()
  # The scorer returns negated MSE, so adding it to 1 yields 1 - MSE.
  # This is a pseudo-accuracy on the raw regression output, not a hit rate.
  one_minus_mse = 1 + neg_mse_per_fold.mean()

  print(str(foldNumber) + "Fold")
  print("Logistic Regression Accuracy: ", mean_accuracy)
  print("Linear Regression Accuracy: ", one_minus_mse)

**Random 1-Hold Out method**

In [0]:
def randomOneHoldout(X_train, Y_train):
  """Score both models on one random 80/20 hold-out split and print the results.

  Args:
    X_train: feature matrix to split into fit / validation parts.
    Y_train: class labels aligned with X_train.

  Prints a heading followed by one line per model. Returns None.
  """
  # Fixed random_state keeps the split reproducible between runs.
  x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.2, random_state=1)

  logReg = LogisticRegression(solver='liblinear', multi_class='ovr')
  linReg = LinearRegression()

  logReg.fit(x_train, y_train)
  linReg.fit(x_train, y_train)

  y_pred_log = logReg.predict(x_test)
  y_pred_lin = linReg.predict(x_test)

  print("Random One Hold Out")
  # The classifier predicts class labels, so accuracy is the fraction of exact
  # matches. 1 - MSE would penalise a 0-vs-2 confusion four times as much as a
  # neighbouring-class confusion and would disagree with kFold's
  # scoring='accuracy'.
  print("Logistic Regression Accuracy: ", metrics.accuracy_score(y_test, y_pred_log))
  # Linear regression output is continuous; 1 - MSE is kept as the
  # pseudo-accuracy used for this model throughout the notebook.
  print("Linear Regression Accuracy: ", 1 - metrics.mean_squared_error(y_test, y_pred_lin))

**Stratified 1-Hold Out method**

In [0]:
def stratifiedOneHoldout(X_train, Y_train):
  """Score both models on one stratified 80/20 hold-out split and print the results.

  Args:
    X_train: feature matrix to split into fit / validation parts.
    Y_train: class labels aligned with X_train; also used as the
      stratification key so both parts keep the same class proportions.

  Prints a heading followed by one line per model. Returns None.
  """
  # Fixed random_state keeps the split reproducible between runs.
  x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.2, random_state=1, stratify=Y_train)

  logReg = LogisticRegression(solver='liblinear', multi_class='ovr')
  linReg = LinearRegression()

  logReg.fit(x_train, y_train)
  linReg.fit(x_train, y_train)

  y_pred_log = logReg.predict(x_test)
  y_pred_lin = linReg.predict(x_test)

  print("Stratified")
  # The classifier predicts class labels, so accuracy is the fraction of exact
  # matches. 1 - MSE would penalise a 0-vs-2 confusion four times as much as a
  # neighbouring-class confusion and would disagree with kFold's
  # scoring='accuracy'.
  print("Logistic Regression Accuracy: ", metrics.accuracy_score(y_test, y_pred_log))
  # Linear regression output is continuous; 1 - MSE is kept as the
  # pseudo-accuracy used for this model throughout the notebook.
  print("Linear Regression Accuracy: ", 1 - metrics.mean_squared_error(y_test, y_pred_lin))

**Displaying accuracies for all validation methods**

In [0]:
def displayAccuracy(X, Y):
    """Run every validation method on the training portion of (X, Y).

    Args:
        X: feature matrix (raw or preprocessed).
        Y: class labels aligned with X.

    A random 20% of the samples is set aside first (random_state=1) and is not
    used here; all four validation methods only see the remaining 80%.
    Results are printed by the called functions. Returns None.
    """
    # train_test_split returns [X_train, X_test, Y_train, Y_test].
    parts = train_test_split(X, Y, test_size=0.2, random_state=1)
    train_features, train_labels = parts[0], parts[2]

    for fold_count in (5, 10):
        kFold(fold_count, train_features, train_labels)

    for evaluate in (randomOneHoldout, stratifiedOneHoldout):
        evaluate(train_features, train_labels)

**Round method for linear regression prediction**

In [0]:
def roundPredict(p):
    """Map continuous regression outputs onto the class labels 0, 1 and 2.

    Args:
        p: indexable sequence of scores that supports .copy() (for example a
           1-D array or a list).

    Returns:
        A copy of p in which every score <= 0.5 becomes 0, every score >= 1.5
        becomes 2, and everything else becomes 1. The input is not modified.
    """
    def nearest_label(score):
        # Branch order matters: anything failing both comparisons falls
        # through to the middle class.
        if score <= 0.5:
            return 0
        if score >= 1.5:
            return 2
        return 1

    labels = p.copy()
    for idx, score in enumerate(labels):
        labels[idx] = nearest_label(score)
    return labels

**Main**

In [0]:
# Entry cell: load Iris, build two preprocessed variants of the features, and
# print the scores of every validation method for each variant.
if __name__ == '__main__':

  iris = datasets.load_iris()
  
  # X and Y stay in the notebook namespace; the "Final" cells split them again.
  X = iris.data
  Y = iris.target
  
  # L1 normalization: each sample (row) is rescaled to unit L1 norm.
  l1_norm = preprocessing.normalize(X, norm="l1")
  # Mean removal: each feature (column) is standardized to zero mean and unit variance.
  # NOTE(review): the statistics are computed on the full dataset, before any
  # train/test split is made inside displayAccuracy.
  mean_removal = preprocessing.scale(X)

  # Disabled sanity check of the standardization, kept as a bare string literal;
  # the string is evaluated and discarded, so nothing below is printed.
  '''#mean & standart deviation before mean removal 
  print(X.mean(axis=0))
  print(X.std(axis=0))

  #mean & standart deviation after mean removal 
  print(mean_removal.mean(axis=0))
  print(mean_removal.std(axis=0))'''

  # Print the results of every validation method and model, once per feature variant.
  print("\nRaw: ")
  displayAccuracy(X,Y)
  print("\nL1 Normalization: ")
  displayAccuracy(l1_norm,Y)
  print("\nMean Removal: ")
  displayAccuracy(mean_removal,Y)


Raw: 
5Fold
Logistic Regression Accuracy:  0.9416666666666668
Linear Regression Accuracy:  0.9505573483173648
10Fold
Logistic Regression Accuracy:  0.9333333333333332
Linear Regression Accuracy:  0.9504757453551504
Random One Hold Out
Logistic Regression Accuracy:  0.9583333333333334
Linear Regression Accuracy:  0.9596940114241354
Stratified
Logistic Regression Accuracy:  0.9583333333333334
Linear Regression Accuracy:  0.9613449341091412

L1 Normalization: 
5Fold
Logistic Regression Accuracy:  0.6916666666666667
Linear Regression Accuracy:  0.9249382972717285
10Fold
Logistic Regression Accuracy:  0.6916666666666667
Linear Regression Accuracy:  0.9228036562601725
Random One Hold Out
Logistic Regression Accuracy:  0.75
Linear Regression Accuracy:  0.9311930338541666
Stratified
Logistic Regression Accuracy:  0.7083333333333333
Linear Regression Accuracy:  0.9382756551106771

Mean Removal: 
5Fold
Logistic Regression Accuracy:  0.8916666666666668
Linear Regression Accuracy:  0.950557348317

# **Final**
**Training and Testing using:**
* **Raw data**
* **Stratified 1-Hold Out**
* **Linear Regression**






Dividing Train and Test sets

In [0]:
 # Final experiment: stratified 70/30 split of the raw features and labels
 # (X and Y are the globals created in the Main cell).
 X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=1, stratify=Y)

Training

In [0]:
  # Split the training portion once more (stratified 70/30) so the model can be
  # validated without touching X_test / Y_test.
  x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.3, random_state=1, stratify=Y_train)

  # linReg is fitted on x_train only; the Testing cell reuses this fitted model.
  linReg = LinearRegression()
  linReg.fit(x_train, y_train)

  y_pred_lin = linReg.predict(x_test)
  rounded_lin = roundPredict(y_pred_lin)

  print("Rounded Stratify One Hold Out - Only train set")
  # rounded_lin holds class labels, so accuracy is the fraction of exact
  # matches. 1 - MSE only equals that when every mistake is off by exactly one
  # class (a 0-vs-2 confusion would count four times).
  print("Linear Regression Accuracy: ", metrics.accuracy_score(y_test, rounded_lin))

  # One row per validation sample, sorted by raw prediction:
  # raw prediction, true label, rounded label.
  print("\nY_pred_lin     \t       Y_test\trounded")
  for i, (j, k) in sorted(zip(y_pred_lin, zip(y_test, rounded_lin))):
    print(i , "\t" , j, "\t", k)

Rounded Stratify One Hold Out - Only train set
Linear Regression Accuracy:  0.90625

Y_pred_lin     	       Y_test	rounded
-0.12003692589839096 	 0 	 0.0
-0.09876468257733288 	 0 	 0.0
-0.06761673273426298 	 0 	 0.0
-0.06605430711304593 	 0 	 0.0
-0.05671651751464296 	 0 	 0.0
-0.04956319810682974 	 0 	 0.0
-0.045800804316914134 	 0 	 0.0
-0.03563775726864826 	 0 	 0.0
-0.026284469692136492 	 0 	 0.0
0.020085272234868445 	 0 	 0.0
0.8068694083069459 	 1 	 1.0
0.8110913595473883 	 1 	 1.0
0.8530611651369964 	 1 	 1.0
1.1169369554310995 	 1 	 1.0
1.153137882137157 	 1 	 1.0
1.1636723973574408 	 1 	 1.0
1.2493091259536646 	 1 	 1.0
1.2887540977198382 	 1 	 1.0
1.3012167385606752 	 1 	 1.0
1.3419522933999206 	 1 	 1.0
1.3547340317228116 	 1 	 1.0
1.3769156601394965 	 2 	 1.0
1.3867130071884262 	 2 	 1.0
1.3974940285903639 	 2 	 1.0
1.5242320120251693 	 2 	 2.0
1.5365828384801437 	 2 	 2.0
1.6640955152147843 	 2 	 2.0
1.6640955152147843 	 2 	 2.0
1.7749921209528452 	 2 	 2.0
1.8063213651089

Testing

In [0]:
  # Score the held-out test set with the model fitted in the Training cell
  # (it was fitted on x_train only and is not refitted on all of X_train).
  Y_pred_lin = linReg.predict(X_test)
  # Overwrites the rounded_lin produced for the validation split.
  rounded_lin = roundPredict(Y_pred_lin)

In [0]:
print("Rounded Stratify One Hold Out - Test set")
# rounded_lin holds class labels, so accuracy is the fraction of exact matches.
# 1 - MSE only equals that when every mistake is off by exactly one class
# (a 0-vs-2 confusion would count four times).
print("Linear Regression Accuracy: ", metrics.accuracy_score(Y_test, rounded_lin))


# One row per test sample, sorted by raw prediction:
# raw prediction, true label, rounded label.
print("\nY_pred_lin     \t      Y_test\trounded")
for i, (j, k) in sorted(zip(Y_pred_lin, zip(Y_test, rounded_lin))):
  print(i , "\t" , j, "\t", k)


Rounded Stratify One Hold Out - Test set
Linear Regression Accuracy:  0.9777777777777777

Y_pred_lin     	      Y_test	rounded
-0.13363750108183664 	 0 	 0.0
-0.11348362390957749 	 0 	 0.0
-0.1034927937855843 	 0 	 0.0
-0.06798243273359908 	 0 	 0.0
-0.04819425556067608 	 0 	 0.0
-0.046997529938795135 	 0 	 0.0
-0.04635998739131367 	 0 	 0.0
-0.03243454655313216 	 0 	 0.0
-0.026378327143327246 	 0 	 0.0
-0.019784190809913532 	 0 	 0.0
0.01563231279088087 	 0 	 0.0
0.032817296667287554 	 0 	 0.0
0.04143918424512705 	 0 	 0.0
0.09476876250488186 	 0 	 0.0
0.23697911442097686 	 0 	 0.0
0.8065037083076098 	 1 	 1.0
0.8198324409429103 	 1 	 1.0
0.8925591600097604 	 1 	 1.0
1.0759703923883095 	 1 	 1.0
1.1060212422333708 	 1 	 1.0
1.1118056190950303 	 1 	 1.0
1.1227058343146505 	 1 	 1.0
1.1815887812325196 	 1 	 1.0
1.187085817567925 	 1 	 1.0
1.2174240079392407 	 1 	 1.0
1.2497686834041914 	 1 	 1.0
1.2943449915064345 	 1 	 1.0
1.296085402224606 	 1 	 1.0
1.304142338555364 	 1 	 1.0
1.36816

Predicting [6, 3, 5, 1.5]

In [0]:
# Classify a single hand-written sample with the fitted linear model; predict
# expects a 2-D input, hence the nested list.
Y_pred = linReg.predict([[6, 3, 5, 1.5]])
# roundPredict already works on its own copy, so the extra .copy() is redundant but harmless.
rounded = roundPredict(Y_pred.copy())

In [0]:
# Bare expression: the notebook displays the rounded class label as the cell output.
rounded

array([1.])

Finding Threshold using ROC

In [0]:
# Disabled draft of a per-class ROC analysis, kept as a bare string literal so
# none of it executes. Because the string is the cell's last expression, the
# notebook echoes it as the cell output.
# NOTE(review): it cannot be enabled as written. The first loop's body is
# entirely commented out, n_classes is not defined in the visible cells, and
# Y_test[:, i] / Y_pred_lin[:, i] index a second axis, which presumably requires
# binarized labels and per-class scores. Confirm before reviving.
'''fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(3):
    #fpr[i], tpr[i], thresholds = roc_curve(Y_test[:, i], Y_pred_lin[:, i])
    #roc_auc[i] = auc(fpr[i], tpr[i])

for i in range(n_classes):
    plt.figure()
    plt.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f)' % roc_auc[i])
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()'''


'fpr = dict()\ntpr = dict()\nroc_auc = dict()\nfor i in range(3):\n    #fpr[i], tpr[i], thresholds = roc_curve(Y_test[:, i], Y_pred_lin[:, i])\n    #roc_auc[i] = auc(fpr[i], tpr[i])\n\nfor i in range(n_classes):\n    plt.figure()\n    plt.plot(fpr[i], tpr[i], label=\'ROC curve (area = %0.2f)\' % roc_auc[i])\n    plt.plot([0, 1], [0, 1], \'k--\')\n    plt.xlim([0.0, 1.0])\n    plt.ylim([0.0, 1.05])\n    plt.xlabel(\'False Positive Rate\')\n    plt.ylabel(\'True Positive Rate\')\n    plt.title(\'Receiver operating characteristic example\')\n    plt.legend(loc="lower right")\n    plt.show()'