<a href="https://colab.research.google.com/github/nisanuro/CNG562-Assignment-1/blob/master/CNG562_Assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import matplotlib.pyplot as plt  
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn import metrics, datasets, preprocessing
from sklearn.metrics import roc_curve, auc
%matplotlib inline

In [0]:
def kFold(foldNumber, X_train, Y_train):
  """Cross-validate a logistic and a linear regression model and print their mean scores.

  Args:
    foldNumber: number of splits given to KFold (folds are not shuffled).
    X_train: feature matrix passed to cross_val_score.
    Y_train: target values passed to cross_val_score.

  Prints a "<foldNumber>Fold" header, the mean cross-validated accuracy of the
  logistic model, and 1 plus the mean negated MSE of the linear model.
  """
  # a single unshuffled splitter is reused for both models
  splitter = KFold(n_splits=foldNumber, shuffle=False)

  log_scores = cross_val_score(
      LogisticRegression(solver='liblinear', multi_class='ovr'),
      X_train, Y_train, cv=splitter, scoring='accuracy')
  lin_scores = cross_val_score(
      LinearRegression(),
      X_train, Y_train, cv=splitter, scoring='neg_mean_squared_error')

  print(f"{foldNumber}Fold")
  print("Logistic Regression Accuracy: ", log_scores.mean())
  # scores are negated MSE, so this prints 1 - mean MSE rather than a classification accuracy
  print("Linear Regression Accuracy: ", 1 + lin_scores.mean())

In [0]:
def randomOneHoldout(X_train, Y_train):
  """Score both models on one random 80/20 hold-out split and print the results.

  Args:
    X_train: feature matrix to split into fit and evaluation parts.
    Y_train: class labels aligned with X_train.

  Prints the logistic model's classification accuracy and, for the linear
  model, 1 minus the mean squared error of its continuous predictions.
  """
  #splitting dataset as %80 train %20 test
  x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.2, random_state=1)

  #creating both linear & logistic regression models
  logReg = LogisticRegression(solver='liblinear', multi_class='ovr')
  linReg = LinearRegression()

  #training the models
  logReg.fit(x_train, y_train)
  linReg.fit(x_train, y_train)

  #predicting values
  y_pred_log = logReg.predict(x_test)
  y_pred_lin = linReg.predict(x_test)

  #displaying results
  print("Random One Hold Out")
  # accuracy is the share of exactly matching labels; 1 - MSE on class labels
  # squares the label distance, so a 0-vs-2 confusion would count four times
  print("Logistic Regression Accuracy: ", metrics.accuracy_score(y_test, y_pred_log))
  # linear predictions are continuous, so this stays 1 - MSE (same measure kFold reports)
  print("Linear Regression Accuracy: ", 1 - metrics.mean_squared_error(y_test, y_pred_lin))


In [0]:
def stratifiedOneHoldout(X_train, Y_train):
  """Score both models on one stratified 80/20 hold-out split and print the results.

  Args:
    X_train: feature matrix to split into fit and evaluation parts.
    Y_train: class labels aligned with X_train; also used as the stratify key.

  Prints the logistic model's classification accuracy and, for the linear
  model, 1 minus the mean squared error of its continuous predictions.
  """
  #splitting dataset as %80 train %20 test, stratified on the labels
  x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.2, random_state=1, stratify=Y_train)

  #creating both linear & logistic regression models
  logReg = LogisticRegression(solver='liblinear', multi_class='ovr')
  linReg = LinearRegression()

  #training the models
  logReg.fit(x_train, y_train)
  linReg.fit(x_train, y_train)

  #predicting values
  y_pred_log = logReg.predict(x_test)
  y_pred_lin = linReg.predict(x_test)

  #displaying results
  print("Stratified")
  # accuracy is the share of exactly matching labels; 1 - MSE on class labels
  # squares the label distance, so a 0-vs-2 confusion would count four times
  print("Logistic Regression Accuracy: ", metrics.accuracy_score(y_test, y_pred_log))
  # linear predictions are continuous, so this stays 1 - MSE (same measure kFold reports)
  print("Linear Regression Accuracy: ", 1 - metrics.mean_squared_error(y_test, y_pred_lin))

In [0]:
def displayAccuracy(X, Y):
    """Run every validation scheme on the training portion of (X, Y).

    The data is split 80/20 with a fixed seed; only the 80% training part is
    forwarded to the 5-fold, 10-fold, random hold-out and stratified hold-out
    routines, each of which prints its own results.
    """
    # the 20% test part of this split is not used inside this function
    X_train, _X_test, Y_train, _Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

    # k-fold cross validation, first with 5 folds and then with 10
    for fold_count in (5, 10):
        kFold(fold_count, X_train, Y_train)

    # single hold-out evaluations: random split first, stratified split second
    randomOneHoldout(X_train, Y_train)
    stratifiedOneHoldout(X_train, Y_train)

In [0]:
def roundPredict(p):
    """Return a copy of p with every value snapped to one of the labels 0, 1 or 2.

    Values <= 0.5 become 0, values >= 1.5 become 2, and everything else
    becomes 1. The input itself is left unmodified.
    """
    def _to_label(value):
        # guard clauses for the two outer classes; the middle class is the fallback
        if value <= 0.5:
            return 0
        if value >= 1.5:
            return 2
        return 1

    snapped = p.copy()
    for idx in range(len(snapped)):
        snapped[idx] = _to_label(snapped[idx])
    return snapped

In [0]:
#iris_data.mean(axis=0)
#iris_data.std(axis=0)

In [0]:
#stand_iris_data = preprocessing.scale(iris_data)

In [0]:
#stand_iris_data.mean(axis=0)
#stand_iris_data.std(axis=0)

In [233]:
if __name__ == '__main__':
  # load the iris dataset and pull out its feature matrix and target labels
  iris = datasets.load_iris()
  X, Y = iris.data, iris.target

  # two preprocessed variants of the raw features: L1-normalized and mean-removed (scaled)
  # NOTE(review): both are computed on the full dataset, before displayAccuracy splits it
  l1_norm = preprocessing.normalize(X, norm="l1")
  mean_removal = preprocessing.scale(X)

  # column-wise mean and standard deviation, first before and then after mean removal
  for matrix in (X, mean_removal):
    print(matrix.mean(axis=0))
    print(matrix.std(axis=0))

  # report every validation scheme for each variant of the features
  variants = (
      ("\nRaw: ", X),
      ("\nL1 Normalization: ", l1_norm),
      ("\nMean Removal: ", mean_removal),
  )
  for heading, features in variants:
    print(heading)
    displayAccuracy(features, Y)



[5.84333333 3.05733333 3.758      1.19933333]
[0.82530129 0.43441097 1.75940407 0.75969263]
[-1.69031455e-15 -1.84297022e-15 -1.69864123e-15 -1.40924309e-15]
[1. 1. 1. 1.]

Raw: 
5Fold
Logistic Regression Accuracy:  0.9416666666666668
Linear Regression Accuracy:  0.9505573483173648
10Fold
Logistic Regression Accuracy:  0.9333333333333332
Linear Regression Accuracy:  0.9504757453551504
Random One Hold Out
Logistic Regression Accuracy:  0.9583333333333334
Linear Regression Accuracy:  0.9596940114241354
Stratified
Logistic Regression Accuracy:  0.9583333333333334
Linear Regression Accuracy:  0.9613449341091412

L1 Normalization: 
5Fold
Logistic Regression Accuracy:  0.6916666666666667
Linear Regression Accuracy:  0.9249382972717285
10Fold
Logistic Regression Accuracy:  0.6916666666666667
Linear Regression Accuracy:  0.9228036562601725
Random One Hold Out
Logistic Regression Accuracy:  0.75
Linear Regression Accuracy:  0.9311930338541666
Stratified
Logistic Regression Accuracy:  0.70833333

In [0]:
  # stratified 70/30 split of the raw iris data with a fixed seed
  X_train, X_test, Y_train, Y_test = train_test_split(
      X, Y, test_size=0.3, stratify=Y, random_state=1)

  # fit() hands back the estimator, so construction and training are chained
  linReg = LinearRegression().fit(X_train, Y_train)
  # the logistic model is trained here but no prediction from it is taken in this cell
  logReg = LogisticRegression(solver='liblinear', multi_class='ovr').fit(X_train, Y_train)

  # continuous (unrounded) linear-regression outputs for the test portion
  Y_pred_lin = linReg.predict(X_test)

In [0]:
# snap each continuous linear-regression output in Y_pred_lin to a class label (0, 1 or 2)
rounded = roundPredict(Y_pred_lin)

In [236]:
print("Y_pred_lin     \t      Y_test\trounded")
# one row per test sample, ordered by ascending raw prediction
table_rows = sorted(zip(Y_pred_lin, zip(Y_test, rounded)))
for raw_pred, (true_label, rounded_pred) in table_rows:
  print(raw_pred , "\t" , true_label, "\t", rounded_pred)


Y_pred_lin     	      Y_test	rounded
-0.12929078285095597 	 0 	 0.0
-0.07564877833579955 	 0 	 0.0
-0.06637327075692942 	 0 	 0.0
-0.06559310558079762 	 0 	 0.0
-0.03613839155352583 	 0 	 0.0
-0.03256604997617557 	 0 	 0.0
-0.01928245835782255 	 0 	 0.0
-0.017406169009531847 	 0 	 0.0
-0.014015034818435357 	 0 	 0.0
-0.008446620426688894 	 0 	 0.0
-0.0036763632695827053 	 0 	 0.0
0.007758432451428959 	 0 	 0.0
0.01806864028230959 	 0 	 0.0
0.06901382278559495 	 0 	 0.0
0.20549801076730095 	 0 	 0.0
0.8510624563461295 	 1 	 1.0
0.8628810720015663 	 1 	 1.0
1.0006874555133378 	 1 	 1.0
1.1202817798301417 	 1 	 1.0
1.1655292630831342 	 1 	 1.0
1.1721089189171559 	 1 	 1.0
1.1746180965245236 	 1 	 1.0
1.2210814874157805 	 1 	 1.0
1.2311710906399813 	 1 	 1.0
1.266328970668773 	 1 	 1.0
1.3220728740471366 	 1 	 1.0
1.323294248436628 	 1 	 1.0
1.346446838672502 	 1 	 1.0
1.3827632370606238 	 1 	 1.0
1.383481978316607 	 1 	 1.0
1.490770021988134 	 2 	 1.0
1.5658740312613317 	 2 	 2.0
1.607362

In [237]:
# share of test samples whose rounded prediction equals the true label;
# 1 - MSE on class labels would weight a 0-vs-2 confusion four times as heavily
metrics.accuracy_score(Y_test, rounded)

0.9777777777777777

In [0]:
# predict one hand-written sample with the fitted linear model
# NOTE: this rebinds Y_pred_lin, replacing the test-set predictions stored under that name
Y_pred_lin = linReg.predict([[6, 3, 5, 1.5]])
# roundPredict already works on its own copy of the argument, so this .copy() is redundant but harmless
rounded = roundPredict(Y_pred_lin.copy())

In [239]:
# display the rounded class label for the single sample predicted in the previous cell
rounded

array([1.])

Rounded linear-regression predictions, evaluated on a hold-out split of the training set only

In [240]:
  # hold out 30% of the training portion (stratified, fixed seed); X_test / Y_test are not touched here
  x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.3, random_state=1, stratify=Y_train)

  # NOTE: rebinding linReg replaces the model that was fitted on the whole training portion
  linReg = LinearRegression()

  linReg.fit(x_train, y_train)

  # continuous outputs, then the same outputs snapped to class labels
  y_pred_lin = linReg.predict(x_test)
  round_lin = roundPredict(y_pred_lin)

  print("Rounded Stratify One Hold Out - Only train set")
  # accuracy is the share of exactly matching labels; 1 - MSE on class labels
  # squares the label distance, so a 0-vs-2 confusion would count four times
  print("Linear Regression Accuracy: ", metrics.accuracy_score(y_test, round_lin))

  # one row per held-out sample, ordered by ascending raw prediction
  print("\nY_pred_lin     \t       Y_test\trounded")
  for i, (j, k) in sorted(zip(y_pred_lin, zip(y_test, round_lin))):
    print(i , "\t" , j, "\t", k)

Rounded Stratify One Hold Out - Only train set
Linear Regression Accuracy:  0.90625

Y_pred_lin     	       Y_test	rounded
-0.12003692589839096 	 0 	 0.0
-0.09876468257733288 	 0 	 0.0
-0.06761673273426298 	 0 	 0.0
-0.06605430711304593 	 0 	 0.0
-0.05671651751464296 	 0 	 0.0
-0.04956319810682974 	 0 	 0.0
-0.045800804316914134 	 0 	 0.0
-0.03563775726864826 	 0 	 0.0
-0.026284469692136492 	 0 	 0.0
0.020085272234868445 	 0 	 0.0
0.8068694083069459 	 1 	 1.0
0.8110913595473883 	 1 	 1.0
0.8530611651369964 	 1 	 1.0
1.1169369554310995 	 1 	 1.0
1.153137882137157 	 1 	 1.0
1.1636723973574408 	 1 	 1.0
1.2493091259536646 	 1 	 1.0
1.2887540977198382 	 1 	 1.0
1.3012167385606752 	 1 	 1.0
1.3419522933999206 	 1 	 1.0
1.3547340317228116 	 1 	 1.0
1.3769156601394965 	 2 	 1.0
1.3867130071884262 	 2 	 1.0
1.3974940285903639 	 2 	 1.0
1.5242320120251693 	 2 	 2.0
1.5365828384801437 	 2 	 2.0
1.6640955152147843 	 2 	 2.0
1.6640955152147843 	 2 	 2.0
1.7749921209528452 	 2 	 2.0
1.8063213651089

# Finding Threshold using ROC

In [241]:
# Disabled draft of per-class ROC plotting, kept as a bare triple-quoted string.
# The string is the last expression of the cell, so the notebook echoes it as the cell output.
# NOTE(review): it cannot be re-enabled as written - the body of the first loop is fully
# commented out, and n_classes is not defined anywhere in the visible notebook.
'''fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(3):
    #fpr[i], tpr[i], thresholds = roc_curve(Y_test[:, i], Y_pred_lin[:, i])
    #roc_auc[i] = auc(fpr[i], tpr[i])

for i in range(n_classes):
    plt.figure()
    plt.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f)' % roc_auc[i])
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()'''


'fpr = dict()\ntpr = dict()\nroc_auc = dict()\nfor i in range(3):\n    #fpr[i], tpr[i], thresholds = roc_curve(Y_test[:, i], Y_pred_lin[:, i])\n    #roc_auc[i] = auc(fpr[i], tpr[i])\n\nfor i in range(n_classes):\n    plt.figure()\n    plt.plot(fpr[i], tpr[i], label=\'ROC curve (area = %0.2f)\' % roc_auc[i])\n    plt.plot([0, 1], [0, 1], \'k--\')\n    plt.xlim([0.0, 1.0])\n    plt.ylim([0.0, 1.05])\n    plt.xlabel(\'False Positive Rate\')\n    plt.ylabel(\'True Positive Rate\')\n    plt.title(\'Receiver operating characteristic example\')\n    plt.legend(loc="lower right")\n    plt.show()'