<a href="https://colab.research.google.com/github/mion158/data-mining-assignments/blob/main/evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [9]:
iris = pd.read_csv('https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv')

iris.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [10]:
X = iris.drop(['variety'], axis =1 )
y = iris['variety']

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)

In [29]:
#dummy classifier used as baseline for comparision against actual classifiers
from sklearn.dummy import DummyClassifier
#most frequent strategy
dummy_mostfreq = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train)
predictions = dummy_mostfreq.predict(X_test)

print(predictions)
print("")
print(dummy_mostfreq.score(X_test,y_test))
print(confusion_matrix(y_test, dummy_mostfreq.predict(X_test)))

['Virginica' 'Virginica' 'Virginica' 'Virginica' 'Virginica' 'Virginica'
 'Virginica' 'Virginica' 'Virginica' 'Virginica' 'Virginica' 'Virginica'
 'Virginica' 'Virginica' 'Virginica' 'Virginica' 'Virginica' 'Virginica'
 'Virginica' 'Virginica' 'Virginica' 'Virginica' 'Virginica' 'Virginica'
 'Virginica' 'Virginica' 'Virginica' 'Virginica' 'Virginica' 'Virginica'
 'Virginica' 'Virginica' 'Virginica' 'Virginica' 'Virginica' 'Virginica'
 'Virginica' 'Virginica']

0.23684210526315788
[[ 0  0 13]
 [ 0  0 16]
 [ 0  0  9]]


In [27]:
#stratified strategy produce random predictions with same class proportion as training set
dummy_proportion = DummyClassifier(strategy = 'stratified').fit(X_train, y_train)
predictions = dummy_proportion.predict(X_test)

print(predictions)
print("")
print(dummy_proportion.score(X_test,y_test))
print("")
print(confusion_matrix(y_test, dummy_proportion.predict(X_test)))

['Virginica' 'Virginica' 'Virginica' 'Setosa' 'Setosa' 'Versicolor'
 'Setosa' 'Setosa' 'Versicolor' 'Virginica' 'Versicolor' 'Setosa' 'Setosa'
 'Versicolor' 'Setosa' 'Setosa' 'Setosa' 'Setosa' 'Setosa' 'Versicolor'
 'Setosa' 'Versicolor' 'Setosa' 'Virginica' 'Setosa' 'Setosa' 'Setosa'
 'Virginica' 'Virginica' 'Setosa' 'Virginica' 'Virginica' 'Virginica'
 'Versicolor' 'Versicolor' 'Setosa' 'Setosa' 'Virginica']

0.39473684210526316

[[4 2 7]
 [5 2 9]
 [2 2 5]]


In [32]:
#try support vector machine
from sklearn.svm import SVC

svm = SVC(kernel='rbf', C=1).fit(X_train,y_train)
print(svm.score(X_test,y_test))
print("confusion matrix:")
print(confusion_matrix(y_test, svm.predict(X_test)))
print(classification_report(y_test, svm.predict(X_test)))


0.9736842105263158
confusion matrix:
[[13  0  0]
 [ 0 15  1]
 [ 0  0  9]]
              precision    recall  f1-score   support

      Setosa       1.00      1.00      1.00        13
  Versicolor       1.00      0.94      0.97        16
   Virginica       0.90      1.00      0.95         9

    accuracy                           0.97        38
   macro avg       0.97      0.98      0.97        38
weighted avg       0.98      0.97      0.97        38



In [43]:
#model selection using cross validation and grid search 

from sklearn.model_selection import cross_val_score, GridSearchCV

print('cross-validation accuracy ', cross_val_score(svm, X,y, cv=5))

print('-----------Grid search-------------')
grid_values = {'gamma': [0.001,0.01,0.1,1,10,100]}
grid_clf_acc = GridSearchCV(svm, param_grid = grid_values)
grid_clf_acc.fit(X_train,y_train)
score_acc = grid_clf_acc.decision_function(X_test)
print('Grid best params ', grid_clf_acc.best_params_)
print('Grid best score ', grid_clf_acc.best_score_)

cross-validation accuracy  [0.96666667 0.96666667 0.96666667 0.93333333 1.        ]
-----------Grid search-------------
Grid best params  {'gamma': 0.1}
Grid best score  0.9640316205533598


In [34]:
#try logistic regression
from sklearn.linear_model import LogisticRegression
#needed to increase max iteration
logreg = LogisticRegression(max_iter=200).fit(X_train,y_train)
print(logreg.score(X_test,y_test))
print("confusion matrix:")
print(confusion_matrix(y_test, logreg.predict(X_test)))
print(classification_report(y_test, logreg.predict(X_test)))

0.9736842105263158
confusion matrix:
[[13  0  0]
 [ 0 15  1]
 [ 0  0  9]]
              precision    recall  f1-score   support

      Setosa       1.00      1.00      1.00        13
  Versicolor       1.00      0.94      0.97        16
   Virginica       0.90      1.00      0.95         9

    accuracy                           0.97        38
   macro avg       0.97      0.98      0.97        38
weighted avg       0.98      0.97      0.97        38



In [35]:
#try decision tree
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(max_depth=3).fit(X_train,y_train)
print(tree.score(X_test,y_test))
print("confusion matrix:")
print(confusion_matrix(y_test, tree.predict(X_test)))
print(classification_report(y_test, tree.predict(X_test)))

0.9736842105263158
confusion matrix:
[[13  0  0]
 [ 0 15  1]
 [ 0  0  9]]
              precision    recall  f1-score   support

      Setosa       1.00      1.00      1.00        13
  Versicolor       1.00      0.94      0.97        16
   Virginica       0.90      1.00      0.95         9

    accuracy                           0.97        38
   macro avg       0.97      0.98      0.97        38
weighted avg       0.98      0.97      0.97        38

