<a href="https://colab.research.google.com/github/negar67/Python/blob/master/MultiClassClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Load a dataset and train two models to perform multiclass classification,
# then compare the results of the two models.
# Dataset: the "digits" dataset from scikit-learn's datasets module.
# Goal: correctly identify the digits 0 through 9.

#importing all necessary libraries and dataset
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits #import load_digits function from the sklearn's datasets library
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

In [0]:
# Load the digits dataset and unpack it into the feature matrix X
# and the label vector y.
digits = load_digits()
X = digits.data
y = digits.target

In [26]:
# Exploratory Data Analysis, step 1:
# show the dataset dimensions as (number of rows, number of features).
np.shape(X)

(1797, 64)

In [29]:
# Count how many samples carry each label.
# The result shows 10 classes with roughly 180 samples each,
# so the dataset is balanced.
samples_per_class = np.bincount(y)
samples_per_class

array([178, 182, 177, 183, 181, 182, 181, 179, 174, 180])

In [0]:
# Partition the data into a training set and a testing set.
# 20% of the samples are held out for testing.
# random_state is fixed so that the split is repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=25,
)

In [39]:
# Build a LogisticRegression classifier and score it with 5-fold cross validation.
# Solver is 'lbfgs', the multiclass strategy is one-vs-rest ('ovr'),
# and the solver is allowed up to 1000 iterations.
# NOTE(review): cross validation runs on the full X, y rather than on
# X_train, y_train, so the held-out test samples are also scored here -- confirm this is intended.
# NOTE(review): the multi_class argument appears to be deprecated in recent
# scikit-learn releases -- verify against the installed version.
lr_clf = LogisticRegression(
    solver='lbfgs',
    multi_class='ovr',
    max_iter=1000,
)
lr_cv_scores = cross_val_score(lr_clf, X, y, cv=5)

print('Accuracy scores for the 5 folds: ', lr_cv_scores)
print('Mean cross validation score: {:.3f}'.format(np.mean(lr_cv_scores)))

Accuracy scores for the 5 folds:  [0.90833333 0.87777778 0.94428969 0.9637883  0.8913649 ]
Mean cross validation score: 0.917


In [40]:
# Build a RandomForest classifier (24 trees) and score it with 5-fold cross validation.
# random_state is pinned to the same seed used for the train/test split:
# the forest is built with randomness, so without a fixed seed the fold
# scores printed below change on every run and the comparison with the
# LogisticRegression scores is not repeatable.
rf_clf = RandomForestClassifier(n_estimators=24, random_state=25)
rf_cv_scores = cross_val_score(rf_clf, X, y, cv=5)

# Per-fold accuracies followed by their average.
print('Accuracy scores for the 5 folds: ', rf_cv_scores)
print('Mean cross validation score: {:.3f}'.format(rf_cv_scores.mean()))

Accuracy scores for the 5 folds:  [0.93333333 0.89166667 0.95821727 0.95821727 0.91086351]
Mean cross validation score: 0.930
