In [None]:
import numpy as np
import pandas as pd

from pandas.plotting import scatter_matrix
import seaborn as sns
#sns.set_style("whitegrid")
import matplotlib.pylab as plt
%matplotlib inline 

# Data Processing

In [None]:
# Load the measurements. The file is read without a header row, so the
# column labels are supplied explicitly here.
cluster = pd.read_csv(
    'flowers.csv',
    header=None,
    names=["sepal length", "sepal width", "petal length", "petal width", "Class"],
)

In [None]:
# Preview the first rows of the loaded frame as a quick sanity check.
cluster.head()

In [None]:
# Report the size of the frame as loaded (row count, then column count)
print('Rows:', len(cluster), ' Colomns: ', len(cluster.columns))

In [None]:
# Print the pandas summary of the frame (see `DataFrame.info`).
cluster.info()

In [None]:
# Count the missing values in each column
cluster.isna().sum()

In [None]:
# Box plots of the data before outlier removal (drawn in red so they can be
# told apart from the post-cleaning plots, which are drawn in blue)
cluster.boxplot(color = 'red')

In [None]:
# Remove outliers with an IQR fence computed on the numeric columns only.
# Non-numeric columns (presumably just 'Class' — verify against the data) are
# kept out of the quantile and comparison arithmetic, which is not defined
# for text and raises a TypeError on recent pandas versions.
numeric_cols = cluster.select_dtypes(include='number')
Q1 = numeric_cols.quantile(0.25)
Q3 = numeric_cols.quantile(0.75)
IQR = Q3 - Q1

# A row is dropped when any of its numeric values lies more than 2.5 * IQR
# below Q1 or above Q3.
is_outlier = ((numeric_cols < (Q1 - 2.5 * IQR)) | (numeric_cols > (Q3 + 2.5 * IQR))).any(axis=1)
cluster = cluster[~is_outlier]
cluster

In [None]:
# Box plots of the data after outlier removal (drawn in blue; the
# pre-cleaning plots use red)
cluster.boxplot(color = 'blue')

In [None]:
# Report the size of the frame again (row count, then column count) so it can
# be compared with the size printed before the outlier filter
print('Rows:', len(cluster), ' Colomns: ', len(cluster.columns))

In [None]:
# Descriptive statistics for every column; include='all' also summarises
# the non-numeric columns instead of only the numeric ones
cluster.describe(include='all')

In [None]:
# Check how the rows are distributed across the class labels
cluster.loc[:, 'Class'].value_counts()

In [None]:
# Bar chart of the number of rows per class. The column is named by keyword
# because recent seaborn releases take `data` as the first positional
# argument, so a positionally passed Series is no longer read as `x`.
sns.countplot(x='Class', data=cluster)

# Visualising the data

# Univariate plots

In [None]:
# Box plot per column, each in its own panel of a 2x2 grid with
# independent axes
col_names = cluster.columns

cluster.loc[:, col_names].plot.box(subplots=True, layout=(2, 2), sharex=False, sharey=False)
plt.show()

In [None]:
# Histogram of each numeric column, one panel per column. No x-label is set
# here: `plt.xlabel` would only touch the last panel drawn, and that panel
# shows a measurement column rather than the class.
cluster[col_names].hist()
plt.tight_layout()
plt.show()

# Multivariate plots

In [None]:
# Pairwise scatter plots of the columns, coloured by class, with
# histograms on the diagonal
sns.pairplot(data=cluster, hue='Class', diag_kind='hist')

# Split the dataset

In [None]:
# Feature matrix: the four measurement columns. Target vector: the class label.
X = cluster.loc[:, ["sepal length", "sepal width", "petal length", "petal width"]]
y = cluster.loc[:, 'Class']

In [None]:
# Split the data into train and test datasets.
from sklearn.model_selection import train_test_split,KFold,cross_val_score

# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and every score derived from it) repeatable across runs; stratifying on y
# keeps the class proportions the same in both partitions.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

# Models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
np.random.seed(1000)
# NOTE(review): the cross-validation below runs with n_jobs=-1; whether this
# global seed reaches the parallel workers is not visible here — confirm, or
# pass random_state to DecisionTreeClassifier if CART scores must be repeatable.

# initialize kfold for cross-validation
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

# list of (name, estimator) pairs; filled by classification_Models and read
# again by the evaluation loop in a later cell
models = []


def classification_Models(xtrain, xtest, ytrain, ytest):
    """Cross-validate the candidate classifiers and box-plot their accuracy.

    (Re)fills the module-level ``models`` list with the four candidate
    estimators, scores each one with ``cross_val_score`` on the training
    data using the module-level ``k_fold`` splitter, prints the mean
    accuracy per model followed by the raw per-fold scores, and draws one
    box per model.

    Parameters
    ----------
    xtrain, ytrain
        Training features and labels passed to ``cross_val_score``.
    xtest, ytest
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    None
    """
    # Replace the list contents in place (rather than appending) so calling
    # this function more than once does not duplicate every model.
    models[:] = [
        ('LR', LogisticRegression()),
        ('CART', DecisionTreeClassifier()),
        ('KNN', KNeighborsClassifier()),
        ('SVM', SVC()),
    ]

    modeloutcomes = []
    modelnames = []
    for name, model in models:
        v_results = cross_val_score(model, xtrain, ytrain, cv=k_fold,
                                    scoring='accuracy', n_jobs=-1, verbose=0)
        print(name, v_results.mean())
        modeloutcomes.append(v_results)
        modelnames.append(name)

    print(modeloutcomes)

    fig, ax = plt.subplots()
    # Draw the boxes first: boxplot sets the x ticks itself, so tick labels
    # assigned before the call are overwritten with 1..N.
    ax.boxplot(modeloutcomes)
    ax.set_xticklabels(modelnames)
    ax.set_ylabel('accuracy')
    ax.set_title('Cross-validation accuracy per model')
    plt.show()


classification_Models(xtrain,xtest, ytrain, ytest)

# Evaluating and Predicting Models

In [None]:
# Fit every candidate on the training split, predict the held-out split and
# print accuracy, confusion matrix and per-class report for each model.
for name, model in models:
    trainedmodel = model.fit(xtrain, ytrain)

    # predictions on the held-out rows
    ypredict = trainedmodel.predict(xtest)

    # all three metrics compare the true test labels with the predictions
    acc = accuracy_score(ytest, ypredict)
    confMat = confusion_matrix(ytest, ypredict)
    classreport = classification_report(ytest, ypredict)

    print(
        f'\n****************************{name}',
        f'The accuracy: {acc}',
        f'The Confusion Matrix:\n {confMat}',
        f'The Classification Report:\n {classreport}',
        sep='\n',
    )