In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("iphone_dataset.csv")
df

Unnamed: 0,Gender,Age,Salary,Purchase Iphone
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0
...,...,...,...,...
395,Female,46,41000,1
396,Male,51,23000,1
397,Female,50,20000,1
398,Male,36,33000,0


In [3]:
X = df.iloc[:, :-1].values
Y = df.iloc[:, 3].values

In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [5]:
#dealing with the categorical datatype of Column Gender
gender_Encoder = LabelEncoder()
X[:, 0] = gender_Encoder.fit_transform(X[:, 0])

In [6]:
#scaling the dataset
sc = StandardScaler()
X = sc.fit_transform(X)

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [8]:
# Instantiating the classifiers
knn_model = KNeighborsClassifier(n_neighbors=5, p=2)
log_model = LogisticRegression(solver="liblinear", max_iter=10000)
linearSVC_model = LinearSVC(random_state=1337)
kernel_svc_model = SVC()
dtr_model = DecisionTreeClassifier(criterion="entropy")
gnb_model = GaussianNB()

In [9]:
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted'),
    'f1_score': make_scorer(f1_score, average='weighted'),
}

In [10]:
features = X
target = Y
folds = KFold(n_splits=10, random_state=1337, shuffle=True)

# Performing cross-validation on each classifier
kNeighbors_result = cross_validate(knn_model,
                                   features,
                                   target,
                                   cv=folds,
                                   scoring=scoring)
linearSVC_result = cross_validate(linearSVC_model,
                                  features,
                                  target,
                                  cv=folds,
                                  scoring=scoring)
SVC_result = cross_validate(kernel_svc_model,
                            features,
                            target,
                            cv=folds,
                            scoring=scoring)
gaussianNB_result = cross_validate(gnb_model,
                                   features,
                                   target,
                                   cv=folds,
                                   scoring=scoring)
logisticRegression_result = cross_validate(log_model,
                                           features,
                                           target,
                                           cv=folds,
                                           scoring=scoring)
decisionTreeClassifier_result = cross_validate(dtr_model,
                                               features,
                                               target,
                                               cv=folds,
                                               scoring=scoring)


# Create a data frame with the models perfoamnce metrics scores
models_scores_table = pd.DataFrame(
    {
        'KNN': [
            kNeighbors_result['test_accuracy'].mean(),
            kNeighbors_result['test_precision'].mean(),
            kNeighbors_result['test_f1_score'].mean()
        ],
        'Linear SVC': [
            linearSVC_result['test_accuracy'].mean(),
            linearSVC_result['test_precision'].mean(),
            linearSVC_result['test_f1_score'].mean()
        ],
        'Kernel SVC': [
            SVC_result['test_accuracy'].mean(),
            SVC_result['test_precision'].mean(),
            SVC_result['test_f1_score'].mean()
        ],
        'Gaussian NB': [
            gaussianNB_result['test_accuracy'].mean(),
            gaussianNB_result['test_precision'].mean(),
            gaussianNB_result['test_f1_score'].mean()
        ],
        'Logistic Regression': [
            logisticRegression_result['test_accuracy'].mean(),
            logisticRegression_result['test_precision'].mean(),
            logisticRegression_result['test_f1_score'].mean()
        ],
        'Decision Tree': [
            decisionTreeClassifier_result['test_accuracy'].mean(),
            decisionTreeClassifier_result['test_precision'].mean(),
            decisionTreeClassifier_result['test_f1_score'].mean()
        ]
    },
    index=[
        'Accuracy',
        'Precision',
        'F1 Score',
    ])

In [11]:
models_scores_table

Unnamed: 0,KNN,Linear SVC,Kernel SVC,Gaussian NB,Logistic Regression,Decision Tree
Accuracy,0.9125,0.8375,0.905,0.8825,0.8475,0.8675
Precision,0.917836,0.841972,0.91049,0.88799,0.849491,0.872998
F1 Score,0.912728,0.832183,0.905961,0.881607,0.844321,0.86794


In [12]:
models_scores_table['Best Score'] = models_scores_table.idxmax(axis=1)

In [13]:
models_scores_table

Unnamed: 0,KNN,Linear SVC,Kernel SVC,Gaussian NB,Logistic Regression,Decision Tree,Best Score
Accuracy,0.9125,0.8375,0.905,0.8825,0.8475,0.8675,KNN
Precision,0.917836,0.841972,0.91049,0.88799,0.849491,0.872998,KNN
F1 Score,0.912728,0.832183,0.905961,0.881607,0.844321,0.86794,KNN


In [14]:
#Results - KNN is the best performing classifier on all 3 metrics