# Machine Learning Classifiers (scikit-learn)

## Authors : Rafsanjani Muhammod & Yeazullah Aziz

### Avoiding warning

In [1]:
# Avoiding warning
import warnings
def warn(*args, **kwargs): pass
warnings.warn = warn
# _______________________________

### Importing essential library

In [3]:
# Essential Library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# _____________________________

### Loading the dataset

In [14]:
# Load dataset
D = pd.read_csv('/home/rafsanjani/Desktop/monodata.csv', header=None)
D = D.drop_duplicates() # Return : each row are unique value
#   print(D.shape) # Return : row, column
# ____________________________________________________________________

print(D.shape)
print()
print(D)

(278, 21)

             0          1          2          3          4          5   \
0     31.878947   0.436414  14.864925  42.252833  10.276103   9.144008   
1     12.773493   0.857822   6.595729   5.183923   4.495137   7.829791   
2     29.506459   1.377185  19.167023  17.223346  17.435778  29.941994   
3     45.474052   2.171144  27.655692  22.063534  13.254630  35.628275   
4     27.059732   5.266138  24.234508  12.049606  13.223784  25.662224   
5     26.648576   7.201495  19.412428  22.140922  10.964234  27.467605   
6     29.119535   4.772717  18.706092   9.363417  10.772448  51.772431   
7     90.500000   2.000000  34.500000  42.000000  27.500000  46.500000   
8     16.419771   0.923602  10.350049  11.182768   5.879438  13.911936   
9      8.748706   0.313611   3.006630   4.761503   1.893849   4.508689   
10    29.974180   3.232539  19.795971  18.808486  11.644253  21.940464   
11    55.917130   2.164977  27.786322  30.427384  19.944195  46.601618   
12     9.516898   0.200129 

### Divide features ( X ) and classes ( y )

In [16]:
# Divide features (X) and classes (y)
X = D.iloc[:,0:20].values
y = D.iloc[:,20].values
# ____________________________________

print(X)
print()
print(y)

[[ 31.87894693   0.43641389  14.86492535 ...,  23.1241993    0.84182345
    4.19332   ]
 [ 12.77349297   0.85782219   6.59572893 ...,  15.53990602   0.80214332
    3.27665617]
 [ 29.50645867   1.37718472  19.16702346 ...,  21.79988149   2.23836279
   11.93772103]
 ..., 
 [  8.09076416   0.33332467   6.81628278 ...,   6.64677746   0.38156841
    2.35513374]
 [  4.71263264   1.4001818    0.82208071 ...,   4.91978346   2.30460257
    1.52740013]
 [  6.31705298   5.01289044   2.80333981 ...,   4.57157784   0.63311594
    2.98575905]]

[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1 

### Handling the missing values with "mean"

In [19]:
# Handle the missing values with "mean"
from sklearn.preprocessing import Imputer
X[:, [0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]] = \
    Imputer(strategy='mean').fit_transform(X[:, [0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19]])
# __________________________________________________________________________________________________

# You can use another stategy = 'median' & stategy = 'most_frequent'

### Spliting the dataset

In [21]:
# Spliting the dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, train_size=0.75, random_state=0)
# __________________________________________________________


### Features scalling

In [23]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scale = StandardScaler()
X_train = scale.fit_transform(X_train)
X_test = scale.fit_transform(X_test)
# _____________________________________________________________


### Machine Learning Classifiers

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier, \
                             RandomForestClassifier,\
                             AdaBoostClassifier,\
                             GradientBoostingClassifier
            
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier

# _______________________________________________________________________


### Evaluation Matrix

In [27]:
from sklearn.metrics import accuracy_score, \
                            log_loss, \
                            classification_report, \
                            confusion_matrix

# ____________________________________________________________________________________________________________

from pandas_ml import ConfusionMatrix   # I'm using 'pandas_ml' for better confusion matrix than 'scikit-learn'.


### Classifiers

In [28]:
classifiers = [
    LogisticRegression(),
    KNeighborsClassifier(n_neighbors=5),
    DecisionTreeClassifier(),
    SVC(kernel='rbf', probability=True),
    GaussianNB(),
    BaggingClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    XGBClassifier()
]

### Run all classifiers

In [29]:
for classifier in classifiers:

    model = classifier.fit(X_train, y_train)

    y_artificial = model.predict(X_test) # Predicted

    name = classifier.__class__.__name__

    TN, FP, FN, TP = confusion_matrix(y_true=y_test, y_pred=y_artificial).ravel()

    print('_' * 43)
    print('Classifier : {}'.format(name))
    print('Accuracy : {0:.3f} %'.format(accuracy_score(y_true=y_test, y_pred=y_artificial)*100.0))
    # print('_'*40)

    print()
    print('Confusion Matrix :')
    CM = ConfusionMatrix(y_true = y_test, y_pred = y_artificial)
    print(CM)
    print()

    print('TN = {}'.format(TN))
    print('FP = {}'.format(FP))
    print('FN = {}'.format(FN))
    print('TP = {}'.format(TP))
    print()

    # CM.print_stats() # For Statistics based-on confusion matrix.

# _______________________________________________________________________________________________________

___________________________________________
Classifier : LogisticRegression
Accuracy : 71.429 %

Confusion Matrix :
Predicted  -1   1  __all__
Actual                    
-1         22   9       31
1          11  28       39
__all__    33  37       70

TN = 22
FP = 9
FN = 11
TP = 28

___________________________________________
Classifier : KNeighborsClassifier
Accuracy : 65.714 %

Confusion Matrix :
Predicted  -1   1  __all__
Actual                    
-1         20  11       31
1          13  26       39
__all__    33  37       70

TN = 20
FP = 11
FN = 13
TP = 26

___________________________________________
Classifier : DecisionTreeClassifier
Accuracy : 64.286 %

Confusion Matrix :
Predicted  -1   1  __all__
Actual                    
-1         18  13       31
1          12  27       39
__all__    30  40       70

TN = 18
FP = 13
FN = 12
TP = 27

___________________________________________
Classifier : SVC
Accuracy : 70.000 %

Confusion Matrix :
Predicted  -1   1  __all__
Actual      