In [97]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pylab as plt
#%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer

from scipy.stats import norm
from scipy import stats
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score

#DS
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#KNN
from sklearn.neighbors import KNeighborsClassifier

#GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier

#LogisticRegressionCV
from sklearn.linear_model import LogisticRegressionCV

#RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier


In [98]:
#loading the data from CSV file 
data=pd.read_csv('finalDataset.csv')


In [99]:
# Create a MinMaxScaler object for numrical data
scaler = MinMaxScaler()

# Scaling the raw input features 
feature_cols=data.columns[:-1]
X= scaler.fit_transform(data[feature_cols])

print(f"The range of feature inputs are within {X.min()} to {X.max()}")

The range of feature inputs are within 0.0 to 1.0


In [100]:

from sklearn.model_selection import StratifiedShuffleSplit

# Get the split indexes
strat_shuf_split = StratifiedShuffleSplit(n_splits=1, 
                                          test_size=0.3, random_state=0)

train_idx, test_idx = next(strat_shuf_split.split(data[feature_cols], data['Rate']))

# Create the dataframes
X_train = data.loc[train_idx, feature_cols]
y_train = data.loc[train_idx, 'Rate']

X_test  = data.loc[test_idx, feature_cols]
y_test  = data.loc[test_idx, 'Rate']

print(f"Training dataset shape, X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"Testing dataset shape, X_test: {X_test.shape}, y_test: {y_test.shape}")

Training dataset shape, X_train: (217, 19), y_train: (217,)
Testing dataset shape, X_test: (94, 19), y_test: (94,)


In [101]:
y_train.value_counts(normalize=True)

5    0.718894
4    0.161290
3    0.092166
1    0.009217
0    0.009217
2    0.009217
Name: Rate, dtype: float64

In [102]:
y_test.value_counts(normalize=True)

5    0.723404
4    0.159574
3    0.085106
2    0.010638
1    0.010638
0    0.010638
Name: Rate, dtype: float64

In [103]:
# DecisionTreeClassifier
#importing the classfier
clf=DecisionTreeClassifier(random_state=0)
clf2=clf.fit(X_train,y_train)

pred = clf2.predict(X_test)

In [104]:
# DecisionTree opt


#optimization
param_grid={
"max_depth":[2,4,6],
"min_samples_split":[2,5,10],
"min_samples_leaf":[1,2,4]}
grid_search= GridSearchCV(estimator=clf,param_grid=param_grid,cv=5)

grid_search.fit(X_train,y_train)
print("Best hyper-param: ",grid_search.best_params_ )
print("Best estimator: ",grid_search.best_estimator_ )
print("Best score: ",grid_search.best_score_ )



Best hyper-param:  {'max_depth': 6, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best estimator:  DecisionTreeClassifier(max_depth=6, random_state=0)
Best score:  0.977061310782241


In [105]:
# KNN
#importing the classfier

kCls=KNeighborsClassifier(n_neighbors=9)
kCls.fit(X_train,y_train)

KNeighborsClassifier(n_neighbors=9)

In [106]:
# GradientBoostingClassifier
#importing the classfier

gb_clf2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.1, max_features=4, max_depth=2, random_state=0)
gb_clf2.fit(X_train, y_train)
predictions = gb_clf2.predict(X_test)

In [107]:
#LogisticRegressionCV
lr_l1 = LogisticRegressionCV(Cs=10, cv=4, penalty='l1', solver='liblinear').fit(X_train, y_train)



In [108]:
#RandomForestClassifier
RF= RandomForestClassifier(criterion="gini",
                           max_depth=8,
                           min_samples_split=10,
                           random_state= 200)

In [109]:

def report( y_test, pred ):
    #report 
    print(classification_report(y_test,pred,target_names=['0','1','2','3','4','5']))


def confusionMatrix():
    #confusion_matrix
    #the result will show how mwny sucessful predition and wrong from each class

    cm = confusion_matrix(y_test, pred)
    plt.figure(figsize=(10,7))

    sns.heatmap(cm, annot=True, fmt='.2g', cmap='Blues')

    # TN   FP
    # FN   TP

In [110]:


models=[clf2,kCls,gb_clf2,gb_clf2,lr_l1,RF]

labels=['DecisionTreeClassifier','KNeighborsClassifier','GradientBoostingClassifier','LogisticRegressionCV','RandomForestClassifier']

metrics=[]


def PredictionAndPerformance():
    y_pred =[]
    
    for lab,mod in zip(labels, models):
        y_pred.append(pd.Series(mod.predict(X_test), name=lab))

    y_pred = pd.concat(y_pred, axis=1)

    for i in models:
        # Preciision, recall, f-score from the multi-class support function
        precision, recall, fscore, _ = score(y_test, y_pred[i], average='weighted')

        # The usual way to calculate accuracy
        accuracy = accuracy_score(y_test, y_pred[i])
        metrics.append(pd.Series({'precision':precision, 'recall':recall,'fscore':fscore, 'accuracy':accuracy}))

    metrics = pd.concat(metrics, axis=1)
    metrics

PredictionAndPerformance()



KeyError: DecisionTreeClassifier(random_state=0)