# Classification ML Quiz

In [1]:
# import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# preliminary assessment function

def assess_data(file, encoding='utf-8'):
    
    # read file
    df = pd.read_csv(file, encoding=encoding)
    
    # check header
    print("The First Five Observations in DataFrame")
    display(df.head())
    print('\n')
    
    # check tail
    print("The Last Five Observations in DataFrame")
    display(df.tail())
    print('\n')
    
    # check shape of df
    print("The Shape of DataFrame")
    print(df.shape)
    print('\n')
    
    # check info of df
    print("Basic Information of DataFrame")
    print('\n')
    display(df.info())
    print('\n')
    
    # check number of unique values in df
    print("Number of Unique Values in DataFrame")
    print('\n')
    print(df.nunique())
    print('\n')
    
    # check number of missing values in df
    print("Number of Missing Values in DataFrame")
    print('\n')
    print(df.isnull().sum())
    print('\n')
    
    # check number duplicates in df
    print("Number of Duplicates in DataFrame")
    print("Number of duplicates: ", df.duplicated().sum())
    
    return df

In [3]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00471/Data_for_UCI_named.csv'

# load data and assess data
df = assess_data(url)

The First Five Observations in DataFrame


Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable




The Last Five Observations in DataFrame


Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
9995,2.930406,9.487627,2.376523,6.187797,3.343416,-0.658054,-1.449106,-1.236256,0.601709,0.779642,0.813512,0.608385,0.023892,unstable
9996,3.392299,1.274827,2.954947,6.894759,4.349512,-1.663661,-0.952437,-1.733414,0.502079,0.567242,0.28588,0.36612,-0.025803,stable
9997,2.364034,2.84203,8.776391,1.008906,4.299976,-1.380719,-0.943884,-1.975373,0.487838,0.986505,0.149286,0.145984,-0.03181,stable
9998,9.631511,3.994398,2.757071,7.821347,2.514755,-0.96633,-0.649915,-0.89851,0.365246,0.587558,0.889118,0.818391,0.037789,unstable
9999,6.530527,6.78179,4.349695,8.673138,3.492807,-1.390285,-1.532193,-0.570329,0.073056,0.505441,0.378761,0.942631,0.045263,unstable




The Shape of DataFrame
(10000, 14)


Basic Information of DataFrame


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


None



Number of Unique Values in DataFrame


tau1     10000
tau2     10000
tau3     10000
tau4     10000
p1       10000
p2       10000
p3       10000
p4       10000
g1       10000
g2       10000
g3       10000
g4       10000
stab     10000
stabf        2
dtype: int64


Number of Missing Values in DataFrame


tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64


Number of Duplicates in DataFrame
Number of duplicates:  0


In [4]:
# make a copy
df_class = df.copy()

In [5]:
# drop stab column
df_class = df_class.drop('stab', axis=1)
df_class.columns

Index(['tau1', 'tau2', 'tau3', 'tau4', 'p1', 'p2', 'p3', 'p4', 'g1', 'g2',
       'g3', 'g4', 'stabf'],
      dtype='object')

In [6]:
# divide data to feature and target
X = df_class.drop(columns='stabf')
y = df_class['stabf']

In [7]:
# split the data into training and testing sets
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
y_train.value_counts()

unstable    5092
stable      2908
Name: stabf, dtype: int64

In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# transform training data
normalised_train_df = scaler.fit_transform(x_train)

# transform test data
x_test = x_test.reset_index(drop=True)
normalised_test_df = scaler.transform(x_test)

In [9]:
# import classification models and performance metrics
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix
from sklearn.model_selection import cross_val_score
import xgboost as xgb
import lightgbm as lgb

### Random Forest

In [10]:
# Random forest
rand_model = RandomForestClassifier(random_state=1)
rand_model.fit(normalised_train_df, y_train)
rand_pred = rand_model.predict(normalised_test_df)

In [11]:
# Confusion Matrix
cnf_mat = confusion_matrix(y_true=y_test, y_pred=rand_pred, labels=['unstable', 'stable'])
cnf_mat

array([[1233,   55],
       [  87,  625]], dtype=int64)

In [12]:
# Accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=rand_pred)
print('Accuracy: {}'.format(round(accuracy*100), 2))

Accuracy: 93


In [13]:
# Precision
precision = precision_score(y_true=y_test, y_pred=rand_pred, pos_label='unstable')
print('Precision: {}'.format(round(precision*100), 2))

Precision: 93


In [14]:
# Recall
recall = recall_score(y_true=y_test, y_pred=rand_pred, pos_label='unstable')
print('Recall: {}'.format(round(recall*100), 2))

Recall: 96


In [15]:
# F1-Score
f1 = f1_score(y_true=y_test, y_pred=rand_pred, pos_label='unstable')
print('F1: {}'.format(round(f1*100), 2))

F1: 95


In [16]:
# Cross validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rand_model, normalised_train_df, y_train, cv=5, scoring='f1_macro')
scores

array([0.91143756, 0.91136454, 0.91564855, 0.90214725, 0.91555674])

### Extra Tree

In [17]:
# Extra tree
extra_model = ExtraTreesClassifier(random_state=1)
extra_model.fit(normalised_train_df, y_train)
extra_pred = extra_model.predict(normalised_test_df)
extra_pred[:10]

array(['unstable', 'unstable', 'stable', 'stable', 'unstable', 'stable',
       'unstable', 'unstable', 'unstable', 'stable'], dtype=object)

In [18]:
# Confusion Matrix
cnf_mat = confusion_matrix(y_true=y_test, y_pred=extra_pred, labels=['unstable', 'stable'])
cnf_mat

array([[1250,   38],
       [ 106,  606]], dtype=int64)

In [19]:
# Accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=extra_pred)
print('Accuracy: {}'.format(round(accuracy*100), 2))

Accuracy: 93


In [20]:
# Precision
precision = precision_score(y_true=y_test, y_pred=extra_pred, pos_label='unstable')
print('Precision: {}'.format(round(precision*100), 2))

Precision: 92


In [21]:
# Recall
recall = recall_score(y_true=y_test, y_pred=extra_pred, pos_label='unstable')
print('Recall: {}'.format(round(recall*100), 2))

Recall: 97


In [22]:
# F1-Score
f1 = f1_score(y_true=y_test, y_pred=extra_pred, pos_label='unstable')
print('F1: {}'.format(round(f1*100), 2))

F1: 95


In [23]:
# Cross validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(extra_model, normalised_train_df, y_train, cv=5, scoring='f1_macro')
scores

array([0.91104788, 0.90890596, 0.91695487, 0.91218763, 0.92433757])

### Xgboost

In [24]:
# Xgboost
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y_train_x = encoder.fit_transform(y_train)
y_test_x = encoder.transform(y_test)
xgb_model = xgb.XGBClassifier(random_state=1)
xgb_model.fit(normalised_train_df, y_train_x)
xgb_pred = xgb_model.predict(normalised_test_df)
xgb_pred[:10]

array([1, 1, 0, 0, 1, 0, 1, 1, 1, 0])

In [25]:
# Confusion Matrix
cnf_mat = confusion_matrix(y_true=y_test_x, y_pred=xgb_pred, labels=[1, 0])
cnf_mat

array([[1243,   45],
       [  64,  648]], dtype=int64)

In [38]:
# Accuracy
accuracy = accuracy_score(y_true=y_test_x, y_pred=xgb_pred)
print('Accuracy: {}'.format(round(accuracy*100), 2))

Accuracy: 95


In [27]:
# Precision
precision = precision_score(y_true=y_test_x, y_pred=xgb_pred, pos_label=1)
print('Precision: {}'.format(round(precision*100), 2))

Precision: 95


In [28]:
# Recall
recall = recall_score(y_true=y_test_x, y_pred=xgb_pred, pos_label=1)
print('Recall: {}'.format(round(recall*100), 2))

Recall: 97


In [29]:
# F1-Score
f1 = f1_score(y_true=y_test_x, y_pred=xgb_pred, pos_label=1)
print('F1: {}'.format(round(f1*100), 2))

F1: 96


In [30]:
# Cross validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(xgb_model, normalised_train_df, y_train_x, cv=5, scoring='f1_macro')
scores

array([0.9366625 , 0.94470459, 0.93293748, 0.93646215, 0.94718274])

### Lightgbm

In [31]:
# Lightgbm
lgb_model = lgb.LGBMClassifier(random_state=1)
lgb_model.fit(normalised_train_df, y_train)
lgb_pred = lgb_model.predict(normalised_test_df)
lgb_pred[:10]

array(['unstable', 'unstable', 'stable', 'stable', 'unstable', 'stable',
       'unstable', 'unstable', 'unstable', 'stable'], dtype=object)

In [32]:
# Confusion Matrix
cnf_mat = confusion_matrix(y_true=y_test, y_pred=lgb_pred, labels=['unstable', 'stable'])
cnf_mat

array([[1238,   50],
       [  71,  641]], dtype=int64)

In [33]:
# Accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=lgb_pred)
print('Accuracy: {}'.format(round(accuracy*100), 2))

Accuracy: 94


In [34]:
# Precision
precision = precision_score(y_true=y_test, y_pred=lgb_pred, pos_label='unstable')
print('Precision: {}'.format(round(precision*100), 2))

Precision: 95


In [35]:
# Recall
recall = recall_score(y_true=y_test, y_pred=lgb_pred, pos_label='unstable')
print('Recall: {}'.format(round(recall*100), 2))

Recall: 96


In [36]:
# F1-Score
f1 = f1_score(y_true=y_test, y_pred=lgb_pred, pos_label='unstable')
print('F1: {}'.format(round(f1*100), 2))

F1: 95


In [37]:
# Cross validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(lgb_model, normalised_train_df, y_train, cv=5, scoring='f1_macro')
scores

array([0.92843386, 0.94466058, 0.93147894, 0.92695545, 0.94360434])