## Library

In [None]:
import numpy as np
import pandas as pd
import time
import random
import functools
import sys

## Preprocessing

In [None]:
def get_mode(column):
    mode = []
    appear = dict((a, column.count(a)) for a in column)   # count appearance times of each key
    if max(appear.values()) == 1:       # if max time is 1
        return      # no mode here
    else:
        for k, v in appear.items():     # else, mode is the number which has max time
            if v == max(appear.values()):
                mode.append(k)
    return mode[0]  # return first number if has many modes

def fill_missing_values(dataframe_list, column_no):     # dataframe in the form of a list of lists
    size = len(dataframe_list)
    column_data = [x[column_no] for x in dataframe_list]      # get that column
    while '?' in column_data:
        column_data.remove('?')
    mode = get_mode(column_data)
    for i in range(size):
        if dataframe_list[i][column_no] == '?':
            dataframe_list[i][column_no] = mode              # fill in mode
    return dataframe_list

def get_discretization_data(data_column, class_label_column):
    size = len(data_column)
    result_list = []
    for i in range(size):
        result_list.append([data_column[i], class_label_column[i]])
    return result_list

def replace_numerical(dataframe_list, column_no, walls):
    size = len(dataframe_list)
    num_spilt_point = len(walls)
    for i in range(size):
        if dataframe_list[i][column_no] > walls[num_spilt_point - 1]:
            dataframe_list[i][column_no] = num_spilt_point + 1
            continue
        for j in range(0, num_spilt_point):
            if dataframe_list[i][column_no] <= walls[j]:
                dataframe_list[i][column_no] = j + 1
                break
    return dataframe_list

def replace_categorical(dataframe_list, column_no):
    size = len(dataframe_list)
    classes = set([x[column_no] for x in dataframe_list])
    classes_no = dict([(label, 0) for label in classes])
    j = 1
    for i in classes:
        classes_no[i] = j
        j += 1
    for i in range(size):
        dataframe_list[i][column_no] = classes_no[dataframe_list[i][column_no]]
    return dataframe_list, classes_no

def discard(dataframe_list, discard_list):
    size = len(dataframe_list)
    length = len(dataframe_list[0])
    data_result = []
    for i in range(size):
        data_result.append([])
        for j in range(length):
            if j not in discard_list:
                data_result[i].append(dataframe_list[i][j])
    return data_result

def pre_process(dataframe_list, attribute, value_type):
    column_num = len(dataframe_list[0])
    size = len(dataframe_list)
    class_column = [x[-1] for x in dataframe_list]
    discard_list = []
    for i in range(0, column_num - 1):
        data_column = [x[i] for x in dataframe_list]

        # process missing values
        missing_values_ratio = data_column.count('?') / size
        if missing_values_ratio > 0.5:
            discard_list.append(i)
            continue
        elif missing_values_ratio > 0:
            dataframe_list = fill_missing_values(dataframe_list, i)
            data_column = [x[i] for x in dataframe_list]

        # discretization
        if value_type[i] == 'numerical':
            discretization_data = get_discretization_data(data_column, class_column)
            block = Block(discretization_data)
            walls = partition(block)
            if len(walls) == 0:
                max_value = max(data_column)
                min_value = min(data_column)
                step = (max_value - min_value) / 3
                walls.append(min_value + step)
                walls.append(min_value + 2 * step)
            print(attribute[i] , ":", walls)        # print out split points
            dataframe_list = replace_numerical(dataframe_list, i, walls)
        elif value_type[i] == 'categorical':
            dataframe_list, classes_no = replace_categorical(dataframe_list, i)
            print(attribute[i] , ":", classes_no)   # print out replacement list

    # discard
    if len(discard_list) > 0:
        dataframe_list = discard(data, discard_list)
        print("discard:", discard_list)             # print out discard list
    return dataframe_list


In [None]:
import csv


# Read dataset and convert into a list.
# path: directory of *.data file.
def read_data(path):
    data = []
    with open(path, 'r') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')
        for line in reader:
            data.append(line)
        while [] in data:
            data.remove([])
    return data


# Read scheme file *.names and write down attributes and value types.
# path: directory of *.names file.
def read_scheme(path):
    with open(path, 'r') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')
        attributes = next(reader)
        value_type = next(reader)
    return attributes, value_type


# convert string-type value into float-type.
# data: data list returned by read_data.
# value_type: list returned by read_scheme.
def str2numerical(data, value_type):
    #ct = 0
    #print("str2numerical")
    size = len(data) 
    columns = len(data[0])
    for i in range(size):
        #ct = 0
        for j in range(columns - 1):
            if value_type[j] == 'numerical' and data[i][j] != '?':
                #ct += 1
                data[i][j] = float(data[i][j])
        #print("ct: ",ct)
    return data


# Main method in this file, to get data list after processing and scheme list.
# data_path: tell where *.data file stores.
# scheme_path: tell where *.names file stores.
def read(data_path, scheme_path):
    data = read_data(data_path)
    attributes, value_type = read_scheme(scheme_path)
    data = str2numerical(data, value_type)
    return data, attributes, value_type

In [None]:
import math


# A block to be split
# It has 4 member:
#   data: the data table with a column of continuous-valued attribute and a column of class label
#   size: number of data case in this table
#   number_of_classes: obviously, the number of class in this table
#   entropy: entropy of dataset
class Block:
    def __init__(self, data):
        self.data = data
        self.size = len(data)
        classes = set([x[1] for x in data])     # get distinct class labels in this table
        self.number_of_classes = len(set(classes))
        self.entropy = calculate_entropy(data)


# Calculate the entropy of dataset
# parameter data: the data table to be used
def calculate_entropy(data):
    number_of_data = len(data)
    classes = set([x[1] for x in data])
    class_count = dict([(label, 0) for label in classes])
    for data_case in data:
        class_count[data_case[1]] += 1      # count the number of data case of each class
    entropy = 0
    for c in classes:
        p = class_count[c] / number_of_data
        entropy -= p * math.log2(p)         # calculate information entropy by its formula, where the base is 2
    return entropy


# Compute Gain(A, T: S) mentioned in Dougherty, Kohavi & Sahami (1995), i.e. entropy gained by splitting original_block
#   into left_block and right_block
# original_block: the block before partition
# left_block: the block split which its value below boundary
# right_block: the block above boundary
def entropy_gain(original_block, left_block, right_block):
    gain = original_block.entropy - \
            ((left_block.size / original_block.size) * left_block.entropy +
            (right_block.size / original_block.size) * right_block.entropy)
    return gain


# Get minimum entropy gain required for a split of original_block into 2 blocks "left" and "right", see Dougherty,
#   Kohavi & Sahami (1995)
# original_block: the block before partition
# left_block: the block split which its value below boundary
# right_block: the block above boundary
def min_gain(original_block, left_block, right_block):
    delta = math.log2(math.pow(3, original_block.number_of_classes) - 2) - \
            (original_block.number_of_classes * original_block.entropy -
             left_block.number_of_classes * left_block.entropy -
             right_block.number_of_classes * right_block.entropy)
    gain_sup = math.log2(original_block.size - 1) / original_block.size + delta / original_block.size
    return gain_sup


# Identify the best acceptable value to split block
# block: a block of dataset
# Return value: a list of (boundary, entropy gain, left block, right block) or
#   None when it's unnecessary to split
def split(block):
    candidates = [x[0] for x in block.data]     # candidates is a list of values can be picked up as boundary
    candidates = list(set(candidates))          # get different values in table
    candidates.sort()                           # sort ascending
    candidates = candidates[1:]                 # discard smallest, because by definition no value is smaller

    wall = []       # wall is a list storing final boundary
    for value in candidates:
        # split by value into 2 groups, below & above
        left_data = []
        right_data = []
        for data_case in block.data:
            if data_case[0] < value:
                left_data.append(data_case)
            else:
                right_data.append(data_case)

        left_block = Block(left_data)
        right_block = Block(right_data)

        gain = entropy_gain(block, left_block, right_block)
        threshold = min_gain(block, left_block, right_block)

        # minimum threshold is met, the value is an acceptable candidate
        if gain >= threshold:
            wall.append([value, gain, left_block, right_block])

    if wall:    # has candidate
        wall.sort(key=lambda wall: wall[1], reverse=True)   # sort descending by "gain"
        return wall[0]      # return best candidate with max entropy gain
    else:
        return None         # no need to split


# Top-down recursive partition of a data block, append boundary into "walls"
# block: a data block
def partition(block):
    walls = []

    # inner recursive function, accumulate the partitioning values
    # sub_block: just a data block
    def recursive_split(sub_block):
        wall_returned = split(sub_block)        # binary partition, get bin boundary
        if wall_returned:                       # still can be spilt
            walls.append(wall_returned[0])      # record this partitioning value
            recursive_split(wall_returned[2])   # recursively process left block
            recursive_split(wall_returned[3])   # recursively split right block
        else:
            return                              # end of recursion

    recursive_split(block)      # call inner function
    walls.sort()                # sort boundaries descending
    return walls

## Iris.csv

In [None]:
df = pd.read_csv('../dataset/iris.csv', header= None)
df.head()
rows = len(df)
cols = len(df.columns)

df = df.values.tolist() #convert dataframe to list of lists
test_attribute = [0,1,2,3,4]
test_value_type = ['numerical', 'numerical', 'numerical','numerical', 'label']
test_data_after = pre_process(df, test_attribute, test_value_type)

0 : [5.6, 6.2]
1 : [3.0, 3.4]
2 : [3.0, 4.8]
3 : [1.0, 1.8]


### Random Forest

In [None]:
data = pd.DataFrame (test_data_after)

In [None]:
data.head()

Unnamed: 0,0,1,2,3,4
0,1,3,1,1,Iris-setosa
1,1,1,1,1,Iris-setosa
2,1,2,1,1,Iris-setosa
3,1,2,1,1,Iris-setosa
4,1,3,1,1,Iris-setosa


In [None]:
data[4].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [None]:
X = data.drop(4, axis=1)
y = data[4]

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=90)
rf_clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=90)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
pred_test = rf_clf.predict(X_test)
clf_report_test = pd.DataFrame(classification_report(y_test, pred_test, output_dict=True))
print("Test Result:\n================================================")        
print(f"Accuracy Score: {accuracy_score(y_test, pred_test) * 100:.2f}%")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{clf_report_test}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred_test)}\n")
print("_______________________________________________")
print(f"f1_score: \n {f1_score(y_test, pred_test, average = 'micro')}\n")

Test Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
           Iris-setosa  Iris-versicolor  Iris-virginica  accuracy  macro avg  \
precision          1.0              1.0             1.0       1.0        1.0   
recall             1.0              1.0             1.0       1.0        1.0   
f1-score           1.0              1.0             1.0       1.0        1.0   
support           19.0             15.0            16.0       1.0       50.0   

           weighted avg  
precision           1.0  
recall              1.0  
f1-score            1.0  
support            50.0  
_______________________________________________
Confusion Matrix: 
 [[19  0  0]
 [ 0 15  0]
 [ 0  0 16]]

_______________________________________________
f1_score: 
 1.0



### Logistic Regression

In [None]:
X = data.drop(4, axis=1)
y = data[4]

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression
lr_lib = LogisticRegression(solver='liblinear') #good choice for smaller dataset
lr_lib_fit = lr_lib.fit(X_train,y_train)
lr_lib_fit

LogisticRegression(solver='liblinear')

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
pred_test = lr_lib_fit.predict(X_test)
clf_report_test = pd.DataFrame(classification_report(y_test, pred_test, output_dict=True))
print("Test Result:\n================================================")        
print(f"Accuracy Score: {accuracy_score(y_test, pred_test) * 100:.2f}%")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{clf_report_test}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred_test)}\n")
print("_______________________________________________")
print(f"f1_score: \n {f1_score(y_test, pred_test, average = 'micro')}\n")

Test Result:
Accuracy Score: 90.00%
_______________________________________________
CLASSIFICATION REPORT:
           Iris-setosa  Iris-versicolor  Iris-virginica  accuracy  macro avg  \
precision     0.950000         1.000000        0.800000       0.9   0.916667   
recall        1.000000         0.666667        1.000000       0.9   0.888889   
f1-score      0.974359         0.800000        0.888889       0.9   0.887749   
support      19.000000        15.000000       16.000000       0.9  50.000000   

           weighted avg  
precision      0.917000  
recall         0.900000  
f1-score       0.894701  
support       50.000000  
_______________________________________________
Confusion Matrix: 
 [[19  0  0]
 [ 1 10  4]
 [ 0  0 16]]

_______________________________________________
f1_score: 
 0.9



In [None]:
lr_saga = LogisticRegression(solver='saga', max_iter = 1000) # Used for multi class problem
lr_saga_fit = lr_saga.fit(X_train,y_train)
lr_saga_fit

LogisticRegression(max_iter=1000, solver='saga')

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
pred_test =lr_saga_fit.predict(X_test)
clf_report_test = pd.DataFrame(classification_report(y_test, pred_test, output_dict=True))
print("Test Result:\n================================================")        
print(f"Accuracy Score: {accuracy_score(y_test, pred_test) * 100:.2f}%")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{clf_report_test}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred_test)}\n")
print("_______________________________________________")
print(f"f1_score: \n {f1_score(y_test, pred_test, average = 'micro')}\n")

Test Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
           Iris-setosa  Iris-versicolor  Iris-virginica  accuracy  macro avg  \
precision          1.0              1.0             1.0       1.0        1.0   
recall             1.0              1.0             1.0       1.0        1.0   
f1-score           1.0              1.0             1.0       1.0        1.0   
support           19.0             15.0            16.0       1.0       50.0   

           weighted avg  
precision           1.0  
recall              1.0  
f1-score            1.0  
support            50.0  
_______________________________________________
Confusion Matrix: 
 [[19  0  0]
 [ 0 15  0]
 [ 0  0 16]]

_______________________________________________
f1_score: 
 1.0



## wine.csv

In [None]:
df = pd.read_excel('../dataset/wine.xlsx', header= None)
df.head()
rows = len(df)
cols = len(df.columns)


df[14] = df[0]
for i in range(cols):
  df[i] = df[i+1]
df = df.drop([14], axis = 1)

df.head(5)

df = df.values.tolist() #convert dataframe to list of lists
test_attribute = []
for i in range(cols):
  test_attribute.append(i)
print(test_attribute)
test_value_type = []
for i in range(cols-1):
  test_value_type.append('numerical')
test_value_type.append('label')

test_data_after = pre_process(df, test_attribute, test_value_type)


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
0 : [12.2, 12.79]
1 : [1.43, 2.31]
2 : [2.04]
3 : [18.0]
4 : [89.0]
5 : [1.85, 2.35]
6 : [0.99, 1.58, 2.33]
7 : [0.4]
8 : [1.28]
9 : [3.52, 7.6]
10 : [0.79, 0.98, 1.31]
11 : [2.12, 2.48]
12 : [470.0, 760.0, 990.0]


In [None]:
data = pd.DataFrame (test_data_after)

In [None]:
X = data.drop(13, axis=1)
y = data[13]

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=90)
rf_clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=90)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
pred_test = rf_clf.predict(X_test)
clf_report_test = pd.DataFrame(classification_report(y_test, pred_test, output_dict=True))
print("Test Result:\n================================================")        
print(f"Accuracy Score: {accuracy_score(y_test, pred_test) * 100:.2f}%")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{clf_report_test}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred_test)}\n")
print("_______________________________________________")
print(f"f1_score: \n {f1_score(y_test, pred_test, average = 'micro')}\n")


Test Result:
Accuracy Score: 98.31%
_______________________________________________
CLASSIFICATION REPORT:
                 1.0        2.0   3.0  accuracy  macro avg  weighted avg
precision   1.000000   0.960000   1.0  0.983051   0.986667      0.983729
recall      0.950000   1.000000   1.0  0.983051   0.983333      0.983051
f1-score    0.974359   0.979592   1.0  0.983051   0.984650      0.983007
support    20.000000  24.000000  15.0  0.983051  59.000000     59.000000
_______________________________________________
Confusion Matrix: 
 [[19  1  0]
 [ 0 24  0]
 [ 0  0 15]]

_______________________________________________
f1_score: 
 0.9830508474576272



### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
lr_lib = LogisticRegression(solver='liblinear') #good choice for smaller dataset
lr_lib_fit = lr_lib.fit(X_train,y_train)
lr_lib_fit

LogisticRegression(solver='liblinear')

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
# Didn't tune the hyperparameter
lr_lib_pred_test = lr_lib_fit.predict(X_test)
clf_report_test = pd.DataFrame(classification_report(y_test, lr_lib_pred_test, output_dict=True))
print("Test Result for liblinear solver:\n================================================")        
print(f"Accuracy Score: {accuracy_score(y_test, lr_lib_pred_test) * 100:.2f}%")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{clf_report_test}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, lr_lib_pred_test)}\n")
print("_______________________________________________")
print(f"f1_score: \n {f1_score(y_test, lr_lib_pred_test, average = 'micro')}\n")


Test Result for liblinear solver:
Accuracy Score: 96.61%
_______________________________________________
CLASSIFICATION REPORT:
             1.0        2.0   3.0  accuracy  macro avg  weighted avg
precision   0.95   0.958333   1.0  0.966102   0.969444      0.966102
recall      0.95   0.958333   1.0  0.966102   0.969444      0.966102
f1-score    0.95   0.958333   1.0  0.966102   0.969444      0.966102
support    20.00  24.000000  15.0  0.966102  59.000000     59.000000
_______________________________________________
Confusion Matrix: 
 [[19  1  0]
 [ 1 23  0]
 [ 0  0 15]]

_______________________________________________
f1_score: 
 0.9661016949152542



In [None]:
lr_saga = LogisticRegression(solver='saga') # Used for multi class problem
lr_saga_fit = lr_saga.fit(X_train,y_train)
lr_saga_fit



LogisticRegression(solver='saga')

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
# Didn't tune the hyperparameter
lr_saga_pred_test = lr_saga_fit.predict(X_test)
clf_report_test = pd.DataFrame(classification_report(y_test, lr_saga_pred_test, output_dict=True))
print("Test Result for saga solver:\n================================================")        
print(f"Accuracy Score: {accuracy_score(y_test, lr_saga_pred_test) * 100:.2f}%")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{clf_report_test}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, lr_saga_pred_test)}\n")
print("_______________________________________________")
print(f"f1_score: \n {f1_score(y_test, lr_saga_pred_test, average = 'micro')}\n")

Test Result for saga solver:
Accuracy Score: 96.61%
_______________________________________________
CLASSIFICATION REPORT:
             1.0        2.0   3.0  accuracy  macro avg  weighted avg
precision   0.95   0.958333   1.0  0.966102   0.969444      0.966102
recall      0.95   0.958333   1.0  0.966102   0.969444      0.966102
f1-score    0.95   0.958333   1.0  0.966102   0.969444      0.966102
support    20.00  24.000000  15.0  0.966102  59.000000     59.000000
_______________________________________________
Confusion Matrix: 
 [[19  1  0]
 [ 1 23  0]
 [ 0  0 15]]

_______________________________________________
f1_score: 
 0.9661016949152542



## glass.csv

In [None]:
df = pd.read_csv('../dataset/glass.csv', header= None)
df.head()
rows = len(df)
cols = len(df.columns)

df = df.values.tolist() #convert dataframe to list of lists
test_attribute = []
for i in range(cols):
  test_attribute.append(i)
print(test_attribute)
test_value_type = []
for i in range(cols-1):
  test_value_type.append('numerical')
test_value_type.append('label')
test_data_after = pre_process(df, test_attribute, test_value_type)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
0 : [71.0, 147.0, 164.0, 177.0, 186.0]
1 : [1.51735, 1.518]
2 : [14.09]
3 : [2.71]
4 : [1.4, 1.79]
5 : [71.67666666666666, 73.54333333333334]
6 : [0.06, 0.62, 0.76]
7 : [7.08, 8.32, 10.09]
8 : [0.4]
9 : [0.17, 0.34]


In [None]:
data = pd.DataFrame (test_data_after)

In [None]:
X = data.drop(10, axis=1)
y = data[10]

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=90)
rf_clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=90)

In [None]:
# Predict the class label using X_test
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
pred_test = rf_clf.predict(X_test)
clf_report_test = pd.DataFrame(classification_report(y_test, pred_test, output_dict=True))
print("Test Result:\n================================================")        
print(f"Accuracy Score: {accuracy_score(y_test, pred_test) * 100:.2f}%")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{clf_report_test}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred_test)}\n")
print("_______________________________________________")
print(f"f1_score: \n {f1_score(y_test, pred_test, average = 'micro')}\n")

Test Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
            1.0   2.0  3.0  5.0  6.0   7.0  accuracy  macro avg  weighted avg
precision   1.0   1.0  1.0  1.0  1.0   1.0       1.0        1.0           1.0
recall      1.0   1.0  1.0  1.0  1.0   1.0       1.0        1.0           1.0
f1-score    1.0   1.0  1.0  1.0  1.0   1.0       1.0        1.0           1.0
support    22.0  25.0  4.0  6.0  4.0  10.0       1.0       71.0          71.0
_______________________________________________
Confusion Matrix: 
 [[22  0  0  0  0  0]
 [ 0 25  0  0  0  0]
 [ 0  0  4  0  0  0]
 [ 0  0  0  6  0  0]
 [ 0  0  0  0  4  0]
 [ 0  0  0  0  0 10]]

_______________________________________________
f1_score: 
 1.0



### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
lr_lib = LogisticRegression(solver='liblinear') #good choice for smaller dataset
lr_lib_fit = lr_lib.fit(X_train,y_train)
lr_lib_fit

LogisticRegression(solver='liblinear')

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
lr_lib_pred_test = lr_lib_fit.predict(X_test)
clf_report_test = pd.DataFrame(classification_report(y_test, lr_lib_pred_test, output_dict=True))
print("Test Result for liblinear solver:\n================================================")        
print(f"Accuracy Score: {accuracy_score(y_test, lr_lib_pred_test) * 100:.2f}%")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{clf_report_test}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, lr_lib_pred_test)}\n")
print("_______________________________________________")
print(f"f1_score: \n {f1_score(y_test, lr_lib_pred_test, average = 'micro')}\n")

Test Result for liblinear solver:
Accuracy Score: 83.10%
_______________________________________________
CLASSIFICATION REPORT:
                 1.0        2.0       3.0       5.0   6.0        7.0  \
precision   0.954545   0.733333  0.666667  1.000000  0.75   0.909091   
recall      0.954545   0.880000  0.500000  0.166667  0.75   1.000000   
f1-score    0.954545   0.800000  0.571429  0.285714  0.75   0.952381   
support    22.000000  25.000000  4.000000  6.000000  4.00  10.000000   

           accuracy  macro avg  weighted avg  
precision  0.830986   0.835606      0.846351  
recall     0.830986   0.708535      0.830986  
f1-score   0.830986   0.719012      0.810195  
support    0.830986  71.000000     71.000000  
_______________________________________________
Confusion Matrix: 
 [[21  1  0  0  0  0]
 [ 1 22  1  0  1  0]
 [ 0  2  2  0  0  0]
 [ 0  5  0  1  0  0]
 [ 0  0  0  0  3  1]
 [ 0  0  0  0  0 10]]

_______________________________________________
f1_score: 
 0.8309859154929577



In [None]:
lr_saga = LogisticRegression(solver='saga') # Used for multi class problem
lr_saga_fit = lr_saga.fit(X_train,y_train)
lr_saga_fit



LogisticRegression(solver='saga')

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
# Didn't tune the hyperparameter
lr_saga_pred_test = lr_saga_fit.predict(X_test)
clf_report_test = pd.DataFrame(classification_report(y_test, lr_saga_pred_test, output_dict=True))
print("Test Result for saga solver:\n================================================")        
print(f"Accuracy Score: {accuracy_score(y_test, lr_saga_pred_test) * 100:.2f}%")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{clf_report_test}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, lr_saga_pred_test)}\n")
print("_______________________________________________")
print(f"f1_score: \n {f1_score(y_test, lr_saga_pred_test, average = 'micro')}\n")

Test Result for saga solver:
Accuracy Score: 95.77%
_______________________________________________
CLASSIFICATION REPORT:
            1.0        2.0       3.0       5.0  6.0   7.0  accuracy  \
precision   1.0   0.925926  0.666667  1.000000  1.0   1.0  0.957746   
recall      1.0   1.000000  0.500000  0.833333  1.0   1.0  0.957746   
f1-score    1.0   0.961538  0.571429  0.909091  1.0   1.0  0.957746   
support    22.0  25.000000  4.000000  6.000000  4.0  10.0  0.957746   

           macro avg  weighted avg  
precision   0.932099      0.955138  
recall      0.888889      0.957746  
f1-score    0.907010      0.954630  
support    71.000000     71.000000  
_______________________________________________
Confusion Matrix: 
 [[22  0  0  0  0  0]
 [ 0 25  0  0  0  0]
 [ 0  2  2  0  0  0]
 [ 0  0  1  5  0  0]
 [ 0  0  0  0  4  0]
 [ 0  0  0  0  0 10]]

_______________________________________________
f1_score: 
 0.9577464788732394



## Tic-Tac-Toe.csv

In [None]:
data_path = '../dataset/tic-tac-toe.data' # only change filename here for different datasets
scheme_path = '../dataset/tic-tac-toe.names'
data, attributes, value_type = read(data_path, scheme_path)
#random.shuffle(data)
df = pre_process(data, attributes, value_type)
# print(type(df))
# print(df)

top-left-square : {'b': 1, 'o': 2, 'x': 3}
top-middle-square : {'b': 1, 'o': 2, 'x': 3}
top-right-square : {'b': 1, 'o': 2, 'x': 3}
middle-left-square : {'b': 1, 'o': 2, 'x': 3}
middle-middle-square : {'x': 1, 'o': 2, 'b': 3}
middle-right-square : {'x': 1, 'o': 2, 'b': 3}
bottom-left-square : {'b': 1, 'o': 2, 'x': 3}
bottom-middle-square : {'b': 1, 'o': 2, 'x': 3}
bottom-right-square : {'b': 1, 'o': 2, 'x': 3}


In [None]:
df = pd.DataFrame(df)

In [None]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,3,3,3,3,2,2,3,2,2,positive
1,3,3,3,3,2,2,2,3,2,positive
2,3,3,3,3,2,2,2,2,3,positive
3,3,3,3,3,2,2,2,1,1,positive
4,3,3,3,3,2,2,1,2,1,positive


In [None]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,958.0,958.0,958.0,958.0,958.0,958.0,958.0,958.0,958.0
mean,2.222338,2.133612,2.222338,2.133612,1.688935,1.866388,2.222338,2.133612,2.222338
std,0.775569,0.798966,0.775569,0.798966,0.740882,0.798966,0.775569,0.798966,0.775569
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,2.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,2.0
50%,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
75%,3.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0
max,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0


In [None]:
df[9].unique()

array(['positive', 'negative'], dtype=object)

### Random Forest

In [None]:
X = df.drop(9, axis=1)
y = df[9]

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Check the sample sizes
print("Train Set :", X_train.shape, y_train.shape)
print("Test Set  :", X_test.shape, y_test.shape)

Train Set : (641, 9) (641,)
Test Set  : (317, 9) (317,)


In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=90)
rf_clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=90)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
pred_test = rf_clf.predict(X_test)
clf_report_test = pd.DataFrame(classification_report(y_test, pred_test, output_dict=True))
print("Test Result:\n================================================")        
print(f"Accuracy Score: {accuracy_score(y_test, pred_test)} \n")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{clf_report_test}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred_test)}\n")
print("_______________________________________________")
print(f"f1_score: \n {f1_score(y_test, pred_test, average = 'micro')}\n")


Test Result:
Accuracy Score: 0.9369085173501577 

_______________________________________________
CLASSIFICATION REPORT:
             negative    positive  accuracy   macro avg  weighted avg
precision    0.976744    0.922078  0.936909    0.949411      0.939668
recall       0.823529    0.990698  0.936909    0.907114      0.936909
f1-score     0.893617    0.955157  0.936909    0.924387      0.935355
support    102.000000  215.000000  0.936909  317.000000    317.000000
_______________________________________________
Confusion Matrix: 
 [[ 84  18]
 [  2 213]]

_______________________________________________
f1_score: 
 0.9369085173501577



### Logistic Regression

In [None]:
X = df.drop(9, axis=1)
y = df[9]

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression
lr_lib = LogisticRegression(solver='liblinear') #good choice for smaller dataset
lr_lib_fit = lr_lib.fit(X_train,y_train)
lr_lib_fit

LogisticRegression(solver='liblinear')

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
# Didn't tune the hyperparameter
lr_lib_pred_test = lr_lib_fit.predict(X_test)
clf_report_test = pd.DataFrame(classification_report(y_test, lr_lib_pred_test, output_dict=True))
print("Test Result for liblinear solver:\n================================================")        
print(f"Accuracy Score: {accuracy_score(y_test, lr_lib_pred_test)}\n")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{clf_report_test}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, lr_lib_pred_test)}\n")
print("_______________________________________________")
print(f"f1_score: \n {f1_score(y_test, lr_lib_pred_test, average = 'micro')}\n")

Test Result for liblinear solver:
Accuracy Score: 0.7066246056782335

_______________________________________________
CLASSIFICATION REPORT:
             negative    positive  accuracy   macro avg  weighted avg
precision    0.621622    0.717857  0.706625    0.669739      0.686892
recall       0.225490    0.934884  0.706625    0.580187      0.706625
f1-score     0.330935    0.812121  0.706625    0.571528      0.657292
support    102.000000  215.000000  0.706625  317.000000    317.000000
_______________________________________________
Confusion Matrix: 
 [[ 23  79]
 [ 14 201]]

_______________________________________________
f1_score: 
 0.7066246056782335



In [None]:
lr_saga = LogisticRegression(solver='saga') # Used for multi class problem
lr_saga_fit = lr_saga.fit(X_train,y_train)
lr_saga_fit



LogisticRegression(solver='saga')

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
# Didn't tune the hyperparameter
lr_saga_pred_test = lr_saga_fit.predict(X_test)
clf_report_test = pd.DataFrame(classification_report(y_test, lr_saga_pred_test, output_dict=True))
print("Test Result for saga solver:\n================================================")        
print(f"Accuracy Score: {accuracy_score(y_test, lr_saga_pred_test) * 100:.2f}%")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{clf_report_test}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, lr_saga_pred_test)}\n")
print("_______________________________________________")
print(f"f1_score: \n {f1_score(y_test, lr_saga_pred_test, average = 'micro')}\n")


Test Result for saga solver:
Accuracy Score: 71.29%
_______________________________________________
CLASSIFICATION REPORT:
             negative    positive  accuracy   macro avg  weighted avg
precision    0.657143    0.719858  0.712934    0.688501      0.699678
recall       0.225490    0.944186  0.712934    0.584838      0.712934
f1-score     0.335766    0.816901  0.712934    0.576334      0.662088
support    102.000000  215.000000  0.712934  317.000000    317.000000
_______________________________________________
Confusion Matrix: 
 [[ 23  79]
 [ 12 203]]

_______________________________________________
f1_score: 
 0.7129337539432177



## zoo.csv

In [None]:
data_path = '../dataset/zoo.data' # only change filename here for different datasets
scheme_path = '../dataset/zoo.names'
data, attributes, value_type = read(data_path, scheme_path)
#random.shuffle(data)
df = pre_process(data, attributes, value_type)
# print(type(df))
# print(df)

animal name : {'crab': 1, 'pitviper': 2, 'deer': 3, 'sealion': 4, 'wren': 5, 'octopus': 6, 'carp': 7, 'hare': 8, 'tuatara': 9, 'lion': 10, 'mongoose': 11, 'reindeer': 12, 'oryx': 13, 'leopard': 14, 'cavy': 15, 'toad': 16, 'porpoise': 17, 'cheetah': 18, 'lark': 19, 'rhea': 20, 'herring': 21, 'goat': 22, 'chub': 23, 'skimmer': 24, 'mink': 25, 'bass': 26, 'mole': 27, 'skua': 28, 'sole': 29, 'seawasp': 30, 'puma': 31, 'pony': 32, 'duck': 33, 'dove': 34, 'seahorse': 35, 'raccoon': 36, 'catfish': 37, 'gnat': 38, 'tortoise': 39, 'pheasant': 40, 'vampire': 41, 'newt': 42, 'ladybird': 43, 'flea': 44, 'crayfish': 45, 'crow': 46, 'platypus': 47, 'tuna': 48, 'polecat': 49, 'boar': 50, 'housefly': 51, 'moth': 52, 'hawk': 53, 'antelope': 54, 'vole': 55, 'piranha': 56, 'aardvark': 57, 'penguin': 58, 'vulture': 59, 'swan': 60, 'sparrow': 61, 'giraffe': 62, 'parakeet': 63, 'dogfish': 64, 'stingray': 65, 'ostrich': 66, 'flamingo': 67, 'lobster': 68, 'squirrel': 69, 'gorilla': 70, 'frog': 71, 'starfish':

In [None]:
df = pd.DataFrame(df)

In [None]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,57,2,1,1,2,1,1,2,2,2,2,1,1,3,1,1,2,1
1,54,2,1,1,2,1,1,1,2,2,2,1,1,3,2,1,2,1
2,26,1,1,2,1,1,2,2,2,2,1,1,2,6,2,1,1,4
3,81,2,1,1,2,1,1,2,2,2,2,1,1,3,1,1,2,1
4,50,2,1,1,2,1,1,2,2,2,2,1,1,3,2,1,2,1


In [None]:
df[17].unique()

array(['1', '4', '2', '7', '6', '5', '3'], dtype=object)

### Random Forest

In [None]:
X = df.drop(17, axis=1)
y = df[17]

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Check the sample sizes
print("Train Set :", X_train.shape, y_train.shape)
print("Test Set  :", X_test.shape, y_test.shape)

Train Set : (67, 17) (67,)
Test Set  : (34, 17) (34,)


In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=90)
rf_clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=90)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
pred_test = rf_clf.predict(X_test)
clf_report_test = pd.DataFrame(classification_report(y_test, pred_test, output_dict=True))
print("Test Result:\n================================================")        
print(f"Accuracy Score: {accuracy_score(y_test, pred_test)} \n")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{clf_report_test}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred_test)}\n")
print("_______________________________________________")
print(f"f1_score: \n {f1_score(y_test, pred_test, average = 'micro')}\n")


Test Result:
Accuracy Score: 0.9411764705882353 

_______________________________________________
CLASSIFICATION REPORT:
              1    2    3         4         5    6    7  accuracy  macro avg  \
precision   1.0  1.0  0.0  0.666667  0.666667  1.0  1.0  0.941176   0.761905   
recall      1.0  1.0  0.0  1.000000  1.000000  1.0  1.0  0.941176   0.857143   
f1-score    1.0  1.0  0.0  0.800000  0.800000  1.0  1.0  0.941176   0.800000   
support    17.0  3.0  2.0  2.000000  2.000000  5.0  3.0  0.941176  34.000000   

           weighted avg  
precision      0.901961  
recall         0.941176  
f1-score       0.917647  
support       34.000000  
_______________________________________________
Confusion Matrix: 
 [[17  0  0  0  0  0  0]
 [ 0  3  0  0  0  0  0]
 [ 0  0  0  1  1  0  0]
 [ 0  0  0  2  0  0  0]
 [ 0  0  0  0  2  0  0]
 [ 0  0  0  0  0  5  0]
 [ 0  0  0  0  0  0  3]]

_______________________________________________
f1_score: 
 0.9411764705882353



  _warn_prf(average, modifier, msg_start, len(result))


### Logistic Regression

In [None]:
X = df.drop(17, axis=1)
y = df[17]

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression
lr_lib = LogisticRegression(solver='liblinear') #good choice for smaller dataset
lr_lib_fit = lr_lib.fit(X_train,y_train)
lr_lib_fit

LogisticRegression(solver='liblinear')

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
# Didn't tune the hyperparameter
lr_lib_pred_test = lr_lib_fit.predict(X_test)
clf_report_test = pd.DataFrame(classification_report(y_test, lr_lib_pred_test, output_dict=True))
print("Test Result for liblinear solver:\n================================================")        
print(f"Accuracy Score: {accuracy_score(y_test, lr_lib_pred_test)}\n")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{clf_report_test}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, lr_lib_pred_test)}\n")
print("_______________________________________________")
print(f"f1_score: \n {f1_score(y_test, lr_lib_pred_test, average = 'micro')}\n")


Test Result for liblinear solver:
Accuracy Score: 0.9411764705882353

_______________________________________________
CLASSIFICATION REPORT:
                   1    2         3         4    5    6         7  accuracy  \
precision   0.944444  1.0  1.000000  0.666667  1.0  1.0  1.000000  0.941176   
recall      1.000000  1.0  0.500000  1.000000  1.0  1.0  0.666667  0.941176   
f1-score    0.971429  1.0  0.666667  0.800000  1.0  1.0  0.800000  0.941176   
support    17.000000  3.0  2.000000  2.000000  2.0  5.0  3.000000  0.941176   

           macro avg  weighted avg  
precision   0.944444      0.952614  
recall      0.880952      0.941176  
f1-score    0.891156      0.936695  
support    34.000000     34.000000  
_______________________________________________
Confusion Matrix: 
 [[17  0  0  0  0  0  0]
 [ 0  3  0  0  0  0  0]
 [ 0  0  1  1  0  0  0]
 [ 0  0  0  2  0  0  0]
 [ 0  0  0  0  2  0  0]
 [ 0  0  0  0  0  5  0]
 [ 1  0  0  0  0  0  2]]

________________________________________

In [None]:
lr_saga = LogisticRegression(solver='saga') # Used for multi class problem
lr_saga_fit = lr_saga.fit(X_train,y_train)
lr_saga_fit



LogisticRegression(solver='saga')

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
# Didn't tune the hyperparameter
lr_saga_pred_test = lr_saga_fit.predict(X_test)
clf_report_test = pd.DataFrame(classification_report(y_test, lr_saga_pred_test, output_dict=True))
print("Test Result for saga solver:\n================================================")        
print(f"Accuracy Score: {accuracy_score(y_test, lr_saga_pred_test) * 100:.2f}%")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{clf_report_test}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, lr_saga_pred_test)}\n")
print("_______________________________________________")
print(f"f1_score: \n {f1_score(y_test, lr_saga_pred_test, average = 'micro')}\n")


Test Result for saga solver:
Accuracy Score: 61.76%
_______________________________________________
CLASSIFICATION REPORT:
                   1     2    3         4    5    6    7  accuracy  macro avg  \
precision   0.640000  0.60  0.0  0.500000  0.0  0.0  0.0  0.617647   0.248571   
recall      0.941176  1.00  0.0  1.000000  0.0  0.0  0.0  0.617647   0.420168   
f1-score    0.761905  0.75  0.0  0.666667  0.0  0.0  0.0  0.617647   0.311224   
support    17.000000  3.00  2.0  2.000000  2.0  5.0  3.0  0.617647  34.000000   

           weighted avg  
precision      0.402353  
recall         0.617647  
f1-score       0.486345  
support       34.000000  
_______________________________________________
Confusion Matrix: 
 [[16  0  0  1  0  0  0]
 [ 0  3  0  0  0  0  0]
 [ 1  1  0  0  0  0  0]
 [ 0  0  0  2  0  0  0]
 [ 1  1  0  0  0  0  0]
 [ 5  0  0  0  0  0  0]
 [ 2  0  0  1  0  0  0]]

_______________________________________________
f1_score: 
 0.6176470588235294



  _warn_prf(average, modifier, msg_start, len(result))


## Teaching Assistant Evaluation

In [None]:
data_path = '../dataset/tae.data' # only change filename here for different datasets
scheme_path = '../dataset/tae.names'
data, attributes, value_type = read(data_path, scheme_path)
#random.shuffle(data)
df = pre_process(data, attributes, value_type)


Course_instructor : {'15': 1, '20': 2, '9': 3, '24': 4, '5': 5, '17': 6, '18': 7, '14': 8, '8': 9, '7': 10, '11': 11, '25': 12, '13': 13, '4': 14, '2': 15, '19': 16, '22': 17, '23': 18, '3': 19, '16': 20, '21': 21, '6': 22, '10': 23, '12': 24, '1': 25}
Course : {'15': 1, '20': 2, '9': 3, '24': 4, '5': 5, '17': 6, '14': 7, '18': 8, '8': 9, '7': 10, '11': 11, '25': 12, '13': 13, '4': 14, '2': 15, '19': 16, '22': 17, '23': 18, '3': 19, '16': 20, '21': 21, '6': 22, '10': 23, '26': 24, '12': 25, '1': 26}
 Class_size : [24.0, 45.0]


In [None]:
df = pd.DataFrame(df)

In [None]:
df.head()

Unnamed: 0,0,1,2,3,4,5
0,1,18,19,1,1,3
1,2,1,19,1,1,3
2,1,18,19,2,3,3
3,1,5,15,2,2,3
4,2,10,11,2,3,3


In [None]:
df[5].unique()

array(['3', '2', '1'], dtype=object)

In [None]:
X = df.drop(5, axis=1)
y = df[5]

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=90)
rf_clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=90)

In [None]:
pred_test = rf_clf.predict(X_test)
clf_report_test = pd.DataFrame(classification_report(y_test, pred_test, output_dict=True))
print("Test Result:\n================================================")        
print(f"Accuracy Score: {accuracy_score(y_test, pred_test) * 100:.2f}%")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{clf_report_test}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred_test)}\n")
print("_______________________________________________")
print(f"f1_score: \n {f1_score(y_test, pred_test, average = 'micro')}\n")

Test Result:
Accuracy Score: 60.00%
_______________________________________________
CLASSIFICATION REPORT:
                   1          2          3  accuracy  macro avg  weighted avg
precision   0.500000   0.588235   0.888889       0.6   0.659041      0.660458
recall      0.705882   0.625000   0.470588       0.6   0.600490      0.600000
f1-score    0.585366   0.606061   0.615385       0.6   0.602270      0.602195
support    17.000000  16.000000  17.000000       0.6  50.000000     50.000000
_______________________________________________
Confusion Matrix: 
 [[12  5  0]
 [ 5 10  1]
 [ 7  2  8]]

_______________________________________________
f1_score: 
 0.6



### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
lr_lib = LogisticRegression(solver='liblinear') #good choice for smaller dataset
lr_lib_fit = lr_lib.fit(X_train,y_train)
lr_lib_fit

LogisticRegression(solver='liblinear')

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
# Didn't tune the hyperparameter
lr_lib_pred_test = lr_lib_fit.predict(X_test)
clf_report_test = pd.DataFrame(classification_report(y_test, lr_lib_pred_test, output_dict=True))
print("Test Result for liblinear solver:\n================================================")        
print(f"Accuracy Score: {accuracy_score(y_test, lr_lib_pred_test) * 100:.2f}%")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{clf_report_test}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, lr_lib_pred_test)}\n")
print("_______________________________________________")
print(f"f1_score: \n {f1_score(y_test, lr_lib_pred_test, average = 'micro')}\n")

Test Result for liblinear solver:
Accuracy Score: 36.00%
_______________________________________________
CLASSIFICATION REPORT:
                   1          2          3  accuracy  macro avg  weighted avg
precision   0.250000   0.388889   0.437500      0.36   0.358796      0.358194
recall      0.235294   0.437500   0.411765      0.36   0.361520      0.360000
f1-score    0.242424   0.411765   0.424242      0.36   0.359477      0.358431
support    17.000000  16.000000  17.000000      0.36  50.000000     50.000000
_______________________________________________
Confusion Matrix: 
 [[4 7 6]
 [6 7 3]
 [6 4 7]]

_______________________________________________
f1_score: 
 0.36



In [None]:
lr_saga = LogisticRegression(solver='saga') # Used for multi class problem
lr_saga_fit = lr_saga.fit(X_train,y_train)
lr_saga_fit



LogisticRegression(solver='saga')

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
# Didn't tune the hyperparameter
lr_saga_pred_test = lr_saga_fit.predict(X_test)
clf_report_test = pd.DataFrame(classification_report(y_test, lr_saga_pred_test, output_dict=True))
print("Test Result for saga solver:\n================================================")        
print(f"Accuracy Score: {accuracy_score(y_test, lr_saga_pred_test) * 100:.2f}%")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{clf_report_test}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, lr_saga_pred_test)}\n")
print("_______________________________________________")
print(f"f1_score: \n {f1_score(y_test, lr_saga_pred_test, average = 'micro')}\n")

Test Result for saga solver:
Accuracy Score: 32.00%
_______________________________________________
CLASSIFICATION REPORT:
                   1          2          3  accuracy  macro avg  weighted avg
precision   0.187500   0.411765   0.352941      0.32   0.317402      0.315515
recall      0.176471   0.437500   0.352941      0.32   0.322304      0.320000
f1-score    0.181818   0.424242   0.352941      0.32   0.319667      0.317576
support    17.000000  16.000000  17.000000      0.32  50.000000     50.000000
_______________________________________________
Confusion Matrix: 
 [[3 6 8]
 [6 7 3]
 [7 4 6]]

_______________________________________________
f1_score: 
 0.32



## Breast Cancer Coimbra Data Set.csv

In [None]:
data = pd.read_csv('../Dataset/Breast Cancer Coimbra Data Set.csv')
data = data.values.tolist()
for i in range(len(data)):
    data[i][-1] = str(data[i][-1])
attributes = ['Age','BMI','Glucose','Insulin','HOMA','Leptin','Adiponectin','Resistin	MCP.1','Classification']
value_type = ['numerical','numerical','numerical','numerical','numerical','numerical','numerical','numerical','label']

d = pre_process(data, attributes, value_type)
data = pd.DataFrame(d)


Age : [38.0]
BMI : [25.106252846666667, 31.842505693333337]
Glucose : [92.0]
Insulin : [21.107999999999997, 39.784]
HOMA : [2.241625267]
Leptin : [32.96733333333333, 61.623666666666665]
Adiponectin : [13.784013333333334, 25.91200666666667]
Resistin	MCP.1 : [29.506666666666668, 55.803333333333335]


In [None]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,2,1,1,1,1,1,1,1,468.786,1.0
1,2,1,1,1,1,1,2,1,554.697,1.0
2,2,1,1,1,1,1,1,1,928.22,1.0
3,2,1,1,1,1,1,1,1,773.92,1.0
4,2,1,1,1,1,1,1,1,530.41,1.0


In [None]:
X = data.drop(9, axis=1)
y = data[9]

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=90)
rf_clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=90)

In [None]:
# Predict the class label using X_test
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
pred_test = rf_clf.predict(X_test)
clf_report_test = pd.DataFrame(classification_report(y_test, pred_test, output_dict=True))
print("Test Result:\n================================================")        
print(f"Accuracy Score: {accuracy_score(y_test, pred_test) * 100:.2f}%")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{clf_report_test}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred_test)}\n")
print("_______________________________________________")
print(f"f1_score: \n {f1_score(y_test, pred_test, average = 'micro')}\n")

Test Result:
Accuracy Score: 68.42%
_______________________________________________
CLASSIFICATION REPORT:
                 1.0   2.0  accuracy  macro avg  weighted avg
precision   0.666667   0.7  0.684211   0.683333      0.684211
recall      0.666667   0.7  0.684211   0.683333      0.684211
f1-score    0.666667   0.7  0.684211   0.683333      0.684211
support    18.000000  20.0  0.684211  38.000000     38.000000
_______________________________________________
Confusion Matrix: 
 [[12  6]
 [ 6 14]]

_______________________________________________
f1_score: 
 0.6842105263157895



### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
lr_lib = LogisticRegression(solver='liblinear') #good choice for smaller dataset
lr_lib_fit = lr_lib.fit(X_train,y_train)
lr_lib_fit

LogisticRegression(solver='liblinear')

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
lr_lib_pred_test = lr_lib_fit.predict(X_test)
clf_report_test = pd.DataFrame(classification_report(y_test, lr_lib_pred_test, output_dict=True))
print("Test Result for liblinear solver:\n================================================")        
print(f"Accuracy Score: {accuracy_score(y_test, lr_lib_pred_test) * 100:.2f}%")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{clf_report_test}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, lr_lib_pred_test)}\n")
print("_______________________________________________")
print(f"f1_score: \n {f1_score(y_test, lr_lib_pred_test, average = 'micro')}\n")

Test Result for liblinear solver:
Accuracy Score: 55.26%
_______________________________________________
CLASSIFICATION REPORT:
                 1.0        2.0  accuracy  macro avg  weighted avg
precision   0.533333   0.565217  0.552632   0.549275      0.550114
recall      0.444444   0.650000  0.552632   0.547222      0.552632
f1-score    0.484848   0.604651  0.552632   0.544750      0.547903
support    18.000000  20.000000  0.552632  38.000000     38.000000
_______________________________________________
Confusion Matrix: 
 [[ 8 10]
 [ 7 13]]

_______________________________________________
f1_score: 
 0.5526315789473685



In [None]:
lr_saga = LogisticRegression(solver='saga') # Used for multi class problem
lr_saga_fit = lr_saga.fit(X_train,y_train)
lr_saga_fit



LogisticRegression(solver='saga')

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
lr_saga_pred_test = lr_saga_fit.predict(X_test)
clf_report_test = pd.DataFrame(classification_report(y_test, lr_saga_pred_test, output_dict=True))
print("Test Result for saga solver:\n================================================")        
print(f"Accuracy Score: {accuracy_score(y_test, lr_saga_pred_test) * 100:.2f}%")
print("_______________________________________________")
print(f"CLASSIFICATION REPORT:\n{clf_report_test}")
print("_______________________________________________")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, lr_saga_pred_test)}\n")
print("_______________________________________________")
print(f"f1_score: \n {f1_score(y_test, lr_saga_pred_test, average = 'micro')}\n")

Test Result for saga solver:
Accuracy Score: 52.63%
_______________________________________________
CLASSIFICATION REPORT:
            1.0        2.0  accuracy  macro avg  weighted avg
precision   0.0   0.526316  0.526316   0.263158      0.277008
recall      0.0   1.000000  0.526316   0.500000      0.526316
f1-score    0.0   0.689655  0.526316   0.344828      0.362976
support    18.0  20.000000  0.526316  38.000000     38.000000
_______________________________________________
Confusion Matrix: 
 [[ 0 18]
 [ 0 20]]

_______________________________________________
f1_score: 
 0.5263157894736842



  _warn_prf(average, modifier, msg_start, len(result))
