# Classifying Glass Types Using Random Forests

In [14]:
from math import sqrt, fabs, exp
import matplotlib.pyplot as plt
from sklearn.linear_model import enet_path
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from sklearn import ensemble
import numpy as np
import seaborn as sns

# apply seaborn's default theme to the matplotlib plots drawn in this notebook
sns.set()

In [8]:
# Arrange the data into a list of labels and a list of lists for attributes.
# Each comma-separated row is laid out as [ID, attribute..., label].
data_path = './data/glass.data'
x_list = []

with open(data_path) as data_file:
    for line in data_file:
        stripped = line.strip()
        # skip blank lines (e.g. a trailing newline at the end of the file);
        # otherwise they would turn into a sample with an empty label and
        # no attributes and break the matrix built below
        if not stripped:
            continue
        # split on comma
        x_list.append(stripped.split(','))

# column names: the attribute columns followed by the label column
glass_names = np.array(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type'])

# Separate attributes and labels
x_num = []
labels = []

for row in x_list:
    # the label is the last field; pop() also removes it from the row held in x_list
    labels.append(row.pop())
    # eliminate the leading ID and convert the remaining fields to float
    x_num.append([float(value) for value in row[1:]])

# number of rows and columns in x matrix (0 columns when the file is empty)
n_rows = len(x_num)
n_cols = len(x_num[0]) if x_num else 0

# Labels are integers from 1 to 7 with no examples of 4 (read here as strings).
# Re-encode them as consecutive integers starting at 0, in sorted order of the
# original label strings; the original note says 'gb' (presumably gradient
# boosting) requires this encoding.
label_set = set(labels)
label_list = sorted(label_set)
n_labels = len(label_list)

# dictionary lookup instead of a linear label_list.index() search per sample
label_to_index = {label: index for index, label in enumerate(label_list)}
new_labels = [label_to_index[label] for label in labels]

In [9]:
# Stratified sampling by labels: every class is split 70/30 on its own so the
# train and test sets keep the class proportions, then the per-class pieces
# are stacked in label order.
x_train_parts, x_test_parts = [], []
y_train_parts, y_test_parts = [], []

for i_label in range(len(label_list)):
    # segregate x and y according to labels
    x_temp = [x_num[i] for i in range(n_rows) if new_labels[i] == i_label]
    y_temp = [new_labels[i] for i in range(n_rows) if new_labels[i] == i_label]

    # form train and test sets on segregated subset of examples
    x_train_temp, x_test_temp, y_train_temp, y_test_temp = train_test_split(
        x_temp, y_temp, test_size=0.30, random_state=531)

    # accumulate the pieces; they are joined once after the loop instead of
    # re-copying the growing arrays with np.append on every class
    x_train_parts.append(x_train_temp)
    x_test_parts.append(x_test_temp)
    y_train_parts.append(y_train_temp)
    y_test_parts.append(y_test_temp)

x_train = np.concatenate(x_train_parts, axis=0)
x_test = np.concatenate(x_test_parts, axis=0)
y_train = np.concatenate(y_train_parts, axis=0)
y_test = np.concatenate(y_test_parts, axis=0)

misclass_error = []
n_tree_list = range(50, 2000, 50)

# forest hyper-parameters shared by every ensemble size in the sweep
depth = None
# try tweaking
max_feat = 4

for i_trees in n_tree_list:
    glass_rf_model = ensemble.RandomForestClassifier(n_estimators=i_trees, max_depth=depth, max_features=max_feat,
                                                    oob_score=False, random_state=531)
    glass_rf_model.fit(x_train, y_train)

    # accumulate misclassification error (1 - accuracy) on the test set
    prediction = glass_rf_model.predict(x_test)
    correct = accuracy_score(y_test, prediction)
    misclass_error.append(1.0 - correct)

# report the error of the last (largest) forest; `prediction` and
# `glass_rf_model` are left pointing at that same forest
print(f'Misclassification Error: {misclass_error[-1]}')

Misclassification Error: 0.2272727272727273


In [13]:
# Build the confusion matrix from `prediction` as left by the training loop.
# Per scikit-learn's convention, rows are the true classes and columns are the
# predicted classes, both indexed by the re-encoded integer labels.
p_list = prediction.tolist()
confusion_mat = confusion_matrix(y_true=y_test, y_pred=p_list)
print(f'Confusion Matrix:\n\n{confusion_mat}')

Confusion Matrix:

[[17  2  1  0  0  1]
 [ 2 18  1  2  0  0]
 [ 3  0  3  0  0  0]
 [ 0  0  0  4  0  0]
 [ 0  1  0  0  2  0]
 [ 0  2  0  0  0  7]]
