# Glass

## reference

In [115]:
# https://www.kaggle.com/datasets/uciml/glass/code

# Attribute Information:

#     Id number: 1 to 214 (dropped from the Kaggle CSV, but present as the `ID` column in the CSV loaded below)
#     RI: refractive index
#     Na: Sodium (unit measurement: weight percent in corresponding oxide, as are attributes 4-10)
#     Mg: Magnesium
#     Al: Aluminum
#     Si: Silicon
#     K: Potassium
#     Ca: Calcium
#     Ba: Barium
#     Fe: Iron
#     Type of glass: (class attribute)
#     -- 1 building_windows_float_processed
#     -- 2 building_windows_non_float_processed
#     -- 3 vehicle_windows_float_processed
#     -- 4 vehicle_windows_non_float_processed (none in this database)
#     -- 5 containers
#     -- 6 tableware
#     -- 7 headlamps


## Import Libraries

In [116]:
import pandas as pd
import numpy as np
import re
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample

## Import Dataset

In [117]:
# Location of the raw glass CSV; it is read straight from this URL.
path = (
  'https://raw.githubusercontent.com/notfakearcher/julian/main/02_data/glass.csv'
)
# Parse the CSV into a DataFrame
glass = pd.read_csv(filepath_or_buffer = path)
# Preview the first five rows
glass.head(n = 5)

Unnamed: 0,ID,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,2,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,4,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,5,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [118]:
# Print each column's dtype and non-null count to check for missing values
glass.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      214 non-null    int64  
 1   RI      214 non-null    float64
 2   Na      214 non-null    float64
 3   Mg      214 non-null    float64
 4   Al      214 non-null    float64
 5   Si      214 non-null    float64
 6   K       214 non-null    float64
 7   Ca      214 non-null    float64
 8   Ba      214 non-null    float64
 9   Fe      214 non-null    float64
 10  Type    214 non-null    int64  
dtypes: float64(9), int64(2)
memory usage: 18.5 KB


## check target labels distribution

In [119]:
# Number of rows per glass type (most frequent first) -- shows how imbalanced the target is
num_type = glass['Type'].value_counts()
num_type

2    76
1    70
7    29
3    17
5    13
6     9
Name: Type, dtype: int64

In [151]:
# Number of rows each glass type is short of the largest class,
# i.e. how many rows have to be added per type to balance the classes.
# Derived here from `num_type` so this cell also works on a fresh kernel
# when the notebook is run top to bottom.
add_num = (num_type.max() - num_type).sort_index()
add_num

1     6
2     0
3    59
5    63
6    67
7    47
Name: Type, dtype: int64

## Resample: oversample the minority classes

In [168]:
# Balance the target by random oversampling: every glass type is topped up,
# by drawing rows with replacement, to the size of the most frequent type.
RESAMPLE_SEED = 42
# one seeded generator shared by all draws, so re-running this cell gives the same rows
resample_rng = np.random.RandomState(RESAMPLE_SEED)

# class counts are recomputed here so the cell does not depend on variables from other cells
type_counts = glass.Type.value_counts()
# find the max number of type of target
num_max = type_counts.max()
# find how many samples do we have to add for each type
add_num = (num_max - type_counts).sort_index()

# collect the extra rows per type and concatenate once at the end
upsampled_parts = []
for i, n_extra in add_num.items():
  # the largest class needs no extra rows; asking resample for 0 rows is
  # also rejected by newer scikit-learn versions, so skip it
  if n_extra <= 0:
    continue
  # rows of the current type
  sample_i = glass[glass.Type == i]
  # draw the missing rows with replacement
  upsampled_parts.append(
    resample(sample_i, replace = True, n_samples = int(n_extra), random_state = resample_rng)
  )

# original rows first, then the extra rows for each type in ascending type order;
# drawn rows keep their original index labels, so labels repeat in glass_2
glass_2 = pd.concat([glass] + upsampled_parts, axis = 0)

# every type must now have as many rows as the largest class
assert glass_2.Type.value_counts().eq(num_max).all(), "classes are not balanced after resampling"
glass_2

Unnamed: 0,ID,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.00,1
1,2,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.00,1
2,3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.00,1
3,4,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.00,1
4,5,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.00,1
...,...,...,...,...,...,...,...,...,...,...,...
209,210,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.00,7
209,210,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.00,7
205,206,1.51732,14.95,0.00,1.80,72.99,0.00,8.61,1.55,0.00,7
191,192,1.51602,14.85,0.00,2.38,73.28,0.00,8.76,0.64,0.09,7
