In [None]:

##Mohsin
#Assignment 12 Glass 

import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
# Silence every warning category for the rest of the notebook.
warnings.filterwarnings(action="ignore")

# Load data: read the glass measurements from the CSV in the working directory.
data = pd.read_csv(filepath_or_buffer="glass.csv")




In [None]:
# Number of missing entries in each column.
data.isna().sum()

In [None]:
# Show the pandas dtype of each column.
data.dtypes

In [None]:
# Summary statistics for each column of the loaded data (interpreted in the prose below).
data.describe()

From the statistical summary, we can observe the following insights:

The dataset contains 214 samples/observations.
The mean value of RI (refractive index) is around 1.52.
The mean value of Ca (calcium) is the highest among all the features.
The standard deviation (std) is high for K (potassium), Ca, and Ba (barium) features.
The minimum and maximum values for each feature are also listed, which gives an idea of the range of values the features can take.
The quartiles (25%, 50%, and 75%) give an idea about the distribution of the values. For example, the median value (50%) of RI is 1.51768, which indicates that half of the observations have RI less than 1.51768, and the other half have RI greater than this value.

In [None]:
# Print pandas' concise summary of the frame.
data.info()

In [None]:
# Pairplot of all variables by glass type:
# one scatter panel per pair of columns, points coloured by the 'Type' label.
pair_grid = sns.pairplot(data=data, hue='Type')

# y > 1 places the figure-level title just above the top row of panels.
plt.suptitle('Pairplot of All Variables by Glass Type', y=1.02)
plt.show()


The pairplot shows the relationships between all pairs of variables, with each point colored by the glass type. Some observations from the pairplot:

Glass types 1, 2, and 3 have similar ranges and distributions for most variables, with type 3 having slightly higher magnesium and lower aluminum values.
Glass types 5 and 6 have much higher potassium values and lower aluminum and calcium values compared to other types.
There is some separation between the types for certain variables, such as refractive index and magnesium content.
There are some strong correlations between variables, such as between refractive index and calcium content, and between aluminum and magnesium content.

In [None]:
# Heatmap of the pairwise correlations between all columns,
# with each coefficient written inside its cell.
corr_matrix = data.corr()
sns.heatmap(data=corr_matrix, annot=True, cmap='coolwarm')
plt.title('Heatmap of Correlation Matrix')
plt.show()


The heatmap shows the correlation coefficients between pairs of variables, with darker shades indicating stronger positive or negative correlations. Some observations from the heatmap:

Refractive index is strongly positively correlated with calcium content and moderately positively correlated with barium content.
Aluminum and magnesium content are strongly negatively correlated.
Potassium content is strongly negatively correlated with calcium and barium content.

In [None]:
# Distribution of the refractive index (RI) within each glass type.
# seaborn (sns) and matplotlib (plt) are already imported in the first cell,
# so they are not re-imported here.
sns.boxplot(data=data, x='Type', y='RI')
plt.title('Boxplot of Refractive Index by Glass Type')
plt.show()



In [None]:
#Observing the Outliers

# One boxplot per column, filling a 3x4 grid of axes row by row
# (axs.flat walks the grid in row-major order).
fig, axs = plt.subplots(nrows=3, ncols=4, figsize=(15,10))
for position, column in enumerate(data.columns):
    sns.boxplot(y=column, data=data, ax=axs.flat[position])
plt.tight_layout()
plt.show()



In [None]:
# Removing outliers with the IQR rule.
#
# The fences are computed on the measured feature columns only. 'Type' is the
# class label, not a measurement: including it in the test would discard rows
# purely because of their class code instead of their measured values.
feature_cols = data.columns.drop('Type')

Q1 = data[feature_cols].quantile(0.25)
Q3 = data[feature_cols].quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# A row is an outlier if any of its feature values lies outside the fences.
is_outlier = (data[feature_cols] < lower_fence) | (data[feature_cols] > upper_fence)
data = data[~is_outlier.any(axis=1)]


This code removes the outliers from the dataset using the IQR (Interquartile range) method.

First, the 25th and 75th percentiles (Q1 and Q3) of each column of the dataset are calculated using the quantile() method.
Then, the IQR is calculated as the difference between Q3 and Q1.
The outliers are identified as the data points that are below $Q_1 - 1.5 \times \mathrm{IQR}$ or above $Q_3 + 1.5 \times \mathrm{IQR}$ for each column.
Finally, the any() method is used to find the rows that contain any outlier in any column, and the ~ operator is used to select the rows that do not have any outliers. These rows are then stored back in the data variable, effectively removing the outliers from the dataset.

In [None]:
# Summary statistics of `data` again, now that the outlier filter has been applied.
data.describe()

After removing the outliers, the dataset in the recorded run contains 136 observations instead of 214.

The mean and standard deviation values for the various features have also changed. For example, the mean value of RI has increased slightly, while the mean value of Mg has increased significantly. The standard deviation values for most features have decreased, indicating that the data points are more tightly clustered around the mean.

The maximum value for Ba is now 0, indicating that no observations with non-zero Ba values remain in the dataset after removing the outliers.

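Overall, the dataset now appears to be more representative of the majority of the glass samples, as the removed rows were likely anomalies that were skewing the data. Note, however, that the filter removed 78 of the 214 rows (about 36%) in the recorded run, so a substantial share of the original samples is no longer represented.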

In [None]:
# Prepare data for KNN
X = data.drop('Type', axis=1)
y = data['Type']

# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)



In [None]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Using Hyperparameter Tuning 

In [None]:


# Grid search over KNN hyperparameters.
#
# Each (k, weights) candidate is scored by its mean 5-fold cross-validated
# accuracy on the TRAINING data. The test set is deliberately not used here:
# choosing hyperparameters by test accuracy would make the final test score an
# optimistic estimate. Note that the accuracy printed below is therefore the
# cross-validated training-set accuracy, not the test-set accuracy.
k_values = [1, 3, 5, 7, 9, 11, 13, 15]
weights = ['uniform', 'distance']
best_accuracy = 0
best_k = 0
best_weight = ''

for k in k_values:
    for w in weights:
        knn = KNeighborsClassifier(n_neighbors=k, weights=w)
        # cross_val_score fits a fresh copy of the estimator on each fold.
        accuracy = cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy').mean()
        # Strict '>' keeps the earliest (smallest k) candidate when scores tie.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_k = k
            best_weight = w

print(f"Best accuracy: {best_accuracy:.2f} with k={best_k} and weights='{best_weight}'")

In [None]:
# Create a new instance of KNeighborsClassifier with the best hyperparameters.
# best_k and best_weight come from the tuning loop in the previous cell, so this
# model always matches the search result instead of a hard-coded copy of it.
# (KNeighborsClassifier is already imported in the first cell.)
knn = KNeighborsClassifier(n_neighbors=best_k, weights=best_weight)

# Fit the model to the training data
knn.fit(X_train, y_train)


In [None]:
# Final evaluation on the held-out test set.
# accuracy_score, confusion_matrix and classification_report are imported in
# the first cell of the notebook.

# Train KNN with the best hyperparameters found by the tuning loop
# (best_k / best_weight), rather than hard-coded values.
knn = KNeighborsClassifier(n_neighbors=best_k, weights=best_weight)
knn.fit(X_train, y_train)

# Predict on the test set
y_pred = knn.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n", cm)

# Calculate classification report (per-class precision, recall and f1-score)
report = classification_report(y_test, y_pred)
print("Classification report:\n", report)


In the recorded run, the KNN model achieved an accuracy of 78.57% on the held-out test set for the glass classification task, which is a relatively good performance. The confusion matrix shows that the model correctly identified most of the glass types, with only a few misclassifications. The classification report provides a detailed breakdown of the model's performance on each class, including precision, recall, and f1-score. Overall, this is a positive outcome that suggests the KNN model can be used effectively for glass classification.