CLUSTERING
Cluster passengers into groups of ticket class (1, 2, 3)

DATA PREPARATION

In [53]:
import numpy as np  
from numpy import random
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import cluster
from sklearn.metrics import mean_squared_error, r2_score, silhouette_score
from sklearn.model_selection import train_test_split
import math

In [35]:
import os
os.environ["OMP_NUM_THREADS"] = '4'
import warnings
warnings.filterwarnings('ignore')

In [None]:
train = pd.read_csv('Dataset/train_preprocessing.csv')
test = pd.read_csv('Dataset/test_preprocessing.csv')
gender_submission = pd.read_csv('Dataset/gender_submission.csv')
# Merge the test dataframe with the gender_submission dataframe on 'PassengerId'
test = pd.merge(test, gender_submission[['PassengerId', 'Survived']], on='PassengerId', how='left')
test = test.drop(['Survived_x'], axis = 1)
test.rename(columns={'Survived_y': 'Survived'}, inplace=True)
print(test.head())
train = train[train['Survived'] != 'U']
test = test[test['Survived'] != 'U']
train['Survived'] = train['Survived'].astype(float)
test['Survived'] = test['Survived'].astype(float)
validation, test = train_test_split(test, test_size=2/3, random_state=42)

   Unnamed: 0  index  PassengerId  Pclass  \
0         893      2          894       2   
1         894      3          895       3   
2         895      4          896       3   
3         896      5          897       3   
4         897      6          898       3   

                                           Name  Sex   Age  SibSp  Parch  \
0                     Myles, Mr. Thomas Francis    1  62.0      0      0   
1                              Wirz, Mr. Albert    1  27.0      0      0   
2  Hirvonen, Mrs. Alexander (Helga E Lindqvist)    0  22.0      1      1   
3                    Svensson, Mr. Johan Cervin    1  14.0      0      0   
4                          Connolly, Miss. Kate    0  30.0      0      0   

    Ticket     Fare  Cabin  Embarked  Title  Survived  
0   240276   9.6875      8         1      2         0  
1   315154   8.6625      8         2      2         0  
2  3101298  12.2875      8         2      3         1  
3     7538   9.2250      8         2      2     

In [None]:
X_train = train[['Survived', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked', 'Title']].values
X_train

array([[ 0.,  1., 22., ...,  8.,  2.,  2.],
       [ 1.,  0., 38., ...,  2.,  0.,  3.],
       [ 1.,  0., 26., ...,  8.,  2.,  1.],
       ...,
       [ 0.,  0., 18., ...,  8.,  2.,  1.],
       [ 1.,  1., 26., ...,  2.,  0.,  2.],
       [ 0.,  1., 32., ...,  8.,  1.,  2.]])

K-Means - Scikit Learn

In [None]:
k = 3
model = cluster.KMeans(n_clusters=k, init='random')
model.fit(X_train)

In [None]:
# Calculate and print the cardinality of each cluster
labels, counts = np.unique(model.labels_, return_counts=True)

# Visualize the cardinality of each cluster
plt.bar(labels, counts)
plt.xlabel('Cluster')
plt.ylabel('Number of Samples')
plt.title('Cardinality of Each Cluster')
plt.show()

# Calculate and print the sum-of-square error (SSE)
sse = model.inertia_
print(f'Sum-of-Square Error (SSE): {sse}')

# Calculate and print the silhouette score
silhouette_avg = silhouette_score(X_train, model.labels_)
print(f'Silhouette Score: {silhouette_avg}')

In [None]:
# Tuning to find the best number of clusters
sse = []
sil = []
min_sse = 1e9
min_sil = 1e9
k_values = range(2, 31)
for k in k_values:
    model = cluster.KMeans(n_clusters=k, init='random', n_init='auto')
    model.fit(X_train)
    sse.append(model.inertia_)
    min_sse = min(min_sse, sse[-1])
    sil.append(silhouette_score(X_train, model.labels_))
    min_sil = min(min_sil, sil[-1])
    

# Plot SSE against k
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(k_values, sse, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Sum-of-Square Error (SSE)')
plt.title('SSE')

# Plot silhouette score against k
plt.subplot(1, 2, 2)
plt.plot(k_values, sil, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score')

plt.tight_layout()
plt.show()
print(f"\t\tMin SSE: {min_sse} \t\t\t    Min Silhouette: {min_sil}")

KMeans - Scratch

In [99]:
class KMeans_Model():
    def __init__(self, k=3, d=2, max_iter=1000, e=1e-4):
        self.k = k
        self.d = d
        self.c = []
        self.label = []
        self.number_label = []
        self.max_iter = max_iter
        self.e = e
        self.inertia_ = None

    def fit(self, X):
        self.c = [0] * self.k
        self.label = [-1] * len(X)
        self.number_label = [0] * self.k
        # Initialization
        car, dim = X.shape
        for i in range(self.k):
            self.c[i] = self.random_center(dim)
        
        for m in range(self.max_iter):
            # Assign x to the cluster
            self.number_label = [0] * self.k
            for i in range(car):
                min_dist = 1e10
                min_c = -1
                for j in range(self.k):
                    dist = self.dist(self.c[j], X[i])
                    if min_dist > dist:
                        min_dist = dist
                        min_c = j
                self.label[i] = min_c
                self.number_label[min_c] += 1
                    
            # Recompute centroid:
            sum = [np.zeros(dim) for _ in range(self.k)]
            new_c = [np.zeros(dim) for _ in range(self.k)]
            for i in range(car):
                sum[self.label[i]] += X[i]
            for i in range(self.k):
                if self.number_label[i] > 0:
                    new_c[i] = sum[i] / self.number_label[i]
            # Stop criteria
            if self.has_converged(self.c, new_c, self.e):
                # Compute inertia
                sse = 0
                for l in range(car):
                    for g in range(k):
                        sse += self.dist(self.c[g], X[l])
                self.inertia_ = sse
                return
            else:
                self.c = new_c


    def has_converged(self, centroids, new_centroids, threshold=1e-4):
        for c, nc in zip(centroids, new_centroids):
            if np.linalg.norm(c - nc) > threshold:
                return False
        return True

    def dist(self, x, y):
        if self.d == 1:
            return np.linalg.norm(x - y, ord=1)
        elif self.d == 2:
            return np.linalg.norm(x - y, ord=2)
        else:
            return np.linalg.norm(x - y, ord=np.inf)
        
    def random_center(self, size):
        return np.random.uniform(size=size)

In [103]:
model = KMeans_Model()
model.fit(X_train)
print(model.number_label)
print(model.inertia_)

[array([0.51483003, 0.89509763, 0.36460229, 0.72586584, 0.81292147,
       0.5465937 , 0.99441687, 0.40947778, 0.15280912]), array([0.72016054, 0.69683702, 0.5173287 , 0.54412377, 0.88548634,
       0.95232537, 0.86308263, 0.59711822, 0.36878434]), array([0.86681617, 0.21281614, 0.772139  , 0.20203204, 0.206756  ,
       0.24055571, 0.13405973, 0.62464848, 0.31031668])]
[729, 20, 140]
305911.18737466907
