In [1]:
import pandas as pd
import numpy as np
from typing import Any
try:
    from typing import Self
except ImportError:
    from typing_extensions import Self

In [2]:
def array_to_str(arr):
    if arr.ndim == 1:
        return ','.join(map(str, np.around(arr, 4).tolist()))
    elif arr.ndim == 2:
        return '\n'.join((','.join(map(str, x)) for x in np.around(arr, 4).tolist()))
    else:
        raise ValueError('Only 1 and 2-d arrays suppported')

def write(file: str, data: Any):
    with open(file, 'w') as f:
        f.write(data if isinstance(data, str) else str(data))

def rescale(x: np.ndarray) -> np.ndarray:
    min_ = np.min(x)
    range_ = np.ptp(x)
    return (x - min_) / range_

In [3]:
df = pd.read_csv('time_series_covid19_deaths_US_raw.csv')
raw_data = df.to_numpy()

In [4]:
cum_data = np.cumsum(raw_data, axis=0)
data = np.diff(cum_data)

del raw_data

In [5]:
wisc_cal_data = cum_data[(4, 48), :]
q1_str = array_to_str(wisc_cal_data)
write('q1.txt', q1_str)

del cum_data
del q1_str

In [6]:
q2_data = data[(4, 48), :] # 4-> cal, 48 -> wisco
q2_str = array_to_str(q2_data)
write('q2.txt', q2_str)

del q2_data
del q2_str

In [7]:
write('q3.txt', '''Mean of the time-differenced data
Standard deviation of the time-differenced data
Median of the time-differenced data
Linear trend coefficient of the data
Auto-correlation of the data''')

In [8]:
mean = np.mean(data, axis=1)
mean = rescale(mean)

In [9]:
std = np.std(data, axis=1)
std = rescale(std)

In [10]:
median = np.median(data, axis=1)
median = rescale(median)

In [11]:
ltc = np.array([sum((x[i] - mean[idx]) * (i + 1 - (half_length := ((data.shape[-1] + 1) / 2))) for i in range(len(x))) for idx, x in enumerate(data.tolist())]) / sum((i+1 - half_length) ** 2 for i in range(data.shape[-1]))
ltc = rescale(ltc)

In [12]:
ac = np.array([ sum((x[i] * x[i-1] for i in range(1, len(x)))) / np.sum(np.power(x, 2)) for x in (data - mean.reshape(50, -1)).tolist()])
ac = rescale(ac)

In [13]:
parametric_data = np.empty((50, 5))
parametric_data[:, 0] = mean
parametric_data[:, 1] = std
parametric_data[:, 2] = median
parametric_data[:, 3] = ltc
parametric_data[:, 4] = ac

parametric_data
q4_str = array_to_str(parametric_data)
write('q4.txt', q4_str)
del q4_str

In [14]:
class HierarchicalClustering:
    class Clustering:
        @staticmethod
        def _compute_distances(X: np.ndarray) -> np.ndarray:
            n = X.shape[0]
            distance_matrix = np.zeros((n, n))
        
            for i in range(n):
                for j in range(i + 1, n):
                    distance_matrix[i, j] = np.linalg.norm(X[i] - X[j])
        
            distance_matrix += distance_matrix.T
        
            return distance_matrix
        
        @staticmethod
        def _single(distance_matrix: np.ndarray, num_clusters: int) -> list:
            num_points = distance_matrix.shape[0]
            clusters = [[i] for i in range(num_points)]
        
            while len(clusters) > num_clusters:
                min_distance = np.inf
                merge_index1, merge_index2 = None, None
        
                for i in range(len(clusters)):
                    for j in range(i + 1, len(clusters)):
                        cluster1 = clusters[i]
                        cluster2 = clusters[j]
        
                        distance = np.min(distance_matrix[np.ix_(cluster1, cluster2)])
        
                        if distance < min_distance:
                            min_distance = distance
                            merge_index1, merge_index2 = i, j
        
                # Merge the two closest clusters
                clusters[merge_index1].extend(clusters[merge_index2])
                del clusters[merge_index2]
        
                combined_cluster = clusters[merge_index1]
                for i in range(distance_matrix.shape[0]):
                    if i in combined_cluster:
                        continue
                    distance = np.min(distance_matrix[np.ix_(combined_cluster, [i])])
                    distance_matrix[i, combined_cluster] = distance
                    distance_matrix[combined_cluster, i] = distance
        
            return tuple(map(tuple, clusters))
    
        @staticmethod
        def _complete(distance_matrix: np.ndarray, n_clusters: int) -> list:
            num_points = distance_matrix.shape[0]
            clusters = [[i] for i in range(num_points)]
        
            while len(clusters) > n_clusters:
                min_max_distance = np.inf
                merge_index1, merge_index2 = None, None
        
                for i in range(len(clusters)):
                    for j in range(i + 1, len(clusters)):
                        cluster1 = clusters[i]
                        cluster2 = clusters[j]
        
                        max_distance = np.max(distance_matrix[np.ix_(cluster1, cluster2)])
        
                        if max_distance < min_max_distance:
                            min_max_distance = max_distance
                            merge_index1, merge_index2 = i, j
        
                clusters[merge_index1].extend(clusters[merge_index2])
                del clusters[merge_index2]
        
                combined_cluster = clusters[merge_index1]
                for i in range(distance_matrix.shape[0]):
                    if i in combined_cluster:
                        continue
                    distance = np.max(distance_matrix[np.ix_(combined_cluster, [i])])
                    distance_matrix[i, combined_cluster] = distance
                    distance_matrix[combined_cluster, i] = distance
        
            return clusters
        
    _METHODS = {
        'single': Clustering._single,
        'complete': Clustering._complete,
    }

    def __init__(self, n_clusters: int = 2, linkage: str = 'single'):
        self._n_clusters = n_clusters
        
        assert HierarchicalClustering._is_linkage_valid(linkage), (
            f'Invalid Linkage, valid linkages are \'single\' and '
            f'\'complete\', found {linkage!r}'
        )
        
        self._linkage = linkage

    def _build(self, X: np.ndarray):
        distance_matrix = HierarchicalClustering.Clustering._compute_distances(X)
        cluster_func = HierarchicalClustering._METHODS.get(self._linkage)
        if cluster_func is None:
            raise ValueError(
                f'Invalid Linkage, the Hierarchical Clustering instance '
                f'may have been tampered with. found {self.linkage!r}'
            )

        self._clusters = cluster_func(distance_matrix, self._n_clusters)

    def fit(self, X: np.ndarray) -> Self:
        self._build(X)
        return self

    def predict(self, X: np.ndarray):
        pass
    
    @property
    def clusters(self):
        return self._clusters

    @clusters.setter
    def clusters(self, value):
        raise ValueError('The `clusters` attribute should not be modified externally!')
    
    @staticmethod
    def _is_linkage_valid(linkage: str) -> bool:
        return linkage in HierarchicalClustering._METHODS.keys()

In [15]:
slhc = HierarchicalClustering(n_clusters = 5)
slhc = slhc.fit(parametric_data)

idxs = np.arange(parametric_data.shape[0])

for cluster_idx, cluster in enumerate(slhc.clusters):
    for item in cluster:
        idxs[item] = cluster_idx

q5_str = array_to_str(idxs)
write('q5.txt', q5_str)
del q5_str

In [16]:
clhc = HierarchicalClustering(n_clusters = 5, linkage='complete')
clhc = clhc.fit(parametric_data)

idxs = np.arange(parametric_data.shape[0])

for cluster_idx, cluster in enumerate(clhc.clusters):
    for item in cluster:
        idxs[item] = cluster_idx

q6_str = array_to_str(idxs)
write('q6.txt', q6_str)
del q6_str

In [17]:
class KMeans:
    class Clustering:
        @staticmethod
        def assign_points(centroids: np.ndarray, X: np.ndarray) -> np.ndarray:
            distances = KMeans.Clustering.compute_distances(centroids, X)

            return np.argmin(distances, axis=1)
        
        @staticmethod
        def choose_centroids(X: np.ndarray, k: int = 2) -> np.ndarray:
            return np.random.default_rng().choice(
                X,
                size=k,
                replace=False,
                shuffle=False
            )

        @staticmethod
        def compute_centroids(X: np.ndarray, assignments: np.ndarray, k: int) -> np.ndarray:
            centroids = np.empty((k, X.shape[1]))
            for i in range(k):
                idxs = assignments == i
                centroids[i] = np.mean(X[idxs], axis=0)
            return centroids
        
        @staticmethod
        def compute_distances(centroids: np.ndarray, X: np.ndarray) -> np.ndarray:
            return np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)
    
    def __init__(self, k: int = 2, max_iters: int = 100, tol: float = 1e-5, ):
        self._k = k
        self._max_iters = max_iters
        self._tol = abs(tol)
    
    def _build(self, X: np.ndarray):
        self._centroids = KMeans.Clustering.choose_centroids(X, self.k)

        old_centroids = np.empty_like(self._centroids)
        for _ in range(self._max_iters):
            np.copyto(old_centroids, self._centroids)
            
            assignments = KMeans.Clustering.assign_points(self._centroids, X)

            self._centroids = KMeans.Clustering.compute_centroids(X, assignments, self.k)

            if np.all(np.abs(self.centroids - old_centroids) < self._tol):
                self._converged_in = _
                break
    
    def fit(self, X: np.ndarray) -> Self:
        self._build(X)
        return self
    
    def predict(self, X: np.ndarray) -> np.ndarray:
        return KMeans.Clustering.assign_points(self._centroids, X)

    def distortion(self, X: np.ndarray) -> np.ndarray:
        assignments = self.predict(X)
        _distortion = 0
        
        for i in range(centroids.shape[0]):
            idxs = assignments == i
            _distortion += np.sum((X[idxs] - self._centroids[i]) ** 2)
    
        return _distortion
    
    @property
    def centroids(self):
        return self._centroids
    
    @centroids.setter
    def centroids(self):
        raise ValueError('The `centroids` attribute should not be modified externally!')
    
    @property
    def k(self):
        return self._k
    
    @k.setter
    def k(self):
        raise ValueError('The `k` attribute should not be modified externally!')

In [18]:
kmeans = KMeans(k=5)
kmeans.fit(parametric_data)

<__main__.KMeans at 0x2b222455690>

In [19]:
predictions = kmeans.predict(parametric_data)
print(predictions)

q7_str = array_to_str(predictions)
write('q7.txt', q7_str)

[1 1 1 1 4 4 4 4 4 4 4 4 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3]


In [20]:
centroids = kmeans.centroids
print(centroids)

q8_str = array_to_str(centroids)
write('q8.txt', q8_str)

[[0.3841545  0.4013964  0.40084187 0.65194206 0.78788545]
 [0.01910688 0.0263451  0.01608292 0.98534334 0.09319708]
 [0.53421437 0.53750169 0.5568263  0.51533993 0.85327648]
 [0.84635037 0.84669477 0.85448253 0.09738579 0.97692389]
 [0.21272193 0.24230699 0.22399929 0.80436664 0.73502808]]


In [21]:
distortion = kmeans.distortion(parametric_data)
print(distortion)

q9_str = str(np.around(distortion, 4))
write('q9.txt', q9_str)

1.2217977243933051


In [22]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=5, init='random', n_init='auto')
kmeans.fit(parametric_data)

print('kmeans.labels_', array_to_str(kmeans.predict(parametric_data)), sep='\n')
print('kmeans.cluster_centers_', array_to_str(kmeans.cluster_centers_), sep='\n')
print('kmeans.inertia_', round(kmeans.inertia_, 4), sep='\n')

kmeans.labels_
4,4,4,4,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3
kmeans.cluster_centers_
0.5008,0.5065,0.5228,0.5458,0.8445
0.2685,0.295,0.2807,0.755,0.7447
0.7589,0.7591,0.768,0.1662,0.9635
0.9666,0.9672,0.9734,0.0028,0.9954
0.0191,0.0263,0.0161,0.9853,0.0932
kmeans.inertia_
0.8499


In [23]:
# Number of iterations required to converge
kmeans.n_iter_

7

In [24]:
parametric_data

array([[0.        , 0.        , 0.        , 0.9986107 , 0.        ],
       [0.00135718, 0.00114574, 0.00285919, 1.        , 0.03001297],
       [0.03158952, 0.04806661, 0.02287348, 0.97337978, 0.05936597],
       [0.0434808 , 0.05616804, 0.038599  , 0.96938287, 0.28340939],
       [0.1358702 , 0.17360164, 0.13080772, 0.86019578, 0.77532947],
       [0.14882183, 0.17734009, 0.15010722, 0.8482823 , 0.80630675],
       [0.15998246, 0.1848012 , 0.16726233, 0.82087323, 0.83911766],
       [0.1630183 , 0.18659993, 0.17441029, 0.81817872, 0.84661001],
       [0.24233917, 0.271905  , 0.26161544, 0.7920042 , 0.61883036],
       [0.28114474, 0.31229606, 0.30092924, 0.76432497, 0.66077738],
       [0.28282614, 0.31355672, 0.30235883, 0.76617046, 0.66353116],
       [0.28777262, 0.31835526, 0.30450322, 0.76490343, 0.66972189],
       [0.32567128, 0.34922068, 0.34381701, 0.70054802, 0.7175366 ],
       [0.34952234, 0.37023244, 0.3645461 , 0.67713332, 0.75670943],
       [0.35931758, 0.38041995, 0.

In [25]:
parametric_data[kmeans.labels_==0]

array([[0.40220838, 0.41797965, 0.41887062, 0.63807578, 0.80524818],
       [0.40488255, 0.4200107 , 0.42101501, 0.64167088, 0.80938536],
       [0.41999233, 0.4305089 , 0.43959971, 0.62234024, 0.82738159],
       [0.44221587, 0.44811561, 0.47105075, 0.58586659, 0.84766576],
       [0.48076206, 0.48191522, 0.50893495, 0.56558417, 0.83349638],
       [0.49434297, 0.49406205, 0.51822731, 0.55705013, 0.83909093],
       [0.5065539 , 0.50718339, 0.53109364, 0.53896924, 0.84496258],
       [0.52744127, 0.53178948, 0.54824875, 0.53523024, 0.84856725],
       [0.53077668, 0.53507067, 0.55468192, 0.53373388, 0.84907256],
       [0.53528477, 0.53972528, 0.5582559 , 0.5326905 , 0.852497  ],
       [0.54617324, 0.5520883 , 0.56754825, 0.52649688, 0.85067251],
       [0.5489159 , 0.55479058, 0.57112223, 0.52623001, 0.85228805],
       [0.58180872, 0.58543468, 0.60185847, 0.41930996, 0.87949568],
       [0.59008421, 0.5929572 , 0.60829164, 0.41810427, 0.88262189]])

In [26]:
parametric_data[kmeans.labels_==0].mean(axis=1)

array([0.53647652, 0.5393929 , 0.54796455, 0.55898292, 0.57413856,
       0.58055467, 0.58575255, 0.5982554 , 0.60066714, 0.60369069,
       0.60859584, 0.61066936, 0.6135815 , 0.61841184])