In [61]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [62]:
# Load the labelled CSV of each city into its own DataFrame.
# The files are read in the order listed and unpacked onto the matching names.
(
    beijing_data,
    chengdu_data,
    guangzhou_data,
    shanghai_data,
    shenyang_data,
) = map(
    pd.read_csv,
    (
        "Beijing_labeled.csv",
        "Chengdu_labeled.csv",
        "Guangzhou_labeled.csv",
        "Shanghai_labeled.csv",
        "Shenyang_labeled.csv",
    ),
)

In [63]:
# Training split comes from Beijing: PM_HIGH is the target, every other column is a feature.
y_train = beijing_data['PM_HIGH'].astype(int)
X_train = beijing_data.drop(columns='PM_HIGH')

# Validation split comes from Shenyang, built the same way.
y_val = shenyang_data['PM_HIGH'].astype(int)
X_val = shenyang_data.drop(columns='PM_HIGH')

# Fit the scaler on the training features only, then apply that same
# fitted scaler to the validation features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)


In [64]:
class Classifier:
    """Classifier that clusters the features with k-means and labels each cluster by majority vote.

    After fitting, ``cluster_labels[i]`` holds the most frequent training
    label among the points assigned to cluster ``i``. Prediction assigns each
    sample to its nearest cluster and returns that cluster's label.

    Labels must be non-negative integers, because ``np.bincount`` is used
    for the vote.
    """

    def __init__(self, n_clusters=2, random_state=0):
        """Create the underlying k-means model.

        Parameters
        ----------
        n_clusters : int
            Number of clusters passed to ``KMeans``.
        random_state : int or None
            Seed passed to ``KMeans``; defaults to 0.
        """
        self.kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
        self.cluster_labels = None

    def fit(self, X, y):
        """Cluster ``X`` and assign each cluster the majority label from ``y``.

        Parameters
        ----------
        X : array-like
            Training features, passed straight to ``KMeans.fit``.
        y : array-like of non-negative ints
            Training labels, aligned with the rows of ``X`` by position.

        Returns
        -------
        Classifier
            ``self``, so calls can be chained.
        """
        # Convert to a plain array so the lookups below are positional.
        # Indexing a pandas Series with integer positions is label-based and
        # fails (or selects the wrong rows) when its index is not 0..n-1.
        y = np.asarray(y).ravel()

        self.kmeans.fit(X)
        assignments = self.kmeans.labels_

        # A cluster that received no training points has nothing to vote
        # with; it falls back to the overall majority label instead of
        # raising on an empty bincount.
        fallback_label = np.bincount(y).argmax()
        self.cluster_labels = np.full(self.kmeans.n_clusters, fallback_label, dtype=int)

        for cluster_id in range(self.kmeans.n_clusters):
            member_labels = y[assignments == cluster_id]
            if member_labels.size:
                self.cluster_labels[cluster_id] = np.bincount(member_labels).argmax()
        return self

    def predict(self, X):
        """Return the majority label of the cluster each row of ``X`` is assigned to."""
        cluster_indexes = self.kmeans.predict(X)
        # Vectorized lookup: one label per predicted cluster index.
        return self.cluster_labels[cluster_indexes]

    def score(self, X, y):
        """Return the fraction of rows in ``X`` whose predicted label equals ``y``."""
        predictions = self.predict(X)
        # Compare by position, independent of any pandas index on y.
        return np.mean(predictions == np.asarray(y).ravel())

In [71]:
# The number of clusters is a tunable hyperparameter; this run uses 20.
classifier = Classifier(n_clusters=20)
classifier.fit(X_train_scaled, y_train)

# Accuracy on the data the model was fitted on.
train_accuracy = classifier.score(X_train_scaled, y_train)
print("Training Accuracy: ", train_accuracy)

# Accuracy on the held-out validation split.
val_accuracy = classifier.score(X_val_scaled, y_val)
print("Validation Accuracy: ", val_accuracy)



Training Accuracy:  0.7349106711733462
Validation Accuracy:  0.7342709104367136


In [72]:
# Score for the other cities
X_val = chengdu_data.drop('PM_HIGH', axis=1)
y_val = chengdu_data['PM_HIGH'].astype(int)
X_val_scaled = scaler.transform(X_val)
print("Chengdu Accuracy: ", classifier.score(X_val_scaled, y_val))

X_val = guangzhou_data.drop('PM_HIGH', axis=1)
y_val = guangzhou_data['PM_HIGH'].astype(int)
X_val_scaled = scaler.transform(X_val)
print("Guangzhou Accuracy: ", classifier.score(X_val_scaled, y_val))

X_val = shanghai_data.drop('PM_HIGH', axis=1)
y_val = shanghai_data['PM_HIGH'].astype(int)
X_val_scaled = scaler.transform(X_val)
print("Shanghai Accuracy: ", classifier.score(X_val_scaled, y_val))

Chengdu Accuracy:  0.7099099099099099
Guangzhou Accuracy:  0.7707100591715976
Shanghai Accuracy:  0.7342709104367136
