In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import average_precision_score, accuracy_score

In [2]:
data = pd.read_csv('credit_data_simulate.csv')

In [3]:
class CreditData:
    def __init__(self, data):
        self.data = data

    def split_data(self, test_size=0.2):
        return train_test_split(self.data, test_size=test_size)
    
    

    def k_fold(self, k=10):
        self.data = shuffle(self.data)
        
        X = self.data.iloc[:, :-1].values
        y = self.data.iloc[:, -1].values
        kf = KFold(n_splits=k, shuffle=False)
        folds = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            folds.append(((X_train, y_train), (X_test, y_test)))
        return folds
        
    def normalize(self, scaler):
        X = self.data.iloc[:, 0:-1]
        y = self.data.iloc[:, -1]
        X = scaler.fit_transform(X)
        return X, y
    
    def knn(self, X_train, y_train, X_test, y_test, k=5, metric='minkowski'):
        knn = KNeighborsClassifier(n_neighbors=k, metric=metric, p=2)
        if metric == 'seuclidean':
            V = np.var(X_train, axis=0)
            knn = KNeighborsClassifier(n_neighbors=k, metric=metric, metric_params={'V': V})
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        y_pred_proba = knn.predict_proba(X_test)[:, 1]  # Get probabilities for the positive class
        cm = confusion_matrix(y_test, y_pred)
        return cm, y_pred_proba

    def plot_confusion_matrix(self, cm):
        plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
        plt.title('Confusion matrix')
        plt.colorbar()
        plt.show()
    
    def get_metrics(self, y_test, y_pred_proba, cm):
        tn, fp, fn, tp = cm.ravel()
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = 2 * (precision * recall) / (precision + recall)
        fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
        roc_auc = auc(fpr, tpr)
        auprc = average_precision_score(y_test, y_pred_proba)
        d_index = np.log2(1+accuracy) + np.log2(1 + ((recall + precision)/2))
        return [accuracy, precision, recall, f1, d_index, roc_auc, fpr, tpr, auprc]



In [None]:
credit_data = CreditData(data)
folds = credit_data.k_fold(k=10)
scalers = [StandardScaler(), MinMaxScaler(), RobustScaler(), MaxAbsScaler()]
for scaler in scalers:
    print(scaler)
    for fold in folds:
        (X_train, y_train), (X_test, y_test) = fold
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        cm, y_pred_proba = credit_data.knn(X_train_scaled, y_train, X_test_scaled, y_test, k=5, metric='correlation')
        credit_data.plot_confusion_matrix(cm)
        metrics = credit_data.get_metrics(y_test, y_pred_proba, cm)
        print(metrics)
    
    # also do train test split
    X_train, X_test, y_train, y_test = credit_data.split_data()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    cm, y_pred_proba = credit_data.knn(X_train_scaled, y_train, X_test_scaled, y_test, k=5, metric='correlation')
    credit_data.plot_confusion_matrix(cm)
    metrics = credit_data.get_metrics(y_test, y_pred_proba, cm)
    print(metrics)


In [5]:
# import data and clean by removing all rows with missing values
data = pd.read_csv('PM2.5_Beijing_2010_2014.csv')
data = data.dropna()

#  save as clean_pm2.5_data.csv
data.to_csv('clean_pm2.5_data.csv', index=False)

In [6]:
# convert cbwd column to numerical values
data = pd.read_csv('clean_pm2.5_data.csv')
data['cbwd'] = data['cbwd'].astype('category')
data['cbwd'] = data['cbwd'].cat.codes
data.to_csv('clean_pm2.5_data.csv', index=False)

In [12]:
# cluster with bisecting kmeans
# split data by year
from sklearn.cluster import KMeans

data = pd.read_csv('clean_pm2.5_data.csv')
data = data.drop(columns=['No'])
data_2010 = data[data['year'] == 2010]
data_2011 = data[data['year'] == 2011]
data_2012 = data[data['year'] == 2012]
data_2013 = data[data['year'] == 2013]
data_2014 = data[data['year'] == 2014]

# cluster pm2.5 for each year

data_2010 = data_2010.drop(columns=['year', 'Is', 'Ir'])
data_2011 = data_2011.drop(columns=['year', 'Is', 'Ir'])
data_2012 = data_2012.drop(columns=['year', 'Is', 'Ir'])
data_2013 = data_2013.drop(columns=['year', 'Is', 'Ir'])
data_2014 = data_2014.drop(columns=['year', 'Is', 'Ir'])


data_2010_scaled = StandardScaler().fit_transform(data_2010)
data_2011_scaled = StandardScaler().fit_transform(data_2011)
data_2012_scaled = StandardScaler().fit_transform(data_2012)
data_2013_scaled = StandardScaler().fit_transform(data_2013)
data_2014_scaled = StandardScaler().fit_transform(data_2014)



In [16]:
data_2010_scaled = pd.DataFrame(data_2010_scaled, columns=data_2010.columns)
data_2011_scaled = pd.DataFrame(data_2011_scaled, columns=data_2011.columns)
data_2012_scaled = pd.DataFrame(data_2012_scaled, columns=data_2012.columns)
data_2013_scaled = pd.DataFrame(data_2013_scaled, columns=data_2013.columns)
data_2014_scaled = pd.DataFrame(data_2014_scaled, columns=data_2014.columns)

In [17]:
# y should be the pm2.5 column
data_2010_y = data_2010_scaled['pm2.5']
data_2011_y = data_2011_scaled['pm2.5']
data_2012_y = data_2012_scaled['pm2.5']
data_2013_y = data_2013_scaled['pm2.5']
data_2014_y = data_2014_scaled['pm2.5']

data_2010_X = data_2010_scaled.drop(columns=['pm2.5'])
data_2011_X = data_2011_scaled.drop(columns=['pm2.5'])
data_2012_X = data_2012_scaled.drop(columns=['pm2.5'])
data_2013_X = data_2013_scaled.drop(columns=['pm2.5'])
data_2014_X = data_2014_scaled.drop(columns=['pm2.5'])



In [30]:
# change y to categorical. should be three classes, low, medium, high based on quantiles
data_2010_y = pd.qcut(data_2010_y, q=3, labels=['low', 'medium', 'high'])
data_2011_y = pd.qcut(data_2011_y, q=3, labels=['low', 'medium', 'high'])
data_2012_y = pd.qcut(data_2012_y, q=3, labels=['low', 'medium', 'high'])
data_2013_y = pd.qcut(data_2013_y, q=3, labels=['low', 'medium', 'high'])
data_2014_y = pd.qcut(data_2014_y, q=3, labels=['low', 'medium', 'high'])

In [34]:
print(data_2010_y.value_counts())

low       2718
medium    2708
high      2665
Name: pm2.5, dtype: int64


In [18]:
data_2010_X = data_2010_X.values
data_2011_X = data_2011_X.values
data_2012_X = data_2012_X.values
data_2013_X = data_2013_X.values
data_2014_X = data_2014_X.values

In [31]:
def do_kmeans(X, y):
    kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
    y_pred = kmeans.predict(X)
    accuracy = accuracy_score(y, y_pred)
    return kmeans, accuracy

In [33]:
kmeans, accuracy = do_kmeans(data_2010_X, data_2010_y)
print(accuracy)

2010
0.0
2011
0.0
2012
0.0
2013
0.0
2014
0.0
