In [301]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import average_precision_score, accuracy_score
from scipy.stats import mode

In [329]:
data = pd.read_csv('credit_data_simulate.csv')

In [330]:
class CreditData:
    def __init__(self, data):
        self.data = data

    def split_data(self, test_size=0.2):
        return train_test_split(self.data, test_size=test_size)

    def k_fold(self, k=10):
        self.data = shuffle(self.data)
        
        X = self.data.iloc[:, :-1].values
        y = self.data.iloc[:, -1].values
        kf = KFold(n_splits=k, shuffle=False)
        folds = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            folds.append(((X_train, y_train), (X_test, y_test)))
        return folds
        
    def normalize(self, scaler):
        X = self.data.iloc[:, 0:-1]
        y = self.data.iloc[:, -1]
        X = scaler.fit_transform(X)
        return X, y
    
    def knn(self, X_train, y_train, X_test, y_test, k=5, metric='minkowski'):
        knn = KNeighborsClassifier(n_neighbors=k, metric=metric, p=2)
        if metric == 'seuclidean':
            V = np.var(X_train, axis=0)
            knn = KNeighborsClassifier(n_neighbors=k, metric=metric, metric_params={'V': V})
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        y_pred_proba = knn.predict_proba(X_test)[:, 1]  # Get probabilities for the positive class
        cm = confusion_matrix(y_test, y_pred)
        return cm, y_pred_proba

    def plot_confusion_matrix(self, cm):
        plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
        plt.title('Confusion matrix')
        plt.colorbar()
        plt.show()
    
    def get_metrics(self, y_test, y_pred_proba, cm):
        tn, fp, fn, tp = cm.ravel()
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = 2 * (precision * recall) / (precision + recall)
        fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
        roc_auc = auc(fpr, tpr)
        auprc = average_precision_score(y_test, y_pred_proba)
        d_index = np.log2(1+accuracy) + np.log2(1 + ((recall + precision)/2))
        return [accuracy, precision, recall, f1, d_index, roc_auc, fpr, tpr, auprc]



In [None]:
credit_data = CreditData(data)
folds = credit_data.k_fold(k=10)
scalers = [StandardScaler(), MinMaxScaler(), RobustScaler(), MaxAbsScaler()]
for scaler in scalers:
    print(scaler)
    for fold in folds:
        (X_train, y_train), (X_test, y_test) = fold
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        cm, y_pred_proba = credit_data.knn(X_train_scaled, y_train, X_test_scaled, y_test, k=5, metric='seuclidean')
        credit_data.plot_confusion_matrix(cm)
        metrics = credit_data.get_metrics(y_test, y_pred_proba, cm)
        print("accuracy:", metrics[0])
        print("precision:", metrics[1])
        print("recall:", metrics[2])
        print("f1:", metrics[3])
        print("d_index:", metrics[4])
        print("roc_auc:", metrics[5])
        print("auprc:", metrics[8])
        plt.plot(metrics[6], metrics[7])
        plt.show()


In [273]:
# import data and clean by removing all rows with missing values
data = pd.read_csv('PM2.5_Beijing_2010_2014.csv')
data = data.dropna()

#  save as clean_pm2.5_data.csv
data.to_csv('clean_pm2.5_data.csv', index=False)

In [274]:
# convert cbwd column to numerical values
data = pd.read_csv('clean_pm2.5_data.csv')
data['cbwd'] = data['cbwd'].astype('category')
data['cbwd'] = data['cbwd'].cat.codes
data = data.drop(columns=['No', 'Is', 'Ir'])
data['target'] = pd.qcut(data['pm2.5'], q=3, labels=['low', 'medium', 'high'])
data = data.drop(columns=['pm2.5'])
data.to_csv('clean_pm2.5_data.csv', index=False)



In [275]:
data = pd.read_csv('clean_pm2.5_data.csv')

data_year = data.groupby('year')
data_2010 = data_year.get_group(2010)
data_2011 = data_year.get_group(2011)
data_2012 = data_year.get_group(2012)
data_2013 = data_year.get_group(2013)
data_2014 = data_year.get_group(2014)

data_2010_y = data_2010['target']
data_2011_y = data_2011['target']
data_2012_y = data_2012['target']
data_2013_y = data_2013['target']
data_2014_y = data_2014['target']

data_2010_X = data_2010.drop(columns=['year', 'target'])
data_2011_X = data_2011.drop(columns=['year', 'target'])
data_2012_X = data_2012.drop(columns=['year', 'target'])
data_2013_X = data_2013.drop(columns=['year', 'target'])
data_2014_X = data_2014.drop(columns=['year', 'target'])

In [276]:
#  scale data
scaler = MinMaxScaler()
data_2010 = scaler.fit_transform(data_2010_X)
data_2011 = scaler.fit_transform(data_2011_X)
data_2012 = scaler.fit_transform(data_2012_X)
data_2013 = scaler.fit_transform(data_2013_X)
data_2014 = scaler.fit_transform(data_2014_X)

In [277]:

columns = ['month', 'day', 'hour', 'DEWP', 'TEMP', 'PRES', 'cbwd', 'Iws']
data_2010 = pd.DataFrame(data_2010, columns=columns)
data_2011 = pd.DataFrame(data_2011, columns=columns)
data_2012 = pd.DataFrame(data_2012, columns=columns)
data_2013 = pd.DataFrame(data_2013, columns=columns)
data_2014 = pd.DataFrame(data_2014, columns=columns)


In [278]:
# write to new file
with open('data_2010.csv', 'w') as f:
    data_2010.to_csv(f, index=False)
with open('data_2011.csv', 'w') as f:
    data_2011.to_csv(f, index=False)
with open('data_2012.csv', 'w') as f:
    data_2012.to_csv(f, index=False)
with open('data_2013.csv', 'w') as f:
    data_2013.to_csv(f, index=False)
with open('data_2014.csv', 'w') as f:
    data_2014.to_csv(f, index=False)

In [279]:
data_2010_X = data_2010_X.values
data_2011_X = data_2011_X.values
data_2012_X = data_2012_X.values
data_2013_X = data_2013_X.values
data_2014_X = data_2014_X.values

In [292]:
def resolve_masking(y_pred, y_true):
    pred_labels = np.zeros_like(y_true)
    for i in range(len(np.unique(y_pred))):
        mask = (y_pred == i)
        pred_labels[mask] = mode(y_true[mask])[0]
    return pred_labels

In [296]:
def do_kmeans(X, y):
    # turn y into 0, 1, 2
    y = y.astype('category')
    y_numeric = y.cat.codes
    kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
    y_pred = kmeans.predict(X)
    y_pred = resolve_masking(y_pred, y_numeric)
    accuracy = accuracy_score(y_numeric, y_pred)
    return y_numeric, y_pred, accuracy



In [294]:
# plot the clusters
def plot_clusters(X, y, kmeans):
    y = y.astype('category')
    y_numeric = y.cat.codes

    plt.scatter(X[:, 0], X[:, 1], c=y_numeric)
    plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='red')
    plt.show()

In [327]:
y, y_pred, accuracy = do_kmeans(data_2010_X, data_2010_y)
print(accuracy)

y, y_pred, accuracy = do_kmeans(data_2011_X, data_2011_y)
print(accuracy)

y, y_pred, accuracy = do_kmeans(data_2012_X, data_2012_y)
print(accuracy)

y, y_pred, accuracy = do_kmeans(data_2013_X, data_2013_y)
print(accuracy)

y, y_pred, accuracy = do_kmeans(data_2014_X, data_2014_y)
print(accuracy)

0.41947843282659747
0.3930527888446215
0.37371910789632307
0.3733579165706384
0.3747835122965016


  pred_labels[mask] = mode(y_true[mask])[0]
  pred_labels[mask] = mode(y_true[mask])[0]
  pred_labels[mask] = mode(y_true[mask])[0]
  pred_labels[mask] = mode(y_true[mask])[0]
  pred_labels[mask] = mode(y_true[mask])[0]


In [339]:
data = pd.read_csv('PM2.5_Beijing_2010_2014.csv')
data = data.dropna()
data = data.drop(columns=['No', 'Is', 'Ir'])
# samples with pm2.5 values less than 100 are low, otherwise high
data['target'] = np.where(data['pm2.5'] < 100, 'low', 'high')
data = data.drop(columns=['pm2.5'])
# convert to numerical values, not category
data['cbwd'] = data['cbwd'].astype('category')
data['cbwd'] = data['cbwd'].cat.codes
data.to_csv('clean_pm2.5_data2.csv', index=False)



In [343]:
class PM25Data:
    def __init__(self, data):
        self.data = data

    def split_data(self, test_size=0.2):
        # use year 2010 - 2013 as training data and 2014 as test data
        X_train = self.data[self.data['year'] < 2014].drop(columns=['year', 'target'])
        y_train = self.data[self.data['year'] < 2014]['target']
        X_test = self.data[self.data['year'] == 2014].drop(columns=['year', 'target'])
        y_test = self.data[self.data['year'] == 2014]['target']
        return X_train, y_train, X_test, y_test
    
    def do_knn(self, X_train, y_train, X_test, y_test, k=5, metric='minkowski'):
        knn = KNeighborsClassifier(n_neighbors=k, metric=metric, p=2)
        if metric == 'seuclidean':
            V = np.var(X_train, axis=0)
            knn = KNeighborsClassifier(n_neighbors=k, metric=metric, metric_params={'V': V})
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        y_pred_proba = knn.predict_proba(X_test)[:, 1]
        cm = confusion_matrix(y_test, y_pred)
        return cm, y_pred_proba
    
    def plot_confusion_matrix(self, cm):
        plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
        plt.title('Confusion matrix')
        plt.colorbar()
        plt.show()
    

In [None]:
# do knn on the data
data = pd.read_csv('clean_pm2.5_data2.csv')

data['cbwd'] = data['cbwd'].astype('category')


pm25_data = PM25Data(data)
X_train, y_train, X_test, y_test = pm25_data.split_data()
cm, y_pred_proba = pm25_data.do_knn(X_train, y_train, X_test, y_test, k=5, metric='euclidean')
pm25_data.plot_confusion_matrix(cm)
cm, y_pred_proba = pm25_data.do_knn(X_train, y_train, X_test, y_test, k=5, metric='correlation')
pm25_data.plot_confusion_matrix(cm)
cm, y_pred_proba = pm25_data.do_knn(X_train, y_train, X_test, y_test, k=5, metric='cosine')
pm25_data.plot_confusion_matrix(cm)
cm, y_pred_proba = pm25_data.do_knn(X_train, y_train, X_test, y_test, k=5, metric='chebyshev')
pm25_data.plot_confusion_matrix(cm)
cm, y_pred_proba = pm25_data.do_knn(X_train, y_train, X_test, y_test, k=5, metric='seuclidean')
pm25_data.plot_confusion_matrix(cm)

