<a href="https://colab.research.google.com/github/minson18/PM2.5-Predict/blob/main/PM2.5_predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

from google.colab import drive

# Mount Google Drive inside the Colab runtime, then switch the working
# directory to the homework folder so that the relative CSV paths used by
# the later cells resolve against it.
DRIVE_MOUNT_POINT = '/content/drive'
PROJECT_ROOT = '/content/drive/MyDrive/Data Mining/HW1'
drive.mount(DRIVE_MOUNT_POINT)
os.chdir(PROJECT_ROOT)

Mounted at /content/drive


In [None]:
# CSV inputs, given as paths relative to the current working directory.
TRAIN_PATH = "train.csv"  # loaded with pd.read_csv to build the training set
TEST_PATH = "test_X.csv"  # loaded with pd.read_csv (header=None) to build the test set

In [None]:
import numpy as np
import pandas as pd
import math
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score
import random
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:
# Load the training CSV, discard the three descriptive columns, and coerce
# every remaining cell to a number. Cells that are missing or cannot be
# parsed as numbers are replaced by the sentinel -1.
data_df = pd.read_csv(TRAIN_PATH).drop(
    columns=['Location             ', 'Date          ', 'ItemName                 '])
for column_name in data_df.columns:
  numeric_column = pd.to_numeric(data_df[column_name], downcast="float", errors='coerce')
  data_df[column_name] = numeric_column.fillna(-1)
raw_data = data_df.to_numpy()

In [None]:
def transform(raw_data, rows_per_day=18):
  """Stack per-day blocks of ``raw_data`` into one long table.

  ``raw_data`` is read in consecutive groups of ``rows_per_day`` rows. Each
  group is transposed (its columns become rows) and the transposed groups
  are stacked vertically in their original order. Per the original author's
  note, the result has one row per consecutive hour and one column per air
  pollution index.

  Args:
    raw_data: 2-D array. Trailing rows that do not fill a whole group are
      ignored.
    rows_per_day: Number of consecutive input rows that form one group.
      Defaults to 18, the value the original code hard-coded.

  Returns:
    A new 2-D array with ``rows_per_day`` columns and one row for every
    (group, input column) pair.
  """
  days = raw_data.shape[0] // rows_per_day
  # The first group is always taken, even when the input is shorter than one
  # full group; this mirrors the behaviour of the original implementation.
  blocks = [raw_data[0:rows_per_day, :].T]
  blocks.extend(
      raw_data[rows_per_day*i:rows_per_day*(i+1), :].T for i in range(1, days))
  # Concatenate once at the end: growing the array inside the loop recopies
  # everything accumulated so far on every iteration.
  return np.concatenate(blocks, axis=0)

In [None]:
def clean(a, invalid=-1):
  """Fill invalid cells with the value from the next row of the same column.

  Rows are processed from bottom to top, so a run of consecutive invalid
  cells is filled with the first valid value that follows the run. The last
  row has no following row; an invalid cell there takes the nearest earlier
  valid value of its column instead. A column with no valid value at all is
  left unchanged.

  Args:
    a: 2-D numpy array. It is modified in place.
    invalid: Marker value identifying an invalid cell (default -1).

  Returns:
    The same array object ``a``, after filling.
  """
  n_rows = a.shape[0]
  if n_rows == 0:
    return a

  last = n_rows - 1
  # Repair the last row first: it cannot look one row ahead, and every row
  # above it relies on it already being valid.
  for j in range(a.shape[1]):
    if a[last][j] == invalid:
      valid_rows = np.flatnonzero(a[:, j] != invalid)
      if valid_rows.size:
        a[last][j] = a[valid_rows[-1]][j]

  # Walk upwards so that row i+1 is already repaired when row i reads it.
  for i in reversed(range(last)):
    missing = a[i] == invalid
    a[i][missing] = a[i + 1][missing]

  return a

In [None]:
def normalize(a):
  """Standardize ``a`` column by column to zero mean and unit variance.

  Mean and (population) standard deviation are computed along axis 0 in
  float64. A column whose standard deviation is zero is only centred: it is
  divided by 1 instead of 0, so a constant column becomes all zeros rather
  than NaN.

  Args:
    a: Array to standardize; statistics are taken over axis 0.

  Returns:
    A new array ``(a - mean) / std`` with the same shape as ``a``.
  """
  mean = np.mean(a, axis=0, dtype=np.float64)
  std = np.std(a, axis=0, dtype=np.float64)
  # Guard against division by zero for constant columns.
  safe_std = np.where(std == 0, 1.0, std)
  return (a - mean) / safe_std

In [None]:
# Reshape the raw training table with transform(), then let clean() fill
# the cells that were marked -1 while loading.
data = clean(transform(raw_data))

In [None]:
# Build the training set with a sliding window: for every row index from 9
# onwards, the 9 preceding rows (all columns, flattened into one vector) are
# the features and column 9 of the current row is the target (PM2.5
# according to the original note).
target_rows = range(9, data.shape[0])
X = np.array([data[row-9:row, :].reshape(-1) for row in target_rows])
y = np.array([[data[row, 9]] for row in target_rows])
print(X.shape)
print(y.shape)
# Standardize the features column-wise, then append a constant column of
# ones that serves as the bias term of the linear model.
X = np.concatenate([normalize(X), np.ones((len(X), 1))], axis=1)

(5751, 162)
(5751, 1)


In [None]:
# Load the test CSV (it has no header row). The distinct values of column 0
# are kept as `indices`; they are later paired with the predictions in the
# output file. Columns 0 and 1 are then removed and every remaining cell is
# coerced to a number, with missing or unparsable cells replaced by 0.
# NOTE(review): the training loader fills such cells with -1, which clean()
# then repairs; the 0 used here is not touched by clean(). Confirm this
# difference is intended.
test_df = pd.read_csv(TEST_PATH, header=None)
indices = test_df[0].unique()
test_df = test_df.drop(columns=[0, 1])
for column_name in test_df.columns:
  numeric_column = pd.to_numeric(test_df[column_name], downcast="float", errors='coerce')
  test_df[column_name] = numeric_column.fillna(0)
raw_test = test_df.to_numpy()

In [None]:
# Apply the same reshape-and-fill steps to the test table as were applied
# to the training table.
test = clean(transform(raw_test))

In [None]:
# Build the test features: the test table is cut into consecutive,
# non-overlapping windows of 9 rows, and each window (all columns) is
# flattened into one feature vector. Rows left over after the last full
# window are not used.
n_windows = test.shape[0] // 9
X_test = np.array(
    [test[start:start+9, :].reshape(-1) for start in range(0, 9*n_windows, 9)])
print(X_test.shape)
# Standardize column-wise and append the constant bias column of ones.
# NOTE(review): normalize() uses the mean/std of X_test itself, not the
# statistics of the training features. Confirm this is intended.
X_test = np.concatenate(
    [normalize(X_test), np.ones(shape=(len(X_test), 1))], axis=1)

(244, 162)


In [None]:
def feature_select(X, threshold=0.5, target=None):
  """Return the column indices of ``X`` that correlate strongly with the target.

  A column is selected when the absolute value of its Pearson correlation
  coefficient with the target is strictly greater than ``threshold``.
  Columns for which the coefficient is undefined (for example constant
  columns, whose variance is zero) are never selected.

  Args:
    X: 2-D feature array, one column per feature.
    threshold: Minimum absolute correlation (exclusive) for selection.
    target: Target values, any shape that flattens to one value per row of
      ``X``. When omitted, the module-level ``y`` is used, which is what the
      original implementation always did.

  Returns:
    List of selected column indices in increasing order.
  """
  if target is None:
    target = y  # backward-compatible default: the global training target
  target = np.asarray(target).reshape(-1)

  high_corr = []
  # A zero-variance column makes corrcoef divide by zero and yield NaN plus
  # a RuntimeWarning. NaN never exceeds the threshold, so such columns are
  # skipped correctly; only the warning noise is suppressed here.
  with np.errstate(divide='ignore', invalid='ignore'):
    for i in range(X.shape[1]):
      corr = np.corrcoef(X[:, i], target)[0][1]
      if abs(corr) > threshold:
        high_corr.append(i)

  return high_corr

[8, 9, 26, 27, 44, 45, 62, 63, 80, 81, 92, 98, 99, 110, 116, 117, 128, 134, 135, 146, 147, 149, 152, 153, 157]


  c /= stddev[:, None]
  c /= stddev[None, :]


In [None]:
# Pick the feature columns that pass feature_select()'s default threshold
# and take the same columns from both the training and the test matrices.
high_corr = feature_select(X)
part_X, part_X_test = X[:, high_corr], X_test[:, high_corr]

In [None]:
def partial_derivative(X_batch, y_batch, m_stat):
  """Gradient of the mean squared error with respect to the weights.

  With predictions ``X_batch . m_stat`` and ``n = len(y_batch)``, the result
  is ``(-2/n) * X_batch.T . (y_batch - predictions)``.

  Args:
    X_batch: Feature rows of the batch.
    y_batch: Target values of the batch.
    m_stat: Current weights.

  Returns:
    The gradient as a 2-D array with one row per weight.
  """
  residual = y_batch - np.dot(X_batch, m_stat)
  scale = -2 / len(y_batch)
  gradient = scale * np.dot(X_batch.T, residual)
  return gradient.reshape(len(gradient), -1)

In [None]:
def MSE(X, y, m_stat):
  """Mean squared error of the linear predictions ``X . m_stat`` against ``y``.

  The squared differences are summed over all elements and divided by
  ``len(y)``.
  """
  errors = np.dot(X, m_stat) - y
  return np.sum(errors ** 2) / len(y)

In [None]:
def training(X, y, batch_size, lr, epochs, reg_para=0):
  """Fit linear-regression weights with mini-batch gradient descent.

  Weights start as uniform random values in [0, 1). In every epoch the
  samples are reshuffled and split into full batches of ``batch_size``;
  samples beyond the last full batch are not used in that epoch. Within an
  epoch the gradients of the batches seen so far are summed, and each step
  applies that running sum (not only the current batch's gradient), plus
  the gradient of the L2 penalty ``reg_para * ||m||^2``. The MSE over all
  samples is printed after each epoch.

  Args:
    X: 2-D feature array, one row per sample.
    y: Targets, one row per sample, aligned with ``X``.
    batch_size: Number of samples per batch.
    lr: Learning rate.
    epochs: Number of passes over the data; 0 returns the initial weights.
    reg_para: L2 regularization strength (0 disables it).

  Returns:
    Weight array of shape ``(X.shape[1], 1)``.
  """
  # Initialize before the loop so that epochs == 0 still returns weights
  # instead of failing on an unbound name.
  m_stat = np.random.rand(X.shape[1], 1)

  for epoch in range(epochs):
    # Visit the samples in a new random order every epoch.
    order = np.arange(X.shape[0])
    np.random.shuffle(order)
    X = X[order]
    y = y[order]

    # Running sum of the batch gradients within this epoch.
    cumulative_derivative = np.zeros((X.shape[1], 1))
    for batch in range(len(X)//batch_size):
      start = batch*batch_size
      stop = start + batch_size

      X_batch = X[start:stop]
      y_batch = y[start:stop]

      cumulative_derivative = cumulative_derivative + partial_derivative(X_batch, y_batch, m_stat)

      # The derivative of reg_para * m**2 with respect to m is
      # 2 * reg_para * m (the original used m**2 here).
      m_stat = m_stat - lr*(cumulative_derivative + 2*reg_para*m_stat)

    print(f"epoch: {epoch} ----> MSE: {MSE(X, y, m_stat)}")

  return m_stat

In [None]:
# Hyper-parameters of the gradient-descent run. Regularization stays at the
# default of training() (reg_para=0); the original cell carried a
# commented-out setting of reg_para = 0.001.
batch_size, lr, epochs = 64, 0.00001, 1000

m_stat = training(X, y, batch_size=batch_size, lr=lr, epochs=epochs)

epoch: 0 ----> MSE: 521.4429739800258
epoch: 1 ----> MSE: 415.26557466446644
epoch: 2 ----> MSE: 283.37885901132574
epoch: 3 ----> MSE: 228.08387872154233
epoch: 4 ----> MSE: 190.7583037161752
epoch: 5 ----> MSE: 155.27458762932028
epoch: 6 ----> MSE: 131.87089205746088
epoch: 7 ----> MSE: 107.67204678442202
epoch: 8 ----> MSE: 96.80858796677154
epoch: 9 ----> MSE: 84.85687633480414
epoch: 10 ----> MSE: 76.06573483665208
epoch: 11 ----> MSE: 68.25481866978633
epoch: 12 ----> MSE: 60.16769336070729
epoch: 13 ----> MSE: 55.23317598373684
epoch: 14 ----> MSE: 50.797606432190484
epoch: 15 ----> MSE: 44.946488167750886
epoch: 16 ----> MSE: 39.689006028749446
epoch: 17 ----> MSE: 36.17216834190945
epoch: 18 ----> MSE: 32.934674493787355
epoch: 19 ----> MSE: 30.007260068020795
epoch: 20 ----> MSE: 27.822865260151296
epoch: 21 ----> MSE: 25.819584403751612
epoch: 22 ----> MSE: 24.203824955677913
epoch: 23 ----> MSE: 23.44864131243009
epoch: 24 ----> MSE: 22.177321878182454
epoch: 25 ----> MSE:

In [None]:
# Predict with the learned weights, then replace every negative prediction
# by 0.
raw_predictions = X_test @ m_stat
y_preds = np.where(raw_predictions < 0, 0, raw_predictions)

In [None]:
# Pair every test index with its predicted value: one [index, answer] row
# per entry of `indices`.
predictions = [[index, y_preds[k][0]] for k, index in enumerate(indices)]

In [None]:
# Write the header and the [index, answer] rows to the output CSV. The file
# is opened in a `with` block so it is flushed and closed as soon as the
# rows are written. The original opened it twice without closing either
# handle, as a workaround for output not reliably reaching disk.
with open('109550058.csv', 'w', newline='') as submission_file:
  csv_writer = csv.writer(submission_file)
  csv_writer.writerow(["index", "answer"])
  csv_writer.writerows(predictions)