# AQI Prediction Model by Team Visio

## 1. Preparation

### 1.1 Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline

#model
from sklearn.ensemble import RandomForestClassifier

### 1.2 Functions

In [None]:
def precision(label, confusion_matrix):
    """Return the precision of class ``label`` from a confusion matrix.

    Precision is the diagonal entry for ``label`` divided by the sum of
    column ``label``.  This assumes rows are true labels and columns are
    predicted labels (the layout produced by
    ``sklearn.metrics.confusion_matrix``).

    Args:
        label: Integer index of the class.
        confusion_matrix: Square 2-D NumPy array of counts.

    Returns:
        The precision as a float, or ``0.0`` when the column sums to zero
        (the class was never predicted), instead of NaN from a 0/0 division.
    """
    predicted_total = confusion_matrix[:, label].sum()
    if predicted_total == 0:
        # Nothing was predicted as this class: precision is undefined, so
        # report 0.0 rather than dividing by zero.
        return 0.0
    return confusion_matrix[label, label] / predicted_total
    
def recall(label, confusion_matrix):
    """Return the recall of class ``label`` from a confusion matrix.

    Recall is the diagonal entry for ``label`` divided by the sum of
    row ``label``.  This assumes rows are true labels and columns are
    predicted labels (the layout produced by
    ``sklearn.metrics.confusion_matrix``).

    Args:
        label: Integer index of the class.
        confusion_matrix: Square 2-D NumPy array of counts.

    Returns:
        The recall as a float, or ``0.0`` when the row sums to zero
        (no samples of this class), instead of NaN from a 0/0 division.
    """
    actual_total = confusion_matrix[label, :].sum()
    if actual_total == 0:
        # No true samples of this class: recall is undefined, so report
        # 0.0 rather than dividing by zero.
        return 0.0
    return confusion_matrix[label, label] / actual_total

def accuracy(confusion_matrix):
    """Return overall accuracy: the matrix trace divided by the sum of all entries.

    Args:
        confusion_matrix: Square 2-D NumPy array of counts.

    Returns:
        Fraction of samples lying on the diagonal.
    """
    return confusion_matrix.trace() / confusion_matrix.sum()

def output(test_model):
    """Print the AQI category message for a predicted class and return the input.

    Values 0 to 4 are compared in order and the first match selects its
    message; anything else prints the "Hazardous" message.

    Args:
        test_model: Predicted class value (a scalar or a one-element array).

    Returns:
        ``test_model`` unchanged.
    """
    # Messages indexed by class value; classes 0-4 are matched explicitly.
    messages_by_class = (
        'AQI Result is Good, between 0 & 50',
        'AQI Result is Moderate, between 51 & 100',
        'AQI Result is Unhealthy for sensitive group, between 101 & 150',
        'AQI Result is Unhealthy, between 151 & 200',
        'AQI Result is Very Unhealthy, between 201 & 300',
    )
    for class_value, message in enumerate(messages_by_class):
        if test_model == class_value:
            print(message)
            break
    else:
        # No explicit class matched: fall back to the last category.
        print('AQI Result is Hazardous, between 301 & 500')
    return test_model

## 2. Data

### 2.1 Import Data

In [None]:
# Load the dataset from the Excel workbook next to the notebook.
# pd.read_excel already returns a DataFrame, so no extra conversion
# (the former DataFrame.from_records round-trip) is needed.
file = r'Data_For_ML.xlsx'
data = pd.read_excel(file)

#### 2.1.1 Check the data

In [None]:
# Print the column labels of the loaded data.
print(data.columns)

In [None]:
# Show pandas' summary statistics for the loaded data (rendered as the cell output).
data.describe()

### 2.2 Prepare Datasets

In [None]:
# Randomly take 90% of the rows for training; the remaining rows (those whose
# index labels were not sampled) form the test set.
# A fixed seed keeps the split identical from run to run, so results computed
# from it can be compared between runs.
train = data.sample(frac=0.90, random_state=42)
test = data.drop(train.index)

#### 2.2.1 Training Dataset

In [None]:
# Without COVID Situation
# Columns that are neither model inputs nor the target in any of the frames below.
unused_cols = [
    'Created At', 'MM Date_original', 'MM Time', 'MM Date', 'Year', 'Date',
    'PM1.0_CF1_ug/m3', 'PM2.5_CF1_ug/m3', 'PM10.0_CF1_ug/m3',
    'UptimeMinutes', 'RSSI_dbm', 'Temperature_F',
    'PM2.5_ATM_ug/m3', 'PM2.5_AQI Hourly Average',
    'Sensor', 'Pcode', 'Air Quality',
]

# Inputs without COVID Situation: also drop that column and the target.
x_train = train.drop(columns=unused_cols + ['COVID Situation', 'Air Quality No'])

# Inputs with COVID Situation: drop only the target in addition.
xC_train = train.drop(columns=unused_cols + ['Air Quality No'])

# Output: drop every input column, leaving whatever remains as the target frame.
y_train = train.drop(columns=unused_cols + ['Humidity_%', 'Month', 'Hour', 'COVID Situation'])

#### 2.2.2 Testing Dataset

In [None]:
# Without COVID Situation
# Columns that are neither model inputs nor the target in any of the frames below.
unused_cols = [
    'Created At', 'MM Date_original', 'MM Time', 'MM Date', 'Year', 'Date',
    'PM1.0_CF1_ug/m3', 'PM2.5_CF1_ug/m3', 'PM10.0_CF1_ug/m3',
    'UptimeMinutes', 'RSSI_dbm', 'Temperature_F',
    'PM2.5_ATM_ug/m3', 'PM2.5_AQI Hourly Average',
    'Sensor', 'Pcode', 'Air Quality',
]

# Inputs without COVID Situation: also drop that column and the target.
x_test = test.drop(columns=unused_cols + ['COVID Situation', 'Air Quality No'])

# Inputs with COVID Situation: drop only the target in addition.
xC_test = test.drop(columns=unused_cols + ['Air Quality No'])

# Output: drop every input column, leaving whatever remains as the target frame.
y_test = test.drop(columns=unused_cols + ['Humidity_%', 'Month', 'Hour', 'COVID Situation'])

#### 2.2.3 Check the data

In [None]:
# Preview the first rows of the training inputs without COVID Situation.
x_train.head()
# x_test.head()

In [None]:
# Preview the first rows of the training inputs with COVID Situation.
xC_train.head()
# xC_test.head()

In [None]:
# Preview the first rows of the training target frame.
y_train.head()

## 3. Model

### 3.1 Choosing Model

In [None]:
# One random forest per feature set. A fixed random_state means that refitting
# on the same training data yields the same forest and the same predictions.
model = RandomForestClassifier(random_state=42)  # Without COVID Situation
model_C = RandomForestClassifier(random_state=42)  # With COVID Situation

# Alternative estimator (would need `from sklearn.tree import DecisionTreeClassifier`):
#model = DecisionTreeClassifier()
#model_C = DecisionTreeClassifier()

#### 3.1.1 Check the model

In [None]:
# Print the estimator used without COVID Situation.
print(model)

In [None]:
# Print the estimator used with COVID Situation.
print(model_C)

### 3.2 Training Model

In [None]:
# Both models share the same target; flatten the target frame once into the
# 1-D array that fit() receives.
train_labels = np.ravel(y_train, order='C')

# Trailing semicolons suppress the estimator repr in the notebook output.
model.fit(x_train, train_labels);     # Without COVID Situation
model_C.fit(xC_train, train_labels);  # With COVID Situation

### 3.2.1 Without COVID Situation

In [None]:
# Print the fitted estimator for the feature set without COVID Situation.
print(model)

#### 3.2.1.1 Predict the result

In [None]:
# Predictions of the model without COVID Situation on the test inputs (y)
# and on the training inputs (y_t).
y = model.predict(x_test)
y_t = model.predict(x_train)

#### 3.2.1.2 Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of test targets versus predictions (model without COVID Situation).
mat = confusion_matrix(y_test, y)
# Plot the transposed matrix so that, per the axis labels below, the x axis
# is the true label and the y axis is the predicted label.
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');

#### 3.2.1.3 Accuracy

In [None]:
# Overall accuracy (trace / total) of the model without COVID Situation on the test set.
accuracy(mat)

In [None]:
# for i in range(5):
#     print(precision(i,mat))

### 3.2.2 With COVID Situation

In [None]:
# Print the fitted estimator for the feature set with COVID Situation
# (this section is about model_C, not model).
print(model_C)

#### 3.2.2.1 Predict the result

In [None]:
# Predictions of the model with COVID Situation on the test inputs (yC)
# and on the training inputs (yC_t).
yC = model_C.predict(xC_test)
yC_t = model_C.predict(xC_train)

#### 3.2.2.2 Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of test targets versus predictions (model with COVID Situation).
matC = confusion_matrix(y_test, yC)
# Plot the transposed matrix so that, per the axis labels below, the x axis
# is the true label and the y axis is the predicted label.
sns.heatmap(matC.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');

#### 3.2.2.3 Accuracy

In [None]:
# Overall accuracy (trace / total) of the model with COVID Situation on the test set.
accuracy(matC)

In [None]:
# for i in range(5):
#     print(precision(i,matC))

### 3.3 Testing Model

In [None]:
# Single hand-written sample for each model, as a one-row 2-D list.
# NOTE(review): the value order is assumed to match the column order of
# x_train / xC_train - confirm against the spreadsheet's column order.
test_data = [[79, 11, 6]]  #[Humidity,Month,Hour]
test_data_C = [[79, 11, 6, 5]]  #[Humidity,Month,Hour,COVID Situation]

#### 3.3.1 Without COVID Situation

In [None]:
# Predict the class of the hand-written sample with the model without COVID Situation.
test_model = model.predict(test_data)

In [None]:
# Print the AQI category message for the prediction; the returned value is shown as the cell output.
output(test_model)

#### 3.3.2 With COVID Situation

In [None]:
# Predict the class of the hand-written sample with the model with COVID Situation.
test_model_C = model_C.predict(test_data_C)

In [None]:
# Print the AQI category message for the prediction; the returned value is shown as the cell output.
output(test_model_C)

## 4. Prediction


In [None]:
# Status text for each predicted class value, indexed by that value.
AQI_status = [
    'Air Quality is Good',
    'Air Quality is Moderate',
    'Air Quality is Unhealthy for sensitive group',
    'Air Quality is Unhealthy',
    'Air Quality is Very Unhealthy',
    'Air Quality is Hazardous',
]
a = 0

# Empty table that the prediction sweep fills in.
result_columns = ['Hour', 'Humidity', 'AQI Status']
df = pd.DataFrame(columns=result_columns)

In [None]:
# Sweep hours 0-23 against humidity values 0-99, with the second feature
# fixed at 12 and the fourth at 4, and record model_C's predicted class.
# Feature order follows test_data_C: [Humidity, Month, Hour, COVID Situation].
samples = np.zeros((24, 100))
rows = []
for i in range(24):
    # Predict all 100 humidity values for this hour in a single call.
    hour_inputs = [[j, 12, i, 4] for j in range(100)]
    hour_predictions = model_C.predict(hour_inputs)
    samples[i, :] = hour_predictions
    for j, p in enumerate(hour_predictions):
        p = p.item()
        print(AQI_status[p])
        # NOTE(review): Hour and Humidity are stored as one-element lists,
        # exactly as before, so they appear as "[0]", "[1]", ... when the
        # frame is written out; use plain i / j if scalars are wanted.
        rows.append({'Hour': [i], 'Humidity': [j], 'AQI Status': AQI_status[p]})

# DataFrame.append was removed in pandas 2.0 (and copied the whole frame on
# every call); collect the rows first and concatenate once instead.
df = pd.concat([df, pd.DataFrame(rows, columns=df.columns)], ignore_index=True)

In [None]:
# Write the prediction table to VISIO_01.csv without the index column.
df.to_csv('VISIO_01.csv', index=False)