# Attribute Information:

1. Date (DD/MM/YYYY)
2. Time (HH.MM.SS)
3. True hourly averaged concentration CO in mg/m^3 (reference analyzer)
4. PT08.S1 (tin oxide) hourly averaged sensor response (nominally CO targeted)
5. True hourly averaged overall Non-Methanic Hydrocarbons (NMHC) concentration in microg/m^3 (reference analyzer)
6. True hourly averaged Benzene concentration in microg/m^3 (reference analyzer)
7. PT08.S2 (titania) hourly averaged sensor response (nominally NMHC targeted)
8. True hourly averaged NOx concentration in ppb (reference analyzer)
9. PT08.S3 (tungsten oxide) hourly averaged sensor response (nominally NOx targeted)
10. True hourly averaged NO2 concentration in microg/m^3 (reference analyzer)
11. PT08.S4 (tungsten oxide) hourly averaged sensor response (nominally NO2 targeted)
12. PT08.S5 (indium oxide) hourly averaged sensor response (nominally O3 targeted)
13. Temperature in °C
14. Relative Humidity (%)
15. Absolute Humidity (AH)



In [0]:
# Copy the dataset CSV from drive/My Drive/Mini_Project_2020 into the current working directory (IPython shell escape)
!cp drive/My\ Drive/Mini_Project_2020/AirQualityUCI.csv ./
#!ls

In [0]:
# Import the libraries used throughout the notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics

In [0]:
# Read the provided dataset into a DataFrame.
# The file is semicolon-delimited and writes decimals with a comma
# (e.g. "2,6"), so both are declared here to get numeric columns directly.
csv_filename = "AirQualityUCI.csv"

df = pd.read_csv(
    csv_filename,
    sep=";",
    decimal=",",
    parse_dates=['Date', 'Time'],
    # 'Date' is stored as DD/MM/YYYY; without dayfirst an ambiguous value
    # such as 10/03/2004 may be read month-first.
    dayfirst=True,
)

In [0]:
# Preview the first rows of the freshly loaded frame
df.head()

In [0]:
# Discard columns, then rows, that contain nothing but missing values
df.dropna(axis="columns", how="all", inplace=True)
df.dropna(axis="index", how="all", inplace=True)

In [0]:
# (rows, columns) remaining after the all-NaN rows/columns were dropped
df.shape

In [0]:
# Inspect the last rows of the frame
df.tail()

In [0]:
# Every column after the first two (Date, Time) is a measurement column
cols = df.columns[2:].tolist()

In [0]:
# Convert measurement columns that are still text into floats.
# Values written with a decimal comma ("2,6") are loaded as strings, so the
# comma is swapped for a dot and the whole column is cast in one step.
for col in cols:
    # Columns that are already numeric (float or int) need no conversion.
    if not pd.api.types.is_numeric_dtype(df[col]):
        # The vectorised conversion keeps the frame's own row index, so each
        # value stays on its row even when the index is not a clean 0..n-1.
        df[col] = df[col].str.replace(',', '.', regex=False).astype(float)

df.head()

In [0]:
# Model inputs: every column except the timestamp parts and the target
features = df.columns.tolist()
for excluded in ('Date', 'Time', 'C6H6(GT)'):
    features.remove(excluded)

In [0]:
# Feature matrix and regression target (benzene concentration column)
X, y = df[features], df['C6H6(GT)']

In [0]:
# Hold out 40% of the rows for testing; the remaining 60% are used for training
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,  # fixed seed so the split is the same on every run
)

In [0]:
# Feature scaling
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training set only, then apply the
# same transformation to both the training and the test features.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [0]:
# Applying PCA
# from sklearn.decomposition import PCA
# pca = PCA(n_components = 3)
# X_train = pca.fit_transform(X_train)
# X_test = pca.transform(X_test)
# explained_variance = pca.explained_variance_ratio_
# print(explained_variance)

In [0]:
# Sanity check: dimensions of the training features and the training target
print(X_train.shape, y_train.shape)
# Show the training feature values as they stand after the scaling step
print(X_train)

In [0]:
# Decision tree regression: fit a shallow tree and report its error on the
# training and test sets.
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# random_state is fixed (like the train/test split) so repeated runs build
# the same tree and print the same scores.
tree = DecisionTreeRegressor(max_depth=3, random_state=0)
tree.fit(X_train, y_train)

y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)

# Compute each MSE once and reuse it for the RMSE line.
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)

print(f'MSE train: {mse_train:.3f}, test: {mse_test:.3f}')
print(f'RMSE train: {np.sqrt(mse_train):.3f}, test: {np.sqrt(mse_test):.3f}')
print(f'R^2 train: {r2_score(y_train, y_train_pred):.3f}, '
      f'test: {r2_score(y_test, y_test_pred):.3f}')