In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import export_graphviz
import pydot
import matplotlib.pyplot as plt
import datetime

In [32]:
# Load the pollution data set from CSV (the path is relative to the notebook's
# working directory) and preview the first five rows.
df = pd.read_csv('../Summer_2019_Projects/pollution.csv')
df.head()

Unnamed: 0,No,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
0,1,2010,1,1,0,,-21,-11.0,1021.0,NW,1.79,0,0
1,2,2010,1,1,1,,-21,-12.0,1020.0,NW,4.92,0,0
2,3,2010,1,1,2,,-21,-11.0,1019.0,NW,6.71,0,0
3,4,2010,1,1,3,,-21,-14.0,1019.0,NW,9.84,0,0
4,5,2010,1,1,4,,-20,-12.0,1018.0,NW,12.97,0,0


In [33]:
# The dimensions of the data:
# 43824 rows (observations)
# 13 columns (the TEMP target plus 12 other columns)
shape = df.shape
print(shape)

(43824, 13)


In [34]:
# Summary statistics for the numeric columns.
# Note: the pm2.5 count (41757) is lower than the row count (43824),
# so that column contains missing values.
df.describe()

Unnamed: 0,No,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,Iws,Is,Ir
count,43824.0,43824.0,43824.0,43824.0,43824.0,41757.0,43824.0,43824.0,43824.0,43824.0,43824.0,43824.0
mean,21912.5,2012.0,6.523549,15.72782,11.5,98.613215,1.817246,12.448521,1016.447654,23.88914,0.052734,0.194916
std,12651.043435,1.413842,3.448572,8.799425,6.922266,92.050387,14.43344,12.198613,10.268698,50.010635,0.760375,1.415867
min,1.0,2010.0,1.0,1.0,0.0,0.0,-40.0,-19.0,991.0,0.45,0.0,0.0
25%,10956.75,2011.0,4.0,8.0,5.75,29.0,-10.0,2.0,1008.0,1.79,0.0,0.0
50%,21912.5,2012.0,7.0,16.0,11.5,72.0,2.0,14.0,1016.0,5.37,0.0,0.0
75%,32868.25,2013.0,10.0,23.0,17.25,137.0,15.0,23.0,1025.0,21.91,0.0,0.0
max,43824.0,2014.0,12.0,31.0,23.0,994.0,28.0,42.0,1046.0,585.6,27.0,36.0


In [35]:
# The date parts (year, month, day, hour) are already separate numeric columns,
# so they need no further encoding.
# Split the data into the target (label) -- the value we are trying to
# predict, which in this problem is the temperature (TEMP) -- and the features.
labels = np.array(df['TEMP'])
# cbwd holds text values (e.g. 'NW'); it is set aside instead of being encoded.
garbage = np.array(df['cbwd'])
# Remove both non-feature columns in a single call.
df = df.drop(columns=['cbwd', 'TEMP'])
# Column names in the same order as the columns of the feature array below.
features_list = df.columns.tolist()
# NOTE(review): from here on `df` is a NumPy array, not a DataFrame, so this
# cell cannot be re-run without reloading the data first.
# NOTE(review): pm2.5 still contains missing values at this point -- confirm
# the downstream model can handle NaN, or impute/drop them.
df = np.array(df)

In [36]:
# Set up the data for training and for testing.
# We split the data into a training set and a held-out test set so that the
# model can be evaluated on rows it did not see during training.
# The rows are shuffled before splitting, so both sets draw from the whole data set.
# The split is randomly seeded (random_state = 42) so the results are reproducible.

# X_TRAIN - the features for the training set
# Y_TRAIN - the labels (the thing we are trying to predict) for the training set

# X_TEST - the features for the test set, which the model does not see during training
# Y_TEST - the actual labels for the test set, used only to score the model's predictions

# Background: according to the scikit-learn documentation, learning an optimal
# decision tree is NP-complete, so practical implementations rely on heuristics
# such as greedy algorithms under the hood of scikit-learn.
# That concerns how the trees are built, not this split: here random_state only
# fixes the shuffle used to divide the rows, which makes the split repeatable.
# test_size = 0.25 is the fraction of rows held out for testing (25% test, 75% train);
# it does not control the size or depth of the decision trees.

X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(df, labels, test_size = 0.25, random_state = 42)


In [37]:
# Report the shape of each piece of the train/test split.
for description, array in (
    ('Training Features Shape:', X_TRAIN),
    ('Training Labels Shape:', Y_TRAIN),
    ('Testing Features Shape:', X_TEST),
    ('Testing Labels Shape:', Y_TEST),
):
    print(description, array.shape)

Training Features Shape: (32868, 11)
Training Labels Shape: (32868,)
Testing Features Shape: (10956, 11)
Testing Labels Shape: (10956,)


In [38]:
# Baseline error
# Before training, establish a baseline error that the model has to beat.
# With no better reference available, the DEWP column of the test features
# is used as a naive stand-in prediction for the temperature.
dewp_column = features_list.index('DEWP')
base_prediction = X_TEST[:, dewp_column]

# Absolute error of the baseline against the true test labels, then its mean
base_error = np.abs(base_prediction - Y_TEST)
mean_base_error = np.mean(base_error)

print("The error (don't really know if I calculated it right):", round(mean_base_error, 2))

# Using DEWP as the prediction gives a baseline error of 10.6 to beat.
# Whether that is high cannot be judged yet: the model has not been trained,
# so there is nothing to compare the baseline against.




The error (don't really know if I calculated it right): 10.6


In [39]:
# Training the model

# TODO: create a new random forest with 1000 trees (not done in this cell yet).
# For now this only prints the feature names, in the column order of the feature array.
print(features_list)





['No', 'year', 'month', 'day', 'hour', 'pm2.5', 'DEWP', 'PRES', 'Iws', 'Is', 'Ir']
