In [14]:
import os
import datetime
import IPython
import IPython.display
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as t
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


In [3]:
# Import Dataset as a CSV file
#df = pd.read_csv(r"/home/nicky/Documents/ECE_4424/Final_Project/jena_climate_2009_2016.csv")
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_csv(r"/content/drive/My Drive/Colab Notebooks/weatherHistory.csv")
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.


In [4]:
# Statistics of the Dataset
df.describe()

Unnamed: 0,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars)
count,96453.0,96453.0,96453.0,96453.0,96453.0,96453.0,96453.0,96453.0
mean,11.932678,10.855029,0.734899,10.81064,187.509232,10.347325,0.0,1003.235956
std,9.551546,10.696847,0.195473,6.913571,107.383428,4.192123,0.0,116.969906
min,-21.822222,-27.716667,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.688889,2.311111,0.6,5.8282,116.0,8.3398,0.0,1011.9
50%,12.0,12.0,0.78,9.9659,180.0,10.0464,0.0,1016.45
75%,18.838889,18.838889,0.89,14.1358,290.0,14.812,0.0,1021.09
max,39.905556,39.344444,1.0,63.8526,359.0,16.1,0.0,1046.38


In [5]:
# Histogram plot of each value to further visualize any outliers
fig_temp = px.histogram(df, x = 'Temperature (C)')
fig_temp.show()

fig_apparent_temp = px.histogram(df, x = 'Apparent Temperature (C)')
fig_apparent_temp.show()

fig_humidity = px.histogram(df, x = 'Humidity')
fig_humidity.show()

fig_wind_speed = px.histogram(df, x = 'Wind Speed (km/h)')
fig_wind_speed.show()

fig_wind_bearing = px.histogram(df, x = 'Wind Bearing (degrees)')
fig_wind_bearing.show()

fig_visibility = px.histogram(df, x = 'Visibility (km)')
fig_visibility.show()

# Notice that the Loud Cover column is all zero and is therefore not necessary
fig_loud_cover = px.histogram(df, x = 'Loud Cover')
fig_loud_cover.show()

# Notice that the pressure histogram has many values at zero
# After manually looking at the dataset, I have decided that these values are either
# outliers or faulty measurements. For example, the pressure may be at 1000 at 5:00 p.m.
# then it will drop to 0 at 6:00 p.m. and return to 1000 at 7:00 p.m.
fig_pressure = px.histogram(df, x = 'Pressure (millibars)')
fig_pressure.show()



In [6]:
# The Loud Cover column is dropped because all values are zero
df = df.drop(columns=['Loud Cover'])
df.head()

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,1016.51,Partly cloudy throughout the day.


In [7]:
# Function to Replace pressure zero values with the average of the last and next non-zero values
# This will get rid of the outliers in the pressure column

# The method to do so is to find all zeros in the pressure column
# Then find the last non-zero value and the next non-zero value
# Average these two values together and replace the zero with the averaged value

def interpolate_zeros(df, column_name):
  zero_indices = df.index[df[column_name] == 0].tolist()

  for i in zero_indices:
    last_non_zero_index = df.index[df.index < i][df[column_name][df.index < i] != 0].max()
    last_non_zero_value = df.loc[last_non_zero_index, column_name]

    # Find the next non-zero value after the zero value
    next_non_zero_index = df.index[df.index > i][df[column_name][df.index > i] != 0].min()
    next_non_zero_value = df.loc[next_non_zero_index, column_name]

    # Calculate the average of the last and next non-zero values
    average_value = (last_non_zero_value + next_non_zero_value) / 2

    # Update the zero value with the calculated average
    df.at[i, column_name] = average_value

  return df

# Notice that the histogram of the values for pressure is much more expected than before
df = interpolate_zeros(df, 'Pressure (millibars)')
fig_pressure = px.histogram(df, x = 'Pressure (millibars)')
fig_pressure.show()

remaining_zeros = df[df['Pressure (millibars)'] == 0]
remaining_zero_count = remaining_zeros.shape[0]
print(remaining_zero_count)


0


In [8]:
# Now that the data has been preprocessed, the next step is to create the featues and target and well as creating a training and testing dataset
features = df[['Temperature (C)', 'Apparent Temperature (C)', 'Humidity', 'Wind Speed (km/h)', 'Wind Bearing (degrees)', 'Visibility (km)', 'Pressure (millibars)']]
target = df['Summary']

# Change the summary string to integer values
label_encoder = LabelEncoder()
target_encoded = label_encoder.fit_transform(target)

# Create a Test and Training set
# The training set is 80% of the original dataset and the testset is 20%
X_train, X_test, y_train, y_test = train_test_split(features, target_encoded, test_size=0.2, random_state=42)


def gini_index(groups, classes):
    n_instances = float(sum([len(group) for group in groups]))
    gini = 0.0
    for group in groups:
        size = float(len(group))
        if size == 0:
            continue
        score = 0.0
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / size
            score += p * p
        gini += (1.0 - score) * (size / n_instances)
    return gini

def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

def get_split(dataset):
    class_values = list(set(row[-1] for row in dataset))
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    for index in range(len(dataset[0])-1):
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = gini_index(groups, class_values)
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], gini, groups
    return {'index':b_index, 'value':b_value, 'groups':b_groups}

# Convert DataFrame to list for the decision tree functions
dataset = pd.concat([X_train, pd.Series(y_train, index=X_train.index)], axis=1).values

# Test get_split function to find the best split
#split = get_split(dataset)
subset_size = 1000
np.random.seed(42)
subset_indices = np.random.choice(len(dataset), size=subset_size, replace=False)

# Creating the subset based on the selected indices
subset_dataset = [dataset[i] for i in subset_indices]
#subset_dataset = dataset[0:1000]

# Test getting the best split with the corrected smaller subset
split = get_split(subset_dataset)

print('Split: [X%d < %.3f]' % ((split['index']+1), split['value']))

Split: [X6 < 3.284]


In [9]:
def to_terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)

def split(node, max_depth, min_size, depth):
    left, right = node['groups']
    del(node['groups'])
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_split(left)
        split(node['left'], max_depth, min_size, depth+1)
    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_split(right)
        split(node['right'], max_depth, min_size, depth+1)

def build_tree(train, max_depth, min_size):
    root = get_split(train)
    split(root, max_depth, min_size, 1)
    return root

tree = build_tree(subset_dataset, max_depth=3, min_size=10)
tree

{'index': 5,
 'value': 3.2844,
 'left': {'index': 5,
  'value': 0.161,
  'left': 6.0,
  'right': {'index': 0,
   'value': 12.433333333333334,
   'left': 12.0,
   'right': 12.0}},
 'right': {'index': 0,
  'value': 17.922222222222224,
  'left': {'index': 3, 'value': 4.4275, 'left': 19.0, 'right': 17.0},
  'right': {'index': 0,
   'value': 27.17222222222222,
   'left': 19.0,
   'right': 19.0}}}

In [10]:
def predict(node, row):
    if row[node['index']] < node['value']:
        if isinstance(node['left'], dict):
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']

row = subset_dataset[0]
prediction = predict(tree, row)
print("Predicted class:", prediction)

Predicted class: 17.0


In [11]:
def bootstrap_sample(data):
    indices = np.random.choice(len(data), size=len(data), replace=True)
    return [data[index] for index in indices]

def random_forest_train(data, n_trees, max_depth, min_size):
    forest = []
    for _ in range(n_trees):
        sample = bootstrap_sample(data)
        tree = build_tree(sample, max_depth, min_size)
        forest.append(tree)
    return forest

def random_forest_predict(forest, row):
    predictions = [predict(tree, row) for tree in forest]
    return max(set(predictions), key=predictions.count)

n_trees = 10
max_depth = 10
min_size = 10
forest = random_forest_train(subset_dataset, n_trees, max_depth, min_size)


In [12]:
forest_prediction = np.empty(10000, dtype=np.float64)
# Predict with the tiny random forest for a single row
for i in range(0, 1000):
  row = subset_dataset[i]  # Example row
  forest_prediction[i] = random_forest_predict(forest, row)

In [13]:
print("Forest Prediction:", forest_prediction)
print("\nTarget Encoded:", target_encoded[0:100])

count = 0
for i in range (0, 1000):
  if target_encoded[i] == forest_prediction[i]:
    count = count + 1

print(count)
print("Accuracy:", count/1000)


Forest Prediction: [1.70000e+001 1.70000e+001 1.90000e+001 ... 2.47097e-319 3.85223e-320
 1.29065e-319]

Target Encoded: [19 19 17 19 17 19 19 19 19 19 19 19 19 19 19 19 19 17 17 17 17 17 19 17
 19 19 17 19 19 19 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 18 18
 18 18 18 18 18 18 17 17 19 19 19 19 17 17 17 17 17 18 12 12 18 18 18 18
 12 12 17 18 18 18 18 18  3  3 18 17 17 17 17 17 18 18 18 18 17 18 17 18
 17 18 19 18]
277
Accuracy: 0.277


In [16]:
#The accuracy is not very good. Lets compare the model created to a built in model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
built_in_model_accuracy = accuracy_score(y_test, y_pred)

In [17]:
# The accuracy of the built in model is about 0.58 while the accuracy of the created model ranges from 0.27 - 0.40
print("Accuracy:", built_in_model_accuracy)

Accuracy: 0.5854025193095226
