# kNN Solution to Weather.train Dataset
Written by Justin Clarke

# Pre-processing

In [32]:
import pandas as pd
import numpy as np

# Path to the training CSV, relative to the notebook's working directory.
file_path = '../ML-Intern-Challenge-main/weather_train.csv'

# Load the raw observations into a DataFrame; the bare name on the last line
# makes the notebook render the table.
data = pd.read_csv(filepath_or_buffer=file_path)
data


Unnamed: 0.1,Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,110363,2014-01-25,Albany,18.4,26.8,0.2,4.8,8.5,,,...,59.0,72.0,1014.9,1009.2,0.0,3.0,24.0,23.8,No,No
1,22316,2012-05-12,NorfolkIsland,16.1,22.1,2.4,4.2,8.7,S,31.0,...,70.0,70.0,1020.2,1018.2,2.0,3.0,19.9,20.3,Yes,No
2,75456,2010-04-22,Portland,11.6,26.4,0.2,2.2,10.4,W,37.0,...,72.0,59.0,1018.4,1015.5,2.0,6.0,19.7,24.0,No,Yes
3,81222,2009-08-13,Dartmoor,6.8,16.5,0.8,1.4,6.7,NW,37.0,...,99.0,74.0,1015.0,1014.8,,,8.6,14.8,No,No
4,1753,2013-12-17,Albury,16.9,34.0,0.0,,,N,30.0,...,54.0,29.0,1019.5,1016.3,,4.0,24.7,31.8,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130909,21300,2009-07-01,NorfolkIsland,12.3,18.8,3.2,2.2,7.3,SSW,19.0,...,69.0,65.0,1017.7,1016.0,2.0,3.0,17.4,18.2,Yes,No
130910,29726,2016-04-02,Richmond,12.8,32.0,0.0,,,W,31.0,...,81.0,18.0,1016.8,1014.0,,,17.6,30.8,No,No
130911,78525,2010-06-21,Watsonia,5.1,13.8,2.6,2.0,7.5,ENE,13.0,...,100.0,60.0,1035.1,1034.2,2.0,4.0,6.7,13.4,Yes,No
130912,19355,2012-06-24,NorahHead,5.8,17.1,0.0,,,WNW,22.0,...,59.0,49.0,1028.6,1026.7,,,10.1,15.9,No,No


Firstly, we don't want to deal with any observations that have missing values so we simply remove them.

In [33]:
# Drop every row that has a missing value in any column (in place).
# Note: dropna only removes missing entries; it does not filter out
# non-numerical columns or values.
# Only `how` is passed: newer pandas releases raise TypeError when `how` and
# `thresh` are both supplied, even if `thresh` is None.
data.dropna(axis=0, how='any', inplace=True)
data

Unnamed: 0.1,Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
1,22316,2012-05-12,NorfolkIsland,16.1,22.1,2.4,4.2,8.7,S,31.0,...,70.0,70.0,1020.2,1018.2,2.0,3.0,19.9,20.3,Yes,No
2,75456,2010-04-22,Portland,11.6,26.4,0.2,2.2,10.4,W,37.0,...,72.0,59.0,1018.4,1015.5,2.0,6.0,19.7,24.0,No,Yes
8,130392,2010-01-06,Hobart,12.9,21.5,0.0,6.0,10.9,WNW,56.0,...,60.0,32.0,1013.8,1012.8,6.0,4.0,13.4,18.9,No,No
11,141371,2014-12-06,Darwin,26.6,34.5,0.6,6.8,5.7,E,56.0,...,60.0,57.0,1008.7,1004.7,7.0,7.0,31.5,32.8,No,Yes
12,132573,2016-03-25,Hobart,11.4,21.0,0.0,3.4,3.4,NNW,43.0,...,55.0,41.0,1013.9,1012.7,7.0,7.0,13.9,19.8,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130901,95527,2015-04-25,Townsville,16.4,31.1,0.0,5.2,10.9,NE,30.0,...,65.0,50.0,1012.2,1008.8,1.0,6.0,25.7,29.3,No,No
130906,61603,2010-02-26,Sale,11.8,32.5,0.2,4.8,10.9,ESE,35.0,...,99.0,27.0,1022.9,1018.3,1.0,1.0,16.0,31.7,No,No
130907,78339,2009-12-17,Watsonia,16.7,22.7,0.0,15.2,0.5,SSW,46.0,...,67.0,99.0,1008.7,1007.3,7.0,8.0,21.7,19.3,No,Yes
130908,70831,2010-03-15,Mildura,15.7,32.3,0.0,8.0,11.3,W,37.0,...,55.0,23.0,1023.1,1020.3,1.0,2.0,21.0,31.0,No,No


Next, we select the features that we want the model to use (i.e. the numerical data). In this step we also remove the 9am variables, since we already have the 3pm ones; this lowers the complexity of the model and, ideally, produces better results.

In [34]:
# Columns kept for the model, in order: ten numerical measurements, then the
# two Yes/No rain flags. 'RainTomorrow' is last.
# The 9am variables are left out to lower the complexity of the model.
data_features = [
    'MinTemp',
    'MaxTemp',
    'Rainfall',
    'Evaporation',
    'Sunshine',
    'WindGustSpeed',
    'Humidity3pm',
    'Pressure3pm',
    'Cloud3pm',
    'Temp3pm',
    'RainToday',
    'RainTomorrow',
]


We also want to make the 'RainToday' and 'RainTomorrow' variables easier to use, so we turn them into binary variables with 'Yes' corresponding to 1 and 'No' corresponding to 0. We also normalise the data (min-max scaling) so that every column lies in the range [0,1]; this puts all features on a common scale, so that no single feature with a large numerical range dominates the distance calculation.

In [35]:
# Turn the Yes/No columns into usable binary data through a mapping
yes_no_to_binary = {'Yes': 1, 'No': 0}
for rain_column in ('RainToday', 'RainTomorrow'):
    data[rain_column] = data[rain_column].map(yes_no_to_binary)

# Keep only the selected columns
X = data[data_features]

# Min-max normalise each column: subtract the column minimum, then divide by
# the column range (max - min)
X_norm = (X - X.min()) / (X.max() - X.min())

X_norm

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,Humidity3pm,Pressure3pm,Cloud3pm,Temp3pm,RainToday,RainTomorrow
1,0.598425,0.409091,0.011639,0.051724,0.600000,0.191304,0.70,0.665049,0.333333,0.391509,1.0,0.0
2,0.480315,0.506818,0.000970,0.027094,0.717241,0.243478,0.59,0.621359,0.666667,0.478774,0.0,1.0
8,0.514436,0.395455,0.000000,0.073892,0.751724,0.408696,0.32,0.577670,0.444444,0.358491,0.0,0.0
11,0.874016,0.690909,0.002910,0.083744,0.393103,0.408696,0.57,0.446602,0.777778,0.686321,0.0,1.0
12,0.475066,0.384091,0.000000,0.041872,0.234483,0.295652,0.41,0.576052,0.777778,0.379717,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
130901,0.606299,0.613636,0.000000,0.064039,0.751724,0.182609,0.50,0.512945,0.666667,0.603774,0.0,0.0
130906,0.485564,0.645455,0.000970,0.059113,0.751724,0.226087,0.27,0.666667,0.111111,0.660377,0.0,0.0
130907,0.614173,0.422727,0.000000,0.187192,0.034483,0.321739,0.99,0.488673,0.888889,0.367925,0.0,1.0
130908,0.587927,0.640909,0.000000,0.098522,0.779310,0.243478,0.23,0.699029,0.222222,0.643868,0.0,0.0


# Training and Validation
A kNN model requires no training since the model is essentially 'memorising' the training data. We instead define some functions to predict and then validate our model's predictions. We use Manhattan distance since we have a large number of features.

In [23]:
# Separate the data into training and validation datasets.
# The last 100 rows are held out for validation; everything before them is
# used for the neighbour search. Despite its name, train_pct is a row count.
# An 80/20 split (int(0.8 * len(X_norm))) would give a larger validation set,
# but the computation was taking too long.
train_pct = len(X_norm) - 100
X_train = X_norm[:train_pct]
X_val = X_norm[train_pct:]

In [24]:
#Use a k-nearest neighbour algorithm using manhattan distance since it is a complex dataset with many features
def manhat_dist(row1, row2):
    """Return the Manhattan (L1) distance between two rows.

    The last entry of ``row1`` is skipped: only the first ``len(row1) - 1``
    positions are compared. Each compared entry is converted with ``float``
    before subtracting.

    Args:
        row1: Indexable row whose final entry is ignored.
        row2: Indexable row with at least ``len(row1) - 1`` entries.

    Returns:
        The sum of absolute differences over the compared positions, as a
        float (``0.0`` when nothing is compared).
    """
    feature_count = len(row1) - 1
    total = 0.0
    for idx in range(feature_count):
        gap = float(row1[idx]) - float(row2[idx])
        total += abs(gap)
    return total

#locate the nearest neighbours from a given datapoint
def get_neighbors(train, test_row, k):
    """Return the ``k`` training rows closest to ``test_row``.

    Distances are computed with ``manhat_dist(test_row, train_row)``, and rows
    are ordered by ascending distance. The sort is stable, so rows at equal
    distance keep their original order.

    Args:
        train: Object exposing ``to_numpy()`` (e.g. a pandas DataFrame); each
            row is converted to a plain list before comparison.
        test_row: The query row passed as the first argument to
            ``manhat_dist``.
        k: Number of neighbours to return.

    Returns:
        A list of at most ``k`` training rows (as lists), nearest first. If
        ``k`` exceeds the number of training rows, every row is returned
        instead of raising IndexError; ``k <= 0`` yields an empty list.
    """
    scored = [
        (train_row, manhat_dist(test_row, train_row))
        for train_row in train.to_numpy().tolist()
    ]
    scored.sort(key=lambda pair: pair[1])
    # Clamp at zero so a negative k does not turn into a "drop the last rows"
    # slice; slicing (rather than indexing) tolerates k > len(scored).
    return [train_row for train_row, _ in scored[:max(k, 0)]]

#Classification
def predict_weather(train, test_row, k):
    """Classify ``test_row`` by a vote among its ``k`` nearest neighbours.

    The last entry of each neighbour row is treated as its label and the
    labels are summed.

    Args:
        train: Training data, forwarded to ``get_neighbors``.
        test_row: The row to classify, forwarded to ``get_neighbors``.
        k: Number of neighbours to consult.

    Returns:
        True when the label sum is at least half the number of neighbours
        (so an exact tie also returns True), otherwise False.
    """
    nearest = get_neighbors(train, test_row, k)
    rain_votes = sum(neighbor[-1] for neighbor in nearest)
    half = len(nearest) / 2
    return rain_votes >= half

#Determines the accuracy of the model
def accuracy(actual, predicted):
    """Return the percentage of positions where ``predicted`` equals ``actual``.

    Args:
        actual: Sequence of true labels.
        predicted: Sequence of predicted labels, at least as long as
            ``actual``. Entries beyond ``len(actual)`` are ignored.

    Returns:
        Accuracy as a float in the range 0.0-100.0. An empty ``actual``
        yields 0.0 rather than raising ZeroDivisionError.

    Raises:
        IndexError: If ``predicted`` has fewer entries than ``actual``.
    """
    total = len(actual)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    if len(predicted) < total:
        raise IndexError("predicted has fewer entries than actual")
    correct = sum(truth == guess for truth, guess in zip(actual, predicted))
    return correct / float(total) * 100.0

Finally, we perform some validation to find the best value for k, our hyperparameter for the number of neighbours, by trying a range of values. The accuracies (in %) printed below are in order for k = 1, 5, 10, 25 and 50.

In [28]:
# Validation to find the best hyperparameter k
k_value = [1, 5, 10, 25, 50]
actual = X_val.RainTomorrow.tolist()

# The validation rows do not change between values of k, so convert them once.
val_rows = X_val.to_numpy().tolist()

for k in k_value:
    predictions = [predict_weather(X_train, row, k) for row in val_rows]
    # One accuracy percentage is printed per k, in the order of k_value.
    print(accuracy(actual, predictions))

82.0
86.0
85.0
85.0
85.0


Finally, to predict any future data we simply insert the entire X dataset and predict any sample we are given with k = 5, as found by our validation. As seen here, with a sample observation, the model predicts that it would not rain given these values.

In [29]:
#From this it appears that k = 5 gives the best results in terms of accuracy and as such we set k = 5 for our final model
#Our final model can be run with any prediction set and we can expect it to have ~85% accuracy

# Raw readings, assumed to follow the order of the first ten entries of
# data_features (MinTemp ... Temp3pm) - TODO confirm.
sample = [11.2, 26.3, 2.5, 4.6, 9.1, 37.7, 59.2, 1015.1, 6.1, 24.3]

# Scale the sample with the same per-column min/max that produced X_norm, so it
# is comparable with the normalised rows it is measured against. Comparing the
# raw sample with the raw X would let large-valued columns dominate the distance.
sample_cols = data_features[:len(sample)]
col_min = X[sample_cols].min()
col_range = X[sample_cols].max() - col_min
sample_norm = ((pd.Series(sample, index=sample_cols) - col_min) / col_range).tolist()

# manhat_dist ignores the last entry of the query row (the label slot in the
# validation rows), so append a placeholder to keep the sample's final value
# in the comparison.
sample_norm.append(0.0)

print(predict_weather(X_norm, sample_norm, 5))


False
