In [100]:
# libs
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
import pandas
from sklearn.metrics import accuracy_score, precision_score, recall_score
import numpy as np

In [101]:
# load data
data = pandas.read_csv("weather_forecast_data.csv")
data.describe()

Unnamed: 0,Temperature,Humidity,Wind_Speed,Cloud_Cover,Pressure
count,2475.0,2460.0,2468.0,2467.0,2473.0
mean,22.573777,64.366909,9.911826,49.80877,1014.409327
std,7.332397,19.973824,5.780705,29.062298,20.182257
min,10.001842,30.005071,0.009819,0.015038,980.014486
25%,16.356603,47.183183,4.782528,24.261914,997.029601
50%,22.53311,64.090914,9.908572,49.692078,1013.583677
75%,28.984159,81.561021,14.953142,75.417253,1031.762839
max,34.995214,99.997481,19.999132,99.997795,1049.985593


In [102]:
data.isnull().sum()

Temperature    25
Humidity       40
Wind_Speed     32
Cloud_Cover    33
Pressure       27
Rain            0
dtype: int64

In [103]:
# drop nulls of the data set
DF_DropNulls = data.copy()
DF_DropNulls.dropna(inplace=True)

In [104]:
# get numeric features only
numericFeatures = data.select_dtypes(include="number")

In [105]:
# loop on all cols, to replace all nulls of it with the mean of the col
numericFeaturesWithMean = numericFeatures.copy()
for column in numericFeatures.columns:
    featureMean = numericFeaturesWithMean[column].mean()
    numericFeaturesWithMean[column].fillna(featureMean, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  numericFeaturesWithMean[column].fillna(featureMean, inplace=True)


feature scaling <br>
spliting data

In [106]:
from sklearn.preprocessing import StandardScaler

# scalling data 
scaler = StandardScaler()

# scaling data with mean 
for column in numericFeaturesWithMean.columns:
     numericFeaturesWithMean[column] = scaler.fit_transform( numericFeaturesWithMean[[column]])

# scaling data without nulls
for column in DF_DropNulls.select_dtypes(include="number").columns:
   DF_DropNulls[column] = scaler.fit_transform( DF_DropNulls[[column]])


In [107]:
# spliting training data and testing data
from sklearn.model_selection import train_test_split
label_encoder = LabelEncoder()

targetFeatureFor_DF_WithMean = label_encoder.fit_transform(data['Rain'])
targetFeatureFor_DF_WithoutNulls = label_encoder.fit_transform(DF_DropNulls['Rain'])
DF_WithoutNuls_numericFeatrues = DF_DropNulls.select_dtypes(include="number") 

x_withMean_train, x_withMean_test, y_withMean_train, y_withMean_test = train_test_split(numericFeaturesWithMean, targetFeatureFor_DF_WithMean, test_size=0.2, random_state=42)
x_withoutNulls_train, x_withoutNulls_test, y_withoutNulls_train, y_withoutNulls_test = train_test_split(DF_WithoutNuls_numericFeatrues, targetFeatureFor_DF_WithoutNulls, test_size=0.2, random_state=42)


print(y_withoutNulls_train[:10])

 
 

[0 0 0 0 0 0 0 0 0 0]


In [108]:
knn_with_mean = KNeighborsClassifier(n_neighbors=5)
knn_with_mean.fit(x_withMean_train, y_withMean_train)
y_pred_with_mean = knn_with_mean.predict(x_withMean_test)


In [109]:
print("Accuracy::", accuracy_score(y_withMean_test, y_pred_with_mean))
print("precision:",precision_score(y_withMean_test, y_pred_with_mean))
print("recall:",recall_score(y_withMean_test, y_pred_with_mean))

Accuracy:: 0.968
precision: 0.9166666666666666
recall: 0.7857142857142857


In [110]:
knn_without_nulls = KNeighborsClassifier(n_neighbors=5)
knn_without_nulls.fit(x_withoutNulls_train, y_withoutNulls_train)
y_pred_without_nulls = knn_without_nulls.predict(x_withoutNulls_test)

In [111]:
print("Accuracy:", accuracy_score(y_withoutNulls_test, y_pred_without_nulls))
print("precision:",precision_score(y_withoutNulls_test, y_pred_without_nulls))
print("recall:",recall_score (y_withoutNulls_test, y_pred_without_nulls))


Accuracy: 0.9617021276595744
precision: 0.890625
recall: 0.8382352941176471


In [112]:
class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def euclidean_distance(self, x1, x2):
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def predict(self, X_test):
        predictions = []
        for x_test in X_test.values: 
            distances = []
            for x_train in self.X_train.values:  
                distances.append(self.euclidean_distance(x_test, x_train))
            nearest_neighbors = np.argsort(distances)[:self.k]
            nearest_labels = self.y_train[nearest_neighbors]
            prediction = np.argmax(np.bincount(nearest_labels))
            predictions.append(prediction)
        return np.array(predictions)


In [113]:
knn = KNN(k=3)

knn.fit(x_withMean_train, y_withMean_train)

# Predict the labels for the test set
predictions = knn.predict(x_withMean_test)

euclidean_accuracy = accuracy_score(y_withMean_test,predictions)
print(euclidean_accuracy)


0.97


Decision Tree

In [114]:
# import libs
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

create train and test for data with mean

In [115]:
clf = DecisionTreeClassifier()

clf.fit(x_withMean_train, y_withMean_train)

y_withMean_predicted = clf.predict(x_withMean_test)

withMeanAccuracyScore = accuracy_score(y_withMean_test, y_withMean_predicted)
print(f"accuracy : {withMeanAccuracyScore * 100}%")

accuracy : 99.6%


In [116]:
clf = DecisionTreeClassifier()

clf.fit(x_withoutNulls_train, y_withoutNulls_train)

y_withoutNulls_predicted = clf.predict(x_withoutNulls_test)

withoutNullsAccuracyScore = accuracy_score(y_withoutNulls_test, y_withoutNulls_predicted)
print(f"accuracy : {withoutNullsAccuracyScore * 100}%")

accuracy : 99.7872340425532%
