# Rain Prediction (Sydney Dataset)

### Importing necessary libraries

In [92]:
import pandas as pd
import numpy as np
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

### Reading CSV files into Dataframe

In [93]:
df = pd.read_csv('/kaggle/input/daily-weather-observations/Weather_Data.csv')
df.head()

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2/1/2008,19.5,22.4,15.6,6.2,0.0,W,41,S,SSW,...,92,84,1017.6,1017.4,8,8,20.7,20.9,Yes,Yes
1,2/2/2008,19.5,25.6,6.0,3.4,2.7,W,41,W,E,...,83,73,1017.9,1016.4,7,7,22.4,24.8,Yes,Yes
2,2/3/2008,21.6,24.5,6.6,2.4,0.1,W,41,ESE,ESE,...,88,86,1016.7,1015.6,7,8,23.5,23.0,Yes,Yes
3,2/4/2008,20.2,22.8,18.8,2.2,0.0,W,41,NNE,E,...,83,90,1014.2,1011.8,8,8,21.4,20.9,Yes,Yes
4,2/5/2008,19.7,25.7,77.4,4.8,0.0,W,41,NNE,W,...,88,74,1008.3,1004.8,8,8,22.5,25.5,Yes,Yes


Removing 'Date' column and checking if all datatypes are compatible for training

In [94]:
df.drop('Date',axis=1,inplace=True)
df.dtypes

MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir       object
WindGustSpeed      int64
WindDir9am        object
WindDir3pm        object
WindSpeed9am       int64
WindSpeed3pm       int64
Humidity9am        int64
Humidity3pm        int64
Pressure9am      float64
Pressure3pm      float64
Cloud9am           int64
Cloud3pm           int64
Temp9am          float64
Temp3pm          float64
RainToday         object
RainTomorrow      object
dtype: object

**WindGustDir,WindDir9am and WindDir3pm** are object type and are **categorical** variables so we need to convert them into numerical values to be used.     

### Mapping Categorical Columns

In [95]:
df['WindDir9am'].unique()

array(['S', 'W', 'ESE', 'NNE', 'SSW', 'WNW', 'N', 'SW', 'SE', 'SSE',
       'WSW', 'E', 'ENE', 'NW', 'NNW', 'NE'], dtype=object)

In [96]:
mapping1 = {'Yes' :  1,
            'No' : 0}

mapping2 = {'S': 0, 'W' : 1, 'ESE' : 2, 
            'NNE': 3, 'SSW' : 4, 'WNW' : 5, 
            'N' : 6, 'SW' : 7, 'SE' : 8, 'SSE' : 9,
            'WSW' : 10, 'E' : 11, 'ENE' : 12, 
            'NW' : 13, 'NNW' : 14, 'NE' : 15}



df['RainToday'] = df['RainToday'].map(mapping1)
df['WindGustDir'] = df['WindGustDir'].map(mapping2)
df['WindDir9am'] = df['WindDir9am'].map(mapping2)
df['WindDir3pm'] = df['WindDir3pm'].map(mapping2)

df.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,19.5,22.4,15.6,6.2,0.0,1,41,0,4,17,...,92,84,1017.6,1017.4,8,8,20.7,20.9,1,Yes
1,19.5,25.6,6.0,3.4,2.7,1,41,1,11,9,...,83,73,1017.9,1016.4,7,7,22.4,24.8,1,Yes
2,21.6,24.5,6.6,2.4,0.1,1,41,2,2,17,...,88,86,1016.7,1015.6,7,8,23.5,23.0,1,Yes
3,20.2,22.8,18.8,2.2,0.0,1,41,3,11,22,...,83,90,1014.2,1011.8,8,8,21.4,20.9,1,Yes
4,19.7,25.7,77.4,4.8,0.0,1,41,3,1,11,...,88,74,1008.3,1004.8,8,8,22.5,25.5,1,Yes


Checking if there are any NULL or invalid values

In [97]:
df.isnull().sum()

MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
dtype: int64

In [98]:
df.drop('RainTomorrow',axis=1,inplace=True)

Dropped 'RainTomoorow' Column as it is of no use to us. Now we will shuffle our data to eliminate overfitting to new unseen data

In [99]:
df = df.sample(frac=1).reset_index(drop=True)

###  Splitting Data int Train, Test Sets

In [100]:
x = np.array(df[df.columns[:-1]])
y = np.array(df['RainToday'])
x,y

(array([[13.1, 25.7,  0. , ...,  4. , 19.3, 25.1],
        [10.6, 24.8,  0. , ...,  1. , 17.2, 22. ],
        [17.7, 22.9,  4.8, ...,  8. , 20.2, 22.3],
        ...,
        [ 8.8, 20.1,  9.8, ...,  1. , 10.3, 18.5],
        [21.2, 29.5,  0. , ...,  4. , 26.5, 25.6],
        [17.8, 28.9,  2. , ...,  4. , 22.3, 26.6]]),
 array([0, 0, 1, ..., 1, 0, 1]))

In [101]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2)
y_train.shape, y_test.shape, x_train.shape, x_test.shape

((2616,), (655,), (2616, 19), (655, 19))

### Training Model and Predicting

In [102]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss

model = LogisticRegression(max_iter = 200)
model.fit(x_train, y_train)
preds = model.predict(x_test)

### Testing Accuracy

In [103]:
predprob = model.predict_proba(x_test)
predprob

array([[8.71590226e-01, 1.28409774e-01],
       [9.97454907e-01, 2.54509266e-03],
       [6.81431163e-06, 9.99993186e-01],
       ...,
       [9.92751735e-01, 7.24826530e-03],
       [9.99478441e-01, 5.21558793e-04],
       [0.00000000e+00, 1.00000000e+00]])

In [104]:
print(f"Accuracy Score: {accuracy_score(y_test, preds):.2f}\n"
      f"Jaccard Score: {jaccard_score(y_test, preds):.2f}\n"
      f"F1 Score: {f1_score(y_test, preds):.2f}\n"
      f"Log Loss: {log_loss(y_test, predprob):.2f}")

Accuracy Score: 1.00
Jaccard Score: 0.99
F1 Score: 0.99
Log Loss: 0.02


We can clearly see that our model has done a good job on predicting values of test data

## Author
[Muhammad Taha](https://www.linkedin.com/in/muhammad-taha-740bb4234/)