In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score, confusion_matrix, \
    precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier

# Preparing Data
*Load the data from a file path:*

In [6]:
weather_data = pd.read_csv('weather.csv', sep=',')

In [7]:
weather_data

Unnamed: 0.1,Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,24995,2013-03-09,Penrith,16.5,30.0,0.0,,,NE,26.0,...,41.0,,,,,21.6,29.4,No,0.0,No
24996,24996,2013-03-10,Penrith,16.7,30.5,0.0,,,NE,24.0,...,44.0,,,,,21.3,29.0,No,0.0,No
24997,24997,2013-03-11,Penrith,18.3,29.8,0.0,,,ENE,26.0,...,49.0,,,,,20.6,28.2,No,0.0,No
24998,24998,2013-03-12,Penrith,17.5,28.1,0.0,,,NNE,22.0,...,54.0,,,,,21.2,26.8,No,0.0,No


In [8]:
weather_data = weather_data.drop(['Unnamed: 0', 'Date'], axis=1)
target = weather_data['RainTomorrow']
target = target.map({'Yes': 1, 'No': 0})

In [9]:
weather_data.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,Albury,13.4,22.9,0.6,,,W,44.0,W,WNW,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,WSW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,Albury,12.9,25.7,0.0,,,WSW,46.0,W,WSW,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,Albury,9.2,28.0,0.0,,,NE,24.0,SE,E,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,Albury,17.5,32.3,1.0,,,W,41.0,ENE,NW,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No


In [10]:
target.head()

0    0
1    0
2    0
3    0
4    0
Name: RainTomorrow, dtype: int64

# Split the data
*Split the data into training and testing sets:*

Using Stratified Shuffle Split because our data is not neccesirly balanced

In [25]:
split = StratifiedShuffleSplit(n_splits=5, test_size=0.20, random_state=42)

for train_index, test_index in split.split(weather_data, target):
    strat_train_set = weather_data.loc[train_index]
    strat_test_set = weather_data.loc[test_index]

In [26]:
strat_train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 24567 to 8930
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Location       20000 non-null  object 
 1   MinTemp        19729 non-null  float64
 2   MaxTemp        19848 non-null  float64
 3   Rainfall       19777 non-null  float64
 4   Evaporation    7513 non-null   float64
 5   Sunshine       5277 non-null   float64
 6   WindGustDir    17210 non-null  object 
 7   WindGustSpeed  17211 non-null  float64
 8   WindDir9am     17547 non-null  object 
 9   WindDir3pm     18610 non-null  object 
 10  WindSpeed9am   19549 non-null  float64
 11  WindSpeed3pm   19010 non-null  float64
 12  Humidity9am    19686 non-null  float64
 13  Humidity3pm    19134 non-null  float64
 14  Pressure9am    16106 non-null  float64
 15  Pressure3pm    16112 non-null  float64
 16  Cloud9am       11302 non-null  float64
 17  Cloud3pm       11015 non-null  float64
 18  Tem

In [27]:
# separate the features into categorical and numerical features
train_features = strat_train_set.drop(['RainTomorrow'], axis=1).copy()
categorical_features = train_features[['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']]
numerical_features = train_features.drop(['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday'], axis=1)
categorical_attribs = list(categorical_features)
numerical_attribs = list(numerical_features)

test_features = strat_test_set.drop(['RainTomorrow'], axis=1).copy()
categorical_features_test = test_features[['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']]
numerical_features_test = test_features.drop(['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday'],axis=1)
categorical_attribs_test = list(categorical_features_test)
numerical_attribs_test = list(numerical_features_test)

target_train = target.loc[train_index]
target_test = target.loc[test_index]

# Data Pipeline
*Create Pipeline for Data and apply it to features:*

In [28]:
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler())
])


full_pipeline = ColumnTransformer([
    ('num', numerical_pipeline, numerical_attribs),
    ('cat', OneHotEncoder(), categorical_attribs)
])

# Apply the full pipeline to the features
processed_train_features = full_pipeline.fit_transform(train_features)
processed_test_features = full_pipeline.transform(test_features)

# The Classifier
Defining a Classifier class help to do classifications with minimum code. 
It Fit, Predict, Evaluate and Test...

In [29]:
class Classifier:
    def __init__(self, feature, label, f_test, l_test):
        self.feature = feature
        self.label = label
        self.f_test = f_test
        self.l_test = l_test

    def logistic_classifier(self):
        print('\nLogistic Regression Started.......')
        model = LogisticRegression()
        fit = model.fit(self.feature, self.label)
        prediction = fit.predict(self.feature)
        rmse = np.sqrt(mean_squared_error(self.label, prediction))
        print(rmse)

    def decision_tree(self):
        print('\nDecision Tree Regression Started.......')
        model = DecisionTreeClassifier()
        fit = model.fit(self.feature, self.label)
        score = fit
        print(score)

    def random_forest(self):
        print('\nRandom Forest Started.......')
        model = RandomForestClassifier(max_depth=5, n_estimators=1000, random_state=42)
        fit = model.fit(self.feature, self.label)
        score = fit.score(self.f_test, self.l_test)
        print(f"Forest's Score is: {score}")

In [30]:
class_ = Classifier(processed_train_features, target_train, processed_test_features, target_test)
class_.random_forest()


Random Forest Started.......
Forest's Score is: 0.9628
