In [5]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing

In [6]:
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
# Directory holding the input CSV; it is joined with the file name when the data is loaded.
dirname = 'dataset'

In [8]:
# Build the CSV path from the data directory, then read it into a DataFrame.
weather_csv_path = os.path.join(dirname, 'weatherAUS.csv')
input_df = pd.read_csv(weather_csv_path)

In [9]:
# Show the first rows of the loaded frame as a quick parsing check.
input_df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No


In [10]:
# Per-column dtype and non-null count; the gaps in the counts show which columns have missing values.
input_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142193 entries, 0 to 142192
Data columns (total 24 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           142193 non-null  object 
 1   Location       142193 non-null  object 
 2   MinTemp        141556 non-null  float64
 3   MaxTemp        141871 non-null  float64
 4   Rainfall       140787 non-null  float64
 5   Evaporation    81350 non-null   float64
 6   Sunshine       74377 non-null   float64
 7   WindGustDir    132863 non-null  object 
 8   WindGustSpeed  132923 non-null  float64
 9   WindDir9am     132180 non-null  object 
 10  WindDir3pm     138415 non-null  object 
 11  WindSpeed9am   140845 non-null  float64
 12  WindSpeed3pm   139563 non-null  float64
 13  Humidity9am    140419 non-null  float64
 14  Humidity3pm    138583 non-null  float64
 15  Pressure9am    128179 non-null  float64
 16  Pressure3pm    128212 non-null  float64
 17  Cloud9am       88536 non-null

## Making column header lowercase

In [None]:
# Lowercase every column label so later cells can refer to columns in one consistent style.
input_df.columns = [label.lower() for label in input_df.columns]

Checking the summary of the dataset

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the dataset.
input_df.describe()

Using matplotlib to plot a histogram for each numerical attribute

In [None]:
# One histogram per numerical column, 50 bins each, drawn on a 20x15 figure.
input_df.hist(bins=50, figsize=(20,15))

Points noted till now:
- rainfall, evaporation, windgustspeed, windspeed9am, windspeed3pm, risk_mm are tail-heavy, so they need to be transformed to have more bell-shaped distributions
- The attributes have different scales -> feature scaling

Now lets look at all the data that are of type object just to get the understanding.

In [None]:
# Number of rows for each distinct value of the location column.
input_df["location"].value_counts()

In [None]:
# Number of rows for each distinct value of windgustdir.
input_df['windgustdir'].value_counts()

In [None]:
# Number of rows for each distinct value of winddir9am.
input_df['winddir9am'].value_counts()

In [None]:
# Number of rows for each distinct value of winddir3pm.
input_df['winddir3pm'].value_counts()

In [None]:
# Number of rows for each distinct value of raintoday.
input_df['raintoday'].value_counts()

In [None]:
# Number of rows for each distinct value of raintomorrow (the column predicted later on).
input_df['raintomorrow'].value_counts()

In this problem we are going to predict the feature raintomorrow.
We can clearly see that this dataset is not balanced.
Lets plot the graph too to see the imbalance nature of the dataset.

In [None]:
# Relative frequency of each target class, drawn as a two-bar chart.
class_share = input_df.raintomorrow.value_counts(normalize=True)
class_share.plot(kind='bar', color=['red', 'blue'])

## Train Test split
Lets convert our target column raintomorrow and another column raintoday from yes, no to 1, 0

In [None]:
# Encode the two Yes/No columns as 1/0 using one shared lookup table.
yes_no_to_int = {"Yes": 1, "No": 0}
for flag_column in ('raintoday', 'raintomorrow'):
    input_df[flag_column] = input_df[flag_column].map(yes_no_to_int)

Lets split the data set to train and test set before we handle the imbalance dataset.
We will prepare X Y data first.
Since we are going to predict whether it is going to rain or not tomorrow, we set raintomorrow as Y and everything else as X


In [None]:
# X = input_df.drop('raintomorrow', axis=1)
# y = input_df['raintomorrow']

In [None]:
# X.head()
# y

In [None]:
# Class counts of the target column, used as the reference for the stratified split below.
input_df.raintomorrow.value_counts()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 split, so train and test keep the same raintomorrow class ratio.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so take the single (train, test) pair directly instead of looping.
train_index, test_index = next(split.split(input_df, input_df['raintomorrow']))

# split() yields positional row indices, so select with .iloc. Label-based .loc only
# gives the same rows while the frame has its default 0..n-1 index.
strat_train_set = input_df.iloc[train_index]
strat_test_set = input_df.iloc[test_index]
    

In [None]:
# Compute the class counts once per split, then print counts and proportions.
train_counts = strat_train_set.raintomorrow.value_counts()
test_counts = strat_test_set.raintomorrow.value_counts()

print(train_counts)
print(train_counts / len(strat_train_set))
print("-----------------------")
print(test_counts / len(strat_test_set))
print(test_counts)

We want to handle the imbalanced dataset here:
According to this [link](https://www.kdnuggets.com/2020/01/5-most-useful-techniques-handle-imbalanced-datasets.html),
4 most useful techniques are:
- random undersampling and oversampling
- undersampling and oversampling using imbalanced learn
- class weights in the models
- changing the evaluation metric

And according to this [link](https://elitedatascience.com/imbalanced-classes)
- upsample minority class
- down sample majority class
- change the performance metric
- penalize algorithms
- use tree based algorithms

And according to this website [machinelearningmastery](https://machinelearningmastery.com/tactics-to-combat-imbalanced-classes-in-your-machine-learning-dataset/), the author provides 8 tactics to combat imbalanced training data
1. Collect more data if possible
2. Try changing Performance Metric
   > Confusion Matrix
   > Precision
   > Recall
   > F1 Score (or F-score)
   Further advice to look at:
   > Kappa(or Cohen's kappa)
   > ROC Curves
3. Try resampling the dataset
4. Try generate synthetic samples
5. Try different Algorithms
    > decision trees often perform well on imbalanced datasets
6. Try Penalized Models
7. Try Different Perspective
    > anomaly detection and change detection

Since there are multiple methods we use few methods among them and evaluate the models

## Test1: Upsampling minority and down sampling majority class

In [None]:
from sklearn.utils import resample

In [None]:
# Split the training rows by target value: 0 goes to the majority frame, 1 to the minority frame.
target_values = strat_train_set.raintomorrow
df_majority = strat_train_set.loc[target_values.eq(0)]
df_minority = strat_train_set.loc[target_values.eq(1)]

Upsample

In [None]:
# Oversample the minority class (with replacement) until it matches the majority class.
# The target size is taken from df_majority rather than a hard-coded row count, so the
# cell stays correct if the data, the split ratio or the seed changes.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42,
)

df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Both classes should now show the same count.
df_upsampled.raintomorrow.value_counts()

Downsample

In [None]:
# Undersample the majority class (without replacement) down to the minority class size.
# The target size is taken from df_minority rather than a hard-coded row count, so the
# cell stays correct if the data, the split ratio or the seed changes.
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=42,
)

df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Both classes should now show the same count.
df_downsampled.raintomorrow.value_counts()

## Checking Correlation

In [None]:
# Correlate numeric columns only. The frames still contain object columns (date,
# location, wind directions), and DataFrame.corr() in recent pandas versions raises on
# non-numeric data instead of silently dropping it.
corr_matrix_up = df_upsampled.select_dtypes(include="number").corr()
corr_matrix_down = df_downsampled.select_dtypes(include="number").corr()

In [None]:
# Correlation of each column with the target in the upsampled data, largest value first.
corr_matrix_up['raintomorrow'].sort_values(ascending=False)

In [None]:
# Correlation of each column with the target in the downsampled data, largest value first.
corr_matrix_down['raintomorrow'].sort_values(ascending=False)

## Preparing the data for ML Algorithms

we will create X and y data for input features and output feature respectively for each sampled data

In [None]:
# Name of the column being predicted; every other column is kept as an input feature.
target_col = "raintomorrow"

# Upsampled training data.
up_X = df_upsampled.drop(columns=target_col)
up_y = df_upsampled[target_col].copy()

# Held-out test data.
test_X = strat_test_set.drop(columns=target_col)
test_y = strat_test_set[target_col].copy()

# Downsampled training data.
down_X = df_downsampled.drop(columns=target_col)
down_y = df_downsampled[target_col].copy()


## Data Cleaning
we will use SimpleImputer from scikit-learn to replace each attribute's missing values with the median of that attribute

In [None]:
from sklearn.impute import SimpleImputer

# One independent median imputer per resampled training set.
median_strategy = "median"
imputer_up = SimpleImputer(strategy=median_strategy)
imputer_down = SimpleImputer(strategy=median_strategy)

Since median can only be computed on numerical attributes, we need to create a copy of data without object type.

In [None]:
# Columns handled as categorical/text; everything else goes to the numeric frames.
non_numeric_cols = ["date", "location", "windgustdir", "winddir9am", "winddir3pm", "raintoday"]

up_X_num = up_X.drop(columns=non_numeric_cols)
down_X_num = down_X.drop(columns=non_numeric_cols)

up_X_cat = up_X[non_numeric_cols].copy()
down_X_cat = down_X[non_numeric_cols].copy()

Now we will fit the imputer instance to both the upsampled and downsampled data

In [None]:
# Fit each imputer on its own numeric frame (upsampled and downsampled are fitted separately).
imputer_up.fit(up_X_num)
imputer_down.fit(down_X_num)

In [None]:
# Column medians computed directly with pandas for both sample sets.
# NOTE(review): presumably a cross-check against the values the imputers learned — confirm.
print(up_X_num.median().values)
print("---------------------")
print(down_X_num.median().values)

Now we will use the trained imputer to transform the training set by replacing missing values with the learned medians

In [None]:
# Fill the missing values with each fitted imputer.
X_num_temp_up = imputer_up.transform(up_X_num)
X_num_temp_down = imputer_down.transform(down_X_num)

Since the results are plain Numpy array, we will create DF for the transformed data.


In [None]:
# Wrap the imputed values back into DataFrames, reusing the original column labels and row index.
up_X_num_tr = pd.DataFrame(X_num_temp_up, columns=up_X_num.columns, index=up_X_num.index)
down_X_num_tr = pd.DataFrame(X_num_temp_down, columns=down_X_num.columns, index=down_X_num.index)

In [None]:
# Inspect dtypes and non-null counts of the imputed upsampled frame.
up_X_num_tr.info()

## Feature Scaling
According to the book (Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow), there are two common ways to get all attributes to have the same scale:
i) min-max scaling
ii) standardization

since min max scaling is more affected by outliers we will use standardization here.
We will use StandardScaler provided by scikit-learn


In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the imputed numeric features: learn the scaling first, then apply it to the same data.
feat_scal_transformer = StandardScaler()
feat_scal_transformer.fit(up_X_num_tr)

temp1 = feat_scal_transformer.transform(up_X_num_tr)

In [None]:
# Display the scaled values.
temp1

## Handling missing values in categorical attributes

According to this [Link](https://medium.com/analytics-vidhya/ways-to-handle-categorical-column-missing-data-its-implementations-15dc4a56893), there are 3 ways to handle categorical data:
1. Frequent Categorical Imputation
> Here NaN values are replaced with the most frequently occurring category in the column.
2. Adding a variable to capture NaN
> Here NaN is again replaced by the most frequently occurring value, and a new feature is added that marks which observations were imputed and which were not.
3. Create a New Category for NaN Values
> here a new category is created for NaN values and added to all NaN values


Lets check the amount of NaN values to decide which method to use.

In [None]:
# Missing-value count for each categorical column.
up_X_cat.isnull().sum()

In [None]:
# Names of the columns in the categorical frame.
list(up_X_cat.columns)

In [None]:
# Total number of rows in the upsampled categorical frame, for judging the NaN counts.
# (This cell previously held the undefined name `serr`, which raised NameError on Run All.)
len(up_X_cat)

We have 176504 rows in total, and the column with the most NaN values has 12278 of them, i.e. around 7%. We therefore use the third method: every NaN value is replaced with the new category "Unknown".

In [None]:
from sklearn.impute import SimpleImputer

# Imputer that replaces every missing value with the constant string "Unknown".
add_unknown_imputer = SimpleImputer(strategy="constant", fill_value="Unknown")


In [None]:
# Fit the constant imputer on the categorical columns and apply it in one step.
temp = add_unknown_imputer.fit_transform(up_X_cat)

In [None]:
# Display the imputed result.
temp

## Transforming date to month only

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


# The transformer is defined in this cell because it is used here; the original order
# relied on a definition from a later cell, which fails on Restart & Run All.
class DateToMonthOnlyTransformer(BaseEstimator, TransformerMixin):
    """Reduce each date value to characters 5-6 of its string form (the 'MM' of 'YYYY-MM-DD')."""

    def fit(self, X, y=None):
        """Nothing is learned; return self."""
        return self

    def get_month(self, obj):
        """Return the slice [5:7] of ``str(obj)``."""
        return str(obj)[5:7]

    def transform(self, X, y=None):
        """Apply ``get_month`` through ``X.apply`` and return the resulting values array."""
        return X.apply(self.get_month).values


dtm_only_trans = DateToMonthOnlyTransformer()
temp_dtm_trans = dtm_only_trans.fit_transform(up_X_cat.date)

### Old method

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class DateToMonthOnlyTransformer(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the month part of date-like values.

    Each value is converted with ``str`` and characters 5-6 are kept,
    i.e. the 'MM' of a 'YYYY-MM-DD' string.
    """

    def __init__(self):
        # No hyper-parameters to store.
        pass

    def fit(self, X, y=None):
        """No fitting is needed; return the transformer unchanged."""
        return self

    def get_month(self, obj):
        """Return the slice [5:7] of the string form of ``obj``."""
        text = str(obj)
        return text[5:7]

    def transform(self, X, y=None):
        """Apply ``get_month`` through ``X.apply`` and return the underlying values array."""
        month_values = X.apply(self.get_month)
        return month_values.values


In [None]:
def impute_nan_create_category(DataFrame, ColName):
    """Replace the missing entries of one column with the literal category "Unknown".

    The column ``ColName`` of ``DataFrame`` is overwritten in place; nothing is returned.
    """
    column_values = DataFrame[ColName]
    is_present = column_values.notnull()
    # Keep existing values where present; use the placeholder everywhere else.
    DataFrame[ColName] = np.where(is_present, column_values, "Unknown")


In [None]:
# Fill the missing values of every categorical column, one column at a time (in place).
columns = up_X_cat.columns
for column in columns:
    impute_nan_create_category(up_X_cat, column)

In [None]:
# Re-count the missing values per column after the fill above.
up_X_cat.isnull().sum()

### Handling text and categorical attributes

In [None]:
# Preview the first 10 rows of the categorical frame.
up_X_cat.head(10)

Lets count the no. of categories in each columns except date and raintoday

In [None]:
# Print the number of distinct values (missing values counted too) for each listed column.
caption_and_column = (
    ("Location: ", "location"),
    ("WindGustDir: ", "windgustdir"),
    ("WindDir9am: ", "winddir9am"),
    ("WindDir3pm: ", "winddir3pm"),
)
for caption, col_name in caption_and_column:
    print(caption, up_X_cat[col_name].nunique(dropna=False))

Lets convert the categorical data using OneHotEncoderTransform

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encoder with default settings.
# NOTE(review): this instance is not referenced again in the visible cells.
onehot_encoder = OneHotEncoder()

We will add one hot encoding process in the pipeline for only location and other direction categories too.

We try to get only the month from the date. So for this we will create an custom transformer.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
class DateToMonthOnlyTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that reduces date values to their month part.

    Every value is converted with ``str`` and characters 5-6 are kept, i.e. the 'MM'
    of a 'YYYY-MM-DD' string.

    Both input shapes are supported:
    * 1-D input (e.g. a pandas Series) returns a 1-D array of month strings;
    * 2-D input (e.g. the single-column frame a ColumnTransformer passes for
      ``['date']``) returns a 2-D array of the same shape.

    The earlier version called ``X.apply`` directly, which works column-by-column on a
    DataFrame and therefore sliced the string form of a whole column. It also printed
    its input and output on every call; those debug prints are removed.
    """

    def __init__(self):
        # No hyper-parameters to store.
        pass

    def fit(self, X, y=None):
        """Nothing is learned; return self so the transformer can be used in a Pipeline."""
        return self

    def get_month(self, obj):
        """Return the slice [5:7] of ``str(obj)``."""
        return str(obj)[5:7]

    def transform(self, X, y=None):
        """Map every element of ``X`` to its month string, preserving 1-D or 2-D shape."""
        if getattr(X, "ndim", 1) == 2:
            # Element-wise mapping, one column at a time, so each cell is sliced on its own.
            frame = pd.DataFrame(X)
            return frame.apply(lambda col: col.map(self.get_month)).to_numpy()
        months = pd.Series(X).map(self.get_month)
        return np.array(months.tolist())

In [None]:
# Instance of the custom date-to-month transformer.
date_converter = DateToMonthOnlyTransformer()

In [None]:
# Run the transformer on the date column and print the array it returns.
print(date_converter.transform(up_X_cat.date))

## Pipelines
We are going to create the pipeline for the data cleaning step.
We will have a separate transformer for numerical data where:
* We use simple imputer with mean strategy for filling the NaN values

Another transformer will be for the categorical data where:
* We will use Simple imputer with strategy constant and fill Unknown for NaN values.
* We use OneHotEncoder for the columns with direction data.
* We use OrdinalEncoder for the location since the number of locations are too high and one hot encoder will have the vector of high dimension.
* We also use the custom transformer we created to change the date into only month.

In [None]:
# Display the categorical feature frame.
up_X_cat

## Pipeline for numerical data

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [None]:
# Numerical columns sent through the numeric pipeline.
# 'risk_mm' is deliberately left out.
# NOTE(review): in the weatherAUS data RISK_MM appears to be the next day's rainfall
# amount, from which RainTomorrow is derived, so using it as a feature would leak the
# target. Confirm against the dataset documentation.
num_attributes = ['mintemp', 'maxtemp', 'rainfall', 'evaporation', 'sunshine',
       'windgustspeed', 'windspeed9am', 'windspeed3pm', 'humidity9am',
       'humidity3pm', 'pressure9am', 'pressure3pm', 'cloud9am', 'cloud3pm',
       'temp9am', 'temp3pm']
# Wind-direction columns sent through the categorical pipeline.
cat_attributes = ['windgustdir', 'winddir9am', 'winddir3pm']

In [None]:
# Numeric preprocessing: fill missing values with the column mean, then standardise.
numerical_trans = Pipeline(steps=[
    ('simple_imputer', SimpleImputer(strategy="mean")),
    ('standard_scaler', StandardScaler()),
])

In [None]:
# numerical_trans = ColumnTransformer([
#     ('simple_imputer', SimpleImputer(strategy="mean"), num_attributes),
#     ('standard_scaler', StandardScaler(), num_attributes)],
#     remainder="passthrough"
# )


## Pipeline for categorical data

In [None]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Categorical preprocessing: fill missing values with "Unknown", then encode as ordinal integers.
categorical_trans = Pipeline(steps=[
    ('impute_unknown_for_nan', SimpleImputer(strategy="constant", fill_value="Unknown")),
    # A one-hot step ('one_hot_dir', OneHotEncoder()) was tried here and is disabled.
    ('ordinal_cat', OrdinalEncoder()),
])

# Ordinal encoding only, with no imputation step.
cat_loc_trans = Pipeline(steps=[('ordinal_loc', OrdinalEncoder())])

# Applies the custom date-to-month transformer to the 'date' column.
cat_date_trans = ColumnTransformer(transformers=[
    ('date_transformer', DateToMonthOnlyTransformer(), ['date']),
])

In [None]:
# from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# categorical_trans = ColumnTransformer(
#     [('impute_unknown_for_nan', SimpleImputer(strategy="constant", fill_value="Unknown"), cat_attributes),
# #     ('one_hot_dir', OneHotEncoder(), cat_attributes),
#      ('ordinal_cat', OrdinalEncoder(), cat_attributes)
#     ],
#     remainder="passthrough"
# )
# cat_loc_trans = ColumnTransformer([
#     ('ordinal_loc', OrdinalEncoder(), ['location']),],
#     remainder="passthrough"
# )

# cat_date_trans = ColumnTransformer([
#     ('date_transformer', DateToMonthOnlyTransformer(), ['date'])],
#     remainder="passthrough"
#     )

In [None]:
# Route each group of columns to its own preprocessing pipeline.
# Columns not listed here (e.g. 'date'; its transformer is currently disabled) are
# discarded by remainder='drop'.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", numerical_trans, num_attributes),
        ("cat", categorical_trans, cat_attributes),
        ("loc", cat_loc_trans, ['location']),
    ],
    remainder='drop',
)

## Train and Evaluate on the Training Set

The pipeline and grid-search setup below follows [this article](https://drgabrielharris.medium.com/python-how-scikit-learn-0-20-optimal-pipeline-and-best-practices-dc4dd94d2c09).

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with a fixed seed, chained after the shared preprocessing step.
tree = DecisionTreeClassifier(random_state=42)

pipeline_tree = Pipeline([
    ('full', full_pipeline),
    ('classifier', tree),
])

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid; the 'classifier__' prefix addresses the pipeline step named 'classifier'.
params = {'classifier__criterion': ['entropy', 'gini'],
          'classifier__max_depth': [5,6,7],
          'classifier__min_samples_leaf': [4,5,6]}

# Search over the decision-tree pipeline. The original passed `pipeline`, a name that is
# never defined in this notebook, so the cell raised NameError.
classifier_gs = GridSearchCV(pipeline_tree, params, scoring='roc_auc', cv=5, verbose=1)

In [None]:
# Run the grid search on the upsampled training features and labels.
classifier_gs.fit(up_X, up_y)

In [None]:
# Predict labels for the held-out test features with the fitted grid search.
ypred = classifier_gs.predict(test_X)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Classification report (true labels first, predictions second).
print(classification_report(test_y, ypred))
# Confusion matrix for the same true labels and predictions.
print(confusion_matrix(test_y, ypred))

now use [this](https://www.kdnuggets.com/2018/01/managing-machine-learning-workflows-scikit-learn-pipelines-part-3.html) to extend for many other classifiers.

Using [this site](https://www.kdnuggets.com/2018/01/managing-machine-learning-workflows-scikit-learn-pipelines-part-3.html) for training using multiple models


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


In [None]:
# Decision tree with a fixed seed, chained after the shared preprocessing step.
tree = DecisionTreeClassifier(random_state=42)

pipeline_tree = Pipeline([
    ('full', full_pipeline),
    ('classifier', tree),
])

In [None]:
# Logistic regression with a fixed seed, chained after the shared preprocessing step.
lr = LogisticRegression(random_state=42)

pipeline_lr = Pipeline([
    ('full', full_pipeline),
    ('classifier', lr),
])

In [None]:
# Random forest with a fixed seed, chained after the shared preprocessing step.
rf = RandomForestClassifier(random_state=42)

pipeline_rf = Pipeline([
    ('full', full_pipeline),
    ('classifier', rf),
])

In [None]:
# Bind the estimator to `svc`, not `svm`. Rebinding `svm` replaced the imported
# sklearn.svm module with an SVC instance, so running this cell a second time failed
# on `svm.SVC`.
svc = svm.SVC(random_state=42)

pipeline_svm = Pipeline(steps=[('full', full_pipeline),
                               ('classifier', svc)])

In [None]:
# Set grid search params.
# Every key uses the 'classifier__' prefix because the estimator step in each pipeline
# is named 'classifier'. The random-forest and SVM grids used 'clf__', which matches no
# step and makes GridSearchCV reject the parameters.

param_range = [1,2,3,4,5,6,7,8,9,10]

grid_params_dec_tree = [{'classifier__criterion': ['entropy', 'gini'],
          'classifier__max_depth': [5,6,7],
          'classifier__min_samples_leaf': [4,5,6]}]

grid_params_lr = [{'classifier__penalty': ['l1', 'l2'],
        'classifier__solver': ['liblinear']}]

grid_params_rf = [{'classifier__criterion': ['gini', 'entropy'],
        'classifier__min_samples_leaf': param_range,
        'classifier__max_depth': param_range,
        # min_samples_split starts at 2, hence the slice.
        'classifier__min_samples_split': param_range[1:]}]

grid_params_svm = [{'classifier__kernel': ['linear', 'rbf'],
        'classifier__C': param_range}]

In [None]:
# Construct the grid search objects (one per model pipeline).
jobs = -1  # passed as n_jobs to the random-forest and SVM searches only

# Every search is scored with ROC AUC.
shared_kwargs = {'scoring': 'roc_auc'}

gs_tree = GridSearchCV(pipeline_tree, grid_params_dec_tree, cv=5, **shared_kwargs)

gs_lr = GridSearchCV(pipeline_lr, grid_params_lr, cv=10, **shared_kwargs)

gs_rf = GridSearchCV(pipeline_rf, grid_params_rf, cv=10, n_jobs=jobs, **shared_kwargs)

gs_svm = GridSearchCV(pipeline_svm, grid_params_svm, cv=10, n_jobs=jobs, **shared_kwargs)

In [None]:
# List of grid searches for ease of iteration
grids = [gs_tree, gs_lr, gs_rf, gs_svm]

# Position in `grids` -> human-readable model name
grid_dict = {0: 'Decision Tree', 1: 'Logistic Regression', 2: 'Random Forest',
             3: 'Support Vector Machine'}

# Fit every grid search and remember the one with the highest test-set accuracy.
print('Performing model optimizations...')
best_acc = 0.0
best_clf = 0
best_gs = None  # no fitted search selected yet
for idx, gs in enumerate(grids):
    print('\nEstimator: %s' % grid_dict[idx])
    # Fit grid search
    gs.fit(up_X, up_y)
    # Best params
    print('Best params: %s' % gs.best_params_)
    # The searches are scored with 'roc_auc', so best_score_ is a cross-validated
    # ROC AUC, not a training accuracy; the label now says so.
    print('Best cross-validated ROC AUC: %.3f' % gs.best_score_)
    # Predict on test data with best params
    y_pred = gs.predict(test_X)
    # Compute the test accuracy once and reuse it for printing and comparison.
    test_acc = accuracy_score(test_y, y_pred)
    print('Test set accuracy score for best params: %.3f ' % test_acc)
    # Track best (highest test accuracy) model
    if test_acc > best_acc:
        best_acc = test_acc
        best_gs = gs
        best_clf = idx
print('\nClassifier with best test set accuracy: %s' % grid_dict[best_clf])


In [None]:
# Parameter names of the bare LogisticRegression estimator.
# NOTE(review): inside the pipelines these names need the 'classifier__' step prefix.
lr.get_params().keys()