In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# This function is needed later when evaluating the classifier's results.
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Plot a confusion matrix as an annotated heatmap on a new figure.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix. Rows are drawn along the 'True label' axis and
        columns along the 'Predicted label' axis.
    classes : sequence
        Tick labels, one per class, in the same order as the rows/columns of `cm`.
    normalize : bool, default False
        If True, every row is divided by its row sum and rounded to two
        decimals, so each cell shows a per-true-class rate instead of a count.
        Rows without any samples are shown as 0 instead of NaN.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heatmap.
    """
    # Accept plain nested lists as well as numpy arrays.
    cm = np.asarray(cm)
    fig = plt.figure()

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero; a class without samples
        # would otherwise produce NaNs (and a NaN colour threshold below).
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        cm = np.around(cm, decimals=2)

    # Cells darker than half the maximum get white text, the others black text.
    thresh = cm.max() / 2.

    plt.imshow(cm, interpolation='nearest', cmap=cmap)

    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # imshow puts row i on the y axis and column j on the x axis, hence text(j, i, ...).
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout last so that the axis labels are included in the computation.
    fig.tight_layout()

### ILV01.0 Read data into pandas dataframe

DataFrames are two-dimensional in nature:
- They are organized in a row/column structure, just like a spreadsheet.
- The main benefit of using a DataFrame: it can handle much larger data than most common spreadsheet software.

ILV01.0 a) Specify path to the CSV containing weather data from Australia and load it into memory using pandas.


In [4]:
# Load the raw weather observations from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='datasets/weatherAUS.csv')

ILV01.0 b) Access some rows to get an idea of the data's structure.

In [0]:
# TODO: use functions 'head', 'sample' or 'tail' to display data


### ILV01.1 Preprocessing
#### Try to extract useful information about the dataset using tools you already got to know during the Feature Engineering course.

ILV01.1 a) How many rows/columns are in this dataset?

In [0]:
# TODO: use attribute 'shape' of dataframe to access dimensions, save original shape for later use


ILV01.1 b) Examine datatypes ('dtypes' in pandas) of the columns in your dataset.

In [0]:
# TODO: use function 'info' or attribute 'dtypes'

ILV01.1 c) Examine statistics of the columns in your dataset. Print mean, std, min and max for each column.

In [0]:
# TODO: use function 'describe' without parameters to examine statistical characteristics of numeric columns

In [0]:
# TODO: use the parameter include of the 'describe' function to print statistics of other datatypes

#### Missing Values

ILV01.1 d) Drop columns that contain > 1/3 NaN values.

In [0]:
# TODO: Display the number of NaNs in each column with function 'isna'

In [0]:
# TODO: Iterate over columns in dataframe to drop each column that contains >1/3 NaNs.

In [0]:
# TODO: Display the number of NaNs in each column with function 'isna'

In [0]:
# TODO: use attribute 'shape' of dataframe to access dimensions

ILV01.1 e) Choose what to do about the remaining number of NaNs:

- Option 0: Decide on a strategy to fill missing values. Be aware: columns have to be treated individually to account for numeric and non-numeric datatypes.
- Option 1: Drop rows containing missing values.

In [0]:
# TODO: decide which option you want to choose.
# 0: iterate over the columns and check whether each column is numeric.
#    - numeric column: use function 'fillna' to replace every missing value with the mean of that column.
#    - non-numeric column: use the pandas class 'Categorical' and its attribute 'codes' to retrieve a
#      numerical representation of the column (note: 'codes' represents missing values as -1).
# 1: drop each row that contains NaNs by using function 'dropna' with the correct axis
option = 1


In [0]:
# TODO: Display the number of NaNs in each column with function 'isna'

In [0]:
# TODO: use attribute 'shape' of dataframe to access dimensions

ILV01.1 f) Drop RISK_MM, as it contains the amount of rain (in millimeters) that was recorded on the next day, which gives away too much information to your model. Read more here: https://www.kaggle.com/jsphyg/weather-dataset-rattle-package/discussion/78316

In [0]:
# TODO: use function 'drop' with its 'columns' parameter to drop specific columns by name.


In [0]:
# TODO: use attribute 'shape' of dataframe to access dimensions


ILV01.1 g) Encode remaining non-numerical values into categories (integers).

In [0]:
# TODO: use pandas Class 'Categorical' and its attribute 'codes' to retrieve a numerical representation of the column.
# Do this only for non-numeric columns in the dataframe.

In [0]:
# TODO: print head & info of dataframe to check contents and datatypes of dataframe!

In [0]:
# Print column names, non-null counts and dtypes to check the result of the preprocessing steps
df.info()

#### Extract Labels

ILV01.1 h) Construct X and y, where X contains all numeric columns that are possible feature candidates and y denotes the corresponding labels.

In [0]:
# TODO: Construct X and Y

# TODO: drop column 'RainTomorrow' after saving it in y

# check how many samples there are for each class:
# NOTE(review): assumes y is a 0/1 vector with 1 = rain on the next day, so that
# sum(y) counts the rainy days — confirm against your encoding.
n_samples = len(y)
n_rain = np.sum(y)
print("# of samples: {:d}".format(n_samples))
# '{:.2%}' multiplies the fraction by 100 and appends '%', so the printed value
# actually is the percentage that the label announces (previously a 0..1 fraction).
print("Percentage of days without rain on next day: {:.2%}".format((n_samples - n_rain) / n_samples))
print("Percentage of days with rain on next day: {:.2%}".format(n_rain / n_samples))

### ILV01.2 EDA

ILV01.2 a) Draw plots of your dataset to further examine its properties.

In [0]:
# Box plot of every column of X on one shared axis
fig, ax = plt.subplots(figsize=(20, 10))
sns.boxplot(data=X, ax=ax)
sns.despine(fig=fig, offset=0, trim=True)
# Rotate the column names so they stay readable; trailing ';' suppresses the repr output
plt.xticks(rotation=45, horizontalalignment='right', fontweight='light');

In [0]:
# Violin plot of every column of X on one shared axis
fig, ax = plt.subplots(figsize=(20, 10))
sns.violinplot(data=X, ax=ax)
sns.despine(fig=fig, offset=0, trim=True)
# Rotate the column names so they stay readable; trailing ';' suppresses the repr output
plt.xticks(rotation=45, horizontalalignment='right', fontweight='light');

In [0]:
# Pairwise correlation of all columns of X, drawn as an annotated heatmap
corr = X.corr()
column_labels = corr.columns.values
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr, xticklabels=column_labels, yticklabels=column_labels, annot=True, ax=ax)
sns.despine(fig=fig, offset=10, trim=True)
# Rotate the column names so they stay readable; trailing ';' suppresses the repr output
plt.xticks(rotation=45, horizontalalignment='right', fontweight='light');

In [0]:
# sns.pairplot creates its own figure (it returns a PairGrid), so calling
# plt.figure() beforehand would only emit an additional, empty figure.
# The size is set via `height` (inches per facet): 4 columns * 5 = 20 inches.
pair_grid = sns.pairplot(data=X[['MinTemp', 'Temp9am', 'Rainfall', 'Pressure9am']], height=5)
sns.despine(offset=0,
            trim=True)
# plt.xticks would only style the current axes; apply the tick label style to every facet.
for ax in pair_grid.axes.flat:
    plt.setp(ax.get_xticklabels(),
             rotation=45,
             horizontalalignment='right',
             fontweight='light')

ILV01.2 b) Based on the information presented in ILV01.2 a), decide which columns to drop to further reduce dimensionality of the dataset.

In [0]:
# Enter the names of the columns you decided to drop in the list below.
# While the list is empty, no column is removed from X.
X = X.drop(columns=[
    #'col_1', 'col_2', ...
])

### ILV01.3 Sampling

ILV01.3 a) Split your dataset into a train set and test set to be able to train and evaluate a classifier.

In [0]:
from sklearn.model_selection import train_test_split

# TODO: Split the dataset into training and test samples

### ILV01.4 Training & Testing

ILV01.4 a) Train a Naive Bayes classifier using your train data.

In [0]:
from sklearn.naive_bayes import GaussianNB

# TODO: Train a Naive Bayes classifier

ILV01.4 b) Feed the testing data into the classifier and retrieve predictions.

In [0]:
# TODO: Predict the test data with the trained classifier

### ILV01.5 Evaluation

ILV01.5 a) Use different metrics to evaluate the training result.

In [0]:
from sklearn.metrics import accuracy_score

# TODO: replace each `...` placeholder below with your own computation.
# The print statements at the end of this cell format the values as numbers
# ({:d} / {:.2f}) and raise an error as long as a placeholder is left in place.
# retrieve the total number of points
total_points = ...
# retrieve the number of mislabeled points
mislabeled_points = ...
# retrieve the accuracy score of your predictions
acc_score = ...

print("Number of mislabeled points out of a total {:d} points: {:d}".format(total_points, mislabeled_points))
print("This results in an accuracy score of {:.2f}".format(acc_score))

In [0]:
from sklearn.metrics import confusion_matrix

# TODO: Use sklearn.metrics.confusion_matrix to retrieve the confusion matrix 
# for your predictions and use the custom defined function plot_confusion_matrix(...) to print it


### ILV01.6 Back to the Drawing Board ...

ILV01.6 a) Revisit EDA & Feature Engineering to further improve your result! Try different settings (e.g. scale features, drop more columns, ... )

First off, let's try scaling each column to its z-score by using sklearn's `StandardScaler`.

In [0]:
from sklearn.preprocessing import StandardScaler

# StandardScaler defaults are copy=True, with_mean=True, with_std=True (z-score per column).
# NOTE(review): the scaler is fitted on all of X, i.e. before X is split into train and
# test data further down, so statistics of the later test samples influence the scaling.
# Consider fitting on the training split only.
scaler = StandardScaler()

# fit_transform returns a plain array; rebuild a DataFrame with the columns and index of X
scaled_values = scaler.fit_transform(X.values)
X = pd.DataFrame(scaled_values, columns=X.columns, index=X.index)

In [0]:
# Box plot of every column of X on one shared axis
fig, ax = plt.subplots(figsize=(20, 10))
sns.boxplot(data=X, ax=ax)
sns.despine(fig=fig, offset=0, trim=True)
# Rotate the column names so they stay readable; trailing ';' suppresses the repr output
plt.xticks(rotation=45, horizontalalignment='right', fontweight='light');

Resample, retrain and repredict:

In [0]:
# Hold out 33% of the samples for testing; stratify=y is passed so the split is stratified on the labels.
# A fixed random_state makes the split (and every score derived from it) reproducible
# across kernel restarts; random_state=None produced a different split on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

# Train a Gaussian Naive Bayes classifier on the training split and predict the test split
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

Reevaluate:

In [0]:
# TODO: replace each `...` placeholder below with your own computation.
# The print statements at the end of this cell format the values as numbers
# ({:d} / {:.2f}) and raise an error as long as a placeholder is left in place.
# retrieve the total number of points
total_points = ...
# retrieve the number of mislabeled points
mislabeled_points = ...
# retrieve the accuracy score of your predictions
acc_score = ...

print("Number of mislabeled points out of a total {:d} points: {:d}".format(total_points, mislabeled_points))
print("This results in an accuracy score of {:.2f}".format(acc_score))

In [0]:
from sklearn.metrics import confusion_matrix

# TODO: Use sklearn.metrics.confusion_matrix to retrieve the confusion matrix 
# for your predictions and use the custom defined function plot_confusion_matrix(...) to print it


Now, let's try dropping insignificant features (use correlation heatmap to determine possible columns to drop):

In [0]:
# Pairwise correlation of all columns of X, drawn as an annotated heatmap
corr = X.corr()
column_labels = corr.columns.values
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr, xticklabels=column_labels, yticklabels=column_labels, annot=True, ax=ax)
sns.despine(fig=fig, offset=10, trim=True)
# Rotate the column names so they stay readable; trailing ';' suppresses the repr output
plt.xticks(rotation=45, horizontalalignment='right', fontweight='light');

In [0]:
# TODO: Determine which columns you want to drop and add them to the following array
columns_to_drop = [
    ...
]

# Only names that are actually present in X are dropped, so entries that do not
# match a column (including the untouched placeholder) are ignored.
present_columns = [name for name in columns_to_drop if name in X.columns]
if present_columns:
    X = X.drop(columns=present_columns)

Once more: resample, retrain, retest, reevaluate ...

In [0]:
# NOTE: every `...` in this cell is a placeholder; the cell raises an error until
# all of them have been replaced (the first line cannot unpack a placeholder into
# four names, and the print statements expect numbers).

# TODO: Split the dataset into training and test samples
X_train, X_test, y_train, y_test = ...

gnb = GaussianNB()

# TODO: Train the classifier with your data

# TODO: Use the trained classifier to call the predict function on the test set, save your results to y_pred
y_pred = ...

# TODO: 
# retrieve the total number of points
total_points = ...
# retrieve the number of mislabeled points
mislabeled_points = ...
# retrieve the accuracy score of your predictions
acc_score = ...

print("Number of mislabeled points out of a total {:d} points: {:d}".format(total_points, mislabeled_points))
print("This results in an accuracy score of {:.2f}".format(acc_score))

In [0]:
# Row-normalized confusion matrix of y_pred against y_test
class_names = ['No Rain Tomorrow', 'Rain Tomorrow']
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

plot_confusion_matrix(
    cm=cm,
    classes=class_names,
    normalize=True,
    title='Normalized Confusion matrix',
    cmap=plt.cm.Blues,
)

Now we see that there is only so much we can do when it comes to optimizing features for a simple classifier such as Naive Bayes. Since we have to deal with the imbalance of classes in this dataset (see ILV01.1 h)), let's try to optimize for minimum risk instead.
Furthermore, let's say we want to make sure we identify rainy days with higher certainty. How would we do that?

### ILV01.7 Naive Bayes Minimum Risk Classifier

Multiplying the probabilities put out by a Naive Bayes classifier with a cost matrix leads to a Minimum Risk classifier.
With this type of classifier, one can optimize for distinct metrics: specificity and sensitivity (among others).

According to Duda et al:

Decide for $C_1$ if $(\lambda_{21} - \lambda_{11}) \cdot P[C_1 | \vec{x}] > (\lambda_{12} - \lambda_{22}) \cdot P[C_2 | \vec{x}]$, otherwise decide for $C_2$.

Cost matrix $\lambda$:
$\begin{bmatrix} 
0 & 1 \\
1 & 0 
\end{bmatrix}$ =
$\begin{bmatrix} 
TP & FP \\
FN & TN 
\end{bmatrix}$=
$\begin{bmatrix} 
(0,0) & (0,1) \\
(1,0) & (1,1) 
\end{bmatrix}$ $\bigg($ Duda et al.: $=
\begin{bmatrix} 
(1,1) & (1,2) \\
(2,1) & (2,2) 
\end{bmatrix}\bigg)$


ILV01.7 a) Let's say it is of great importance to correctly predict rainy days.

How would you have to design the cost matrix to achieve this goal?

In [0]:
# Fit a fresh Gaussian Naive Bayes model and fetch the class probabilities of the test set.
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# One row per test sample, one column per class (column order follows gnb.classes_).
# NOTE(review): the code below treats column 0 as "no rain" and column 1 as "rain" —
# confirm this against your label encoding.
predicted_probabilities = gnb.predict_proba(X_test)

# cost_matrix[i, j] is the cost of deciding class i when class j is true, where
# index 0 = rain (C_1) and index 1 = no rain (C_2), i.e. the layout [[TP, FP], [FN, TN]].
# Here a false positive (rain predicted on a dry day) costs 100 times as much as a
# false negative, so rain is only predicted when the classifier is very sure.
cost_matrix = np.array([
    [0, 100],
    [1, 0]]
)
print('Cost Matrix:')
print(cost_matrix)

# Decision rule (Duda et al.): decide "rain" if
#   (lambda_21 - lambda_11) * P[rain | x] > (lambda_12 - lambda_22) * P[no rain | x]
# Both sides use the signed cost difference exactly as in the rule; the former abs()
# on one side only would silently change the decision for cost matrices in which a
# correct decision is more expensive than a wrong one.
p_bayes_risk_rain = (cost_matrix[1, 0] - cost_matrix[0, 0]) * predicted_probabilities[:, 1]
p_bayes_risk_no_rain = (cost_matrix[0, 1] - cost_matrix[1, 1]) * predicted_probabilities[:, 0]

# 1 = rain predicted, 0 = no rain predicted
y_pred_minrisk = np.where(p_bayes_risk_rain > p_bayes_risk_no_rain, 1, 0)

acc_minrisk = accuracy_score(y_test, y_pred_minrisk)

print("Number of mislabeled points out of a total {:d} points: {:d}".format(X_test.shape[0], (y_test != y_pred_minrisk).sum()))
print("This results in an accuracy score of {:.2f}".format(acc_minrisk))

In [0]:
class_names = ['No Rain Tomorrow', 'Rain Tomorrow']

# Confusion matrix of the minimum-risk predictions against the true test labels
cm = confusion_matrix(y_test, y_pred_minrisk)

# normalize=True plots row-wise rates instead of counts, so the title says so
# (consistent with the normalized plot of the plain Naive Bayes predictions).
plot_confusion_matrix(cm, class_names,
                      normalize=True,
                      title='Normalized Confusion matrix (minimum risk)',
                      cmap=plt.cm.Blues)

### Conclusion

To conclude this first session, let's talk about the definition of specificity and sensitivity:

#### Sensitivity
Sensitivity (true positive rate (TPR)) refers to the ability to correctly predict rainy days as being rainy. A high-sensitivity test is reliable when its result is negative, since it rarely predicts a day to be dry that turns out to be rainy. A classifier with 100% sensitivity will recognize all rainy days in a given timeframe. Sensitivity by definition does not take into account false positives, hence a positive prediction alone cannot be used to confidently rule in rain.

#### Specificity
Specificity (true negative rate (TNR)) measures the proportion of actual negatives that are correctly identified as such (e.g., the percentage of dry days that are correctly identified as having no rainfall). A positive result from a classifier with high specificity is useful to determine days that are certainly going to be wet. The classifier rarely predicts days as rainy that are going to be dry.