<a href="https://colab.research.google.com/github/maleehahassan/HIDA_Into_to_DL/blob/main/10_feedforward_squirrels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [198]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from tensorflow import keras
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix

sns.set_theme(style='white')

In [2]:
# %matplotlib inline

First, download a dataset about squirrel sightings in New York City.

In [3]:
# Read the CSV export of the squirrel census straight from its URL using pandas.
# The `$limit=5000` query parameter presumably caps the number of returned rows;
# the 3023 rows loaded below stay under that cap.
url = 'https://data.cityofnewyork.us/resource/vfnx-vebw.csv?$limit=5000'
df = pd.read_csv(url)
# (rows, columns) of the loaded table
df.shape

(3023, 31)

In [4]:
df.head()

Unnamed: 0,x,y,unique_squirrel_id,hectare,shift,date,hectare_squirrel_number,age,primary_fur_color,highlight_fur_color,...,kuks,quaas,moans,tail_flags,tail_twitches,approaches,indifferent,runs_from,other_interactions,geocoded_column
0,-73.956134,40.794082,37F-PM-1014-03,37F,PM,10142018,3,,,,...,False,False,False,False,False,False,False,False,,POINT (-73.9561344937861 40.7940823884086)
1,-73.968857,40.783783,21B-AM-1019-04,21B,AM,10192018,4,,,,...,False,False,False,False,False,False,False,False,,POINT (-73.9688574691102 40.7837825208444)
2,-73.974281,40.775534,11B-PM-1014-08,11B,PM,10142018,8,,Gray,,...,False,False,False,False,False,False,False,False,,POINT (-73.97428114848522 40.775533619083)
3,-73.959641,40.790313,32E-PM-1017-14,32E,PM,10172018,14,Adult,Gray,,...,False,False,False,False,False,False,False,True,,POINT (-73.9596413903948 40.7903128889029)
4,-73.970268,40.776213,13E-AM-1017-05,13E,AM,10172018,5,Adult,Gray,Cinnamon,...,False,False,False,False,False,False,False,False,,POINT (-73.9702676472613 40.7762126854894)


In [5]:
df.describe(include='all')

Unnamed: 0,x,y,unique_squirrel_id,hectare,shift,date,hectare_squirrel_number,age,primary_fur_color,highlight_fur_color,...,kuks,quaas,moans,tail_flags,tail_twitches,approaches,indifferent,runs_from,other_interactions,geocoded_column
count,3023.0,3023.0,3023,3023,3023,3023.0,3023.0,2902,2968,1937,...,3023,3023,3023,3023,3023,3023,3023,3023,240,3023
unique,,,3018,339,2,,,3,3,10,...,2,2,2,2,2,2,2,2,197,3023
top,,,7D-PM-1010-01,14D,PM,,,Adult,Gray,Cinnamon,...,False,False,False,False,False,False,False,False,runs from (dog),POINT (-73.9561344937861 40.7940823884086)
freq,,,2,32,1676,,,2568,2473,767,...,2921,2973,3020,2868,2589,2845,1569,2345,9,1
mean,-73.967184,40.780853,,,,10119490.0,4.123718,,,,...,,,,,,,,,,
std,0.007726,0.010285,,,,42466.71,3.096492,,,,...,,,,,,,,,,
min,-73.981159,40.764911,,,,10062020.0,1.0,,,,...,,,,,,,,,,
25%,-73.973102,40.771676,,,,10082020.0,2.0,,,,...,,,,,,,,,,
50%,-73.968594,40.778166,,,,10122020.0,3.0,,,,...,,,,,,,,,,
75%,-73.960189,40.791219,,,,10142020.0,6.0,,,,...,,,,,,,,,,


## Exploratory data analysis
Inspect the dataset some more, using in particular `dtypes` or `describe`. Find out how often the different behaviours were observed!

In [None]:
# using dtypes

In [188]:
# using pandas describe

If you want, you can draw a `pairplot` – but it will be very big!

Draw sighting locations using `sns.relplot` or `sns.jointplot`, then explore the distributions of a few interesting features and the `hue` parameter.

In [None]:
# the primary_fur_color is probably interesting
sns.relplot(data=df,
            ...,
            )

In [None]:
# or the shift


In [None]:
# a jointplot also shows the marginal distributions
sns.jointplot(
    ...
              )

## Feature Cleaning / Encoding
This dataset definitely needs some cleaning. It has a mix of categorical, boolean and numerical values, a lot of `NA`s, and some string columns that contain only rare free-text annotations.

In [218]:
def prepare_dataset(df: pd.DataFrame,
                    target: str = 'primary_fur_color',
                    ) -> tuple[np.ndarray, np.ndarray, list[str], OneHotEncoder]:
    """Turn the raw sightings table into model-ready feature and target arrays.

    Rows without a value in ``target`` are dropped. The boolean behaviour
    columns are converted to 0/1 integers (missing values count as ``False``)
    and the categorical columns are one-hot encoded. The target column itself
    is never used as a feature. The number of remaining rows is printed.

    Args:
        df: raw dataset; it is copied, the caller's frame is left untouched.
        target: name of the column to predict.

    Returns:
        X: dense 2-D feature array, boolean columns first, then the one-hot
            encoded categorical columns.
        y: dense one-hot encoding of the target column.
        feature_names: names of the columns of ``X``, in column order.
        y_encoder: the fitted target encoder; ``y_encoder.categories_[0]``
            lists the classes and ``inverse_transform`` maps back to labels.

    Raises:
        ValueError: if ``target`` is not a column of ``df``.
    """
    # make a copy so we don't modify the original dataframe
    df = df.copy()

    if target not in df.columns:
        raise ValueError(f'column {target} not found in dataset!')

    # drop rows where the target column isn't defined
    df = df.dropna(subset=[target])
    print(f'{len(df)} rows remaining')

    # not all columns can be used as training features.
    # we also treat categoricals and booleans differently.
    cat_cols = ['age', 'hectare', 'shift']
    bool_cols = [
        'running', 'chasing', 'climbing', 'eating', 'foraging',
        'kuks', 'quaas', 'moans', 'tail_flags', 'tail_twitches',
        'approaches', 'indifferent', 'runs_from',
    ]

    # keep only columns that exist, and never leak the target into the features
    cat_cols = [c for c in cat_cols if c in df.columns and c != target]
    bool_cols = [c for c in bool_cols if c in df.columns and c != target]

    # fill booleans with False, then convert to int
    df[bool_cols] = df[bool_cols].fillna(False).astype(int)

    # encoder for the features:
    # - sparse_threshold=0 forces a dense array, as promised by the return type
    #   (the one-hot block alone would otherwise make the result a sparse matrix)
    # - verbose_feature_names_out=False keeps plain names such as 'running'
    #   instead of prefixing them with the transformer name
    preprocessor = ColumnTransformer(
        transformers=[
            ('bools', 'passthrough', bool_cols),
            ('cats', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ],
        sparse_threshold=0,
        verbose_feature_names_out=False,
    )

    # encode and transform features
    X = preprocessor.fit_transform(df)

    # column names of X; previously this name was returned without ever being
    # assigned, which raised a NameError on a fresh kernel
    feature_names = list(preprocessor.get_feature_names_out())

    # encode the target column
    y_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    y = y_encoder.fit_transform(df[[target]])

    return X, y, feature_names, y_encoder
    

In [220]:
X, y, feature_names, y_encoder = prepare_dataset(df, target='primary_fur_color')  # 'primary_fur_color' 'shift', 'age'

2968 rows remaining


In [222]:
# finally make a train-test-split
# Exercise: replace the `...` with the arrays to split (X and y) and the split options.
# NOTE(review): the random seeds are only set in a later cell, so pass an explicit
# `random_state` here if the split itself should be reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    ...)

# sanity check: sizes of both splits and the target classes, in encoder order
print(f'train: {X_train.shape}\ntest:  {X_test.shape}')
print('Classes:', y_encoder.categories_[0])

train: (2522, 357)
test:  (446, 357)
Classes: ['Black' 'Cinnamon' 'Gray']


### Reproducibility

In [223]:
# Seed NumPy's and TensorFlow's global random number generators.
# NOTE(review): this cell comes after the train-test split above, so these seeds
# cannot influence that split - confirm the split passes its own `random_state`.
from numpy.random import seed
from tensorflow.random import set_seed
seed(1)
set_seed(2)
# same theme as already set in the import cell at the top of the notebook
sns.set_theme(style='white')

## Predict Squirrel Fur Color

In [None]:
# This function sets up the model. You can make it as complex or simple as you want.
# In particular, feel free to try dropout, or try different layer sizes.
def make_multiclass_model(input_shape: tuple[int, ...], n_classes: int) -> keras.Model:
    """Exercise stub: build and return the classification model.

    Args:
        input_shape: shape of a single input sample; the call below passes
            ``(X_train.shape[1],)``, i.e. the number of features.
        n_classes: number of target classes; the call below passes
            ``y_train.shape[1]``, the width of the one-hot encoded target.

    Returns:
        The model. It is compiled by the caller, not in here.
    """
    ...

# once your function is finished, you can build your model:
model = make_multiclass_model((X_train.shape[1],), y_train.shape[1])
# now compile it with the correct loss and optimizer:
model.compile(
    ...
              )
model.summary()

In [None]:
# train the model
# Exercise: replace the `...` with the training data and the training options.
# `validation_split=.2` sets aside a fifth of the data passed to `fit` for validation.
history = model.fit(...,
                    validation_split=.2,
                    verbose=2)

# model outputs for both splits, one row per sample
y_train_predict = model.predict(X_train)
y_test_predict = model.predict(X_test)

In [None]:
# turn the history into a DataFrame for easier analysis
history_df = pd.DataFrame.from_dict(history.history)
history_df.head()

In [None]:
# plot the training and validation loss and accuracy
# Exercise: replace each `...` with the data to draw, e.g. the loss curves from
# `history_df` on the left axis and the accuracy curves on the right axis
# (the available column names depend on the metrics passed to `model.compile`).
fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(6, 3))

sns.lineplot(..., ax=ax0)
ax0.set_xlabel('epochs')
sns.lineplot(..., ax=ax1)
plt.tight_layout()


In [229]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
f'test loss: {test_loss:.2f}, test accuracy {test_accuracy:.2f}'

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8072 - loss: 1.8044 


'test loss: 1.80, test accuracy 0.81'

In [230]:
# NOTE(review): this repeats the `model.predict(X_test)` call whose result is
# already stored in `y_test_predict` by the training cell.
y_pred = model.predict(X_test)
# map model outputs and one-hot targets back to class labels with the target encoder
# (presumably the highest-scoring class per row is chosen - confirm against the
# OneHotEncoder.inverse_transform documentation)
prediction = y_encoder.inverse_transform(y_pred)
ground_truth = y_encoder.inverse_transform(y_test)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


In [231]:
cm = confusion_matrix(ground_truth, prediction)
print(cm)

[[  0   0  15]
 [  0  14  45]
 [  8  18 346]]


In [232]:
# To confirm which axis is the truth and which is the prediction:
# the per-class counts of the ground truth must equal the sums along one axis of `cm`.
# Here they match the row sums, so rows are the ground truth and columns the prediction.
np.unique(ground_truth, return_counts=True)

(array(['Black', 'Cinnamon', 'Gray'], dtype=object), array([ 15,  59, 372]))

In [None]:
# Draw the confusion matrix with class names on both axes.
# fmt='d' prints the integer counts in full; the default format ('.2g')
# would abbreviate a count such as 346 to 3.5e+02.
ax = sns.heatmap(cm, annot=True, fmt='d', square=True,
                 yticklabels=y_encoder.categories_[0],
                 xticklabels=y_encoder.categories_[0],
                 )
# trailing semicolon keeps the return value of `ax.set` out of the cell output
ax.set(xlabel='prediction', ylabel='ground truth');

## Extra Task: Predict `shift`
That is, predict whether the squirrel was seen before noon (`AM`) or after noon (`PM`).

## Extra Task: Feature Engineering
Build a feature that indicates whether a squirrel will accept and eat a nut from you:
A squirrel qualifies if it `approaches` you, is `foraging`, or is already `eating`, and if it is neither `indifferent` to you nor `runs_from` you. Then build a predictor for this feature!

## Extra Task: Regression
Build a synthetic feature that encodes how chatty a squirrel is. For that, combine `kuks`, `moans` and `quaas` into a single numerical value, then build a regression model that predicts it.