In [1]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

In [2]:
# Toggle for a dark matplotlib theme: black figure/axes background with a
# light foreground, so plots blend into a dark page.
DARK_READER = True
if DARK_READER:
    plt.rcParams.update({
        "lines.color": "white",
        "patch.edgecolor": "white",
        # Text (titles, legends, annotations) must be light as well: black
        # text would be invisible on the black figure/axes background below.
        "text.color": "white",
        "axes.facecolor": "black",
        "axes.edgecolor": "lightgray",
        "axes.labelcolor": "white",
        "xtick.color": "white",
        "ytick.color": "white",
        "grid.color": "lightgray",
        "figure.facecolor": "black",
        "figure.edgecolor": "black",
        "savefig.facecolor": "black",
        "savefig.edgecolor": "black",
    })

# Season Dataset (Version 2)
In this version, we will have the `month` feature like in version 1. In addition to that, we have added a new `day` feature.
It is still a model for the seasons in the Northern Hemisphere, and
we set the following rule:

- Spring: 1st March to end May (inclusive)
- Summer: 1st June to end August
- Autumn: 1st September to end November
- Winter: 1st December to end February

In [3]:
np.array([3, 31]) > np.array([7, 1])

array([False,  True])

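We first need to construct a dataset. For convenience, we would like each `[month, day]` pair to be comparable/orderable as a whole. NumPy ndarrays cannot do that: as shown above, they compare element-wise. Plain Python lists, on the other hand, compare lexicographically: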

In [4]:
[3, 31] > [7, 1]

False

Let's make a list named `L_month_day` as follows:

```python
print(L_month_day)

[[1,1],
 [1,2],
 ...,
 [1,31],
 [2,1],
 ...,
 [2,28],
 ...,
 [12,31],
]
```

In [5]:
# Calendar months, numbered 1 (January) through 12 (December).
(JAN, FEB, MAR, APR, MAY, JUN,
 JUL, AUG, SEP, OCT, NOV, DEC) = range(1, 13)

# Integer class labels of the four seasons.
SPRING, SUMMER, AUTUMN, WINTER = range(4)

# Lower-case season name -> integer class label.
season_to_id = dict(spring=0, summer=1, autumn=2, winter=3)

# Seed used for the `random_state` arguments in this notebook.
SEED = 20


I don't know what I was thinking last week, but my earlier way of looking up the number of days in a month (a helper function, `n_days_in`) is less efficient than simply using a dictionary.

In [6]:
# The three possible month lengths (February fixed at 28: non-leap year).
ndays_big, ndays_small, ndays_feb = 31, 30, 28

# Month number -> number of days in that month, inserted in calendar order.
D_month_ndays = {
    month: (
        ndays_feb if month == FEB
        else ndays_small if month in (APR, JUN, SEP, NOV)
        else ndays_big
    )
    for month in range(JAN, DEC + 1)
}

In [7]:
# Similarly, we verify if there are 365 days in a year
sum(D_month_ndays.values())

365

In [8]:
# One [month, day] pair per calendar day, in chronological order
# (January 1st first, December 31st last).
L_month_day = [
    [month, day]
    for month in (JAN, FEB, MAR, APR, MAY, JUN, JUL, AUG, SEP, OCT, NOV, DEC)
    for day in range(1, D_month_ndays[month] + 1)
]

In [9]:
#i = np.random.randint(JAN, DEC+1)
#L_month_day[-31:]
L_month_day[:31]

[[1, 1],
 [1, 2],
 [1, 3],
 [1, 4],
 [1, 5],
 [1, 6],
 [1, 7],
 [1, 8],
 [1, 9],
 [1, 10],
 [1, 11],
 [1, 12],
 [1, 13],
 [1, 14],
 [1, 15],
 [1, 16],
 [1, 17],
 [1, 18],
 [1, 19],
 [1, 20],
 [1, 21],
 [1, 22],
 [1, 23],
 [1, 24],
 [1, 25],
 [1, 26],
 [1, 27],
 [1, 28],
 [1, 29],
 [1, 30],
 [1, 31]]

In [10]:
len(L_month_day)

365

In [11]:
# Rule 1: the season depends on the month only; the day is ignored.
# Any month outside March..November falls through to winter.
y_rule1 = [
    SPRING if MAR <= month <= MAY
    else SUMMER if JUN <= month <= AUG
    else AUTUMN if SEP <= month <= NOV
    else WINTER
    for month, _day in L_month_day
]
# Spot check: every 31st label (the last element is excluded by the slice).
y_rule1[:-1:31]

[3, 3, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3]

In [12]:
X = np.array(L_month_day)
X.dtype, X.shape

(dtype('int64'), (365, 2))

## How to Split Train/Test sets?

In [13]:
from sklearn.model_selection import StratifiedShuffleSplit

**(?)** What is the `n_splits` for?<br>

In [14]:
y_rule1 = np.array(y_rule1)

In [15]:
# 10 independent shuffles, each holding out 20% of the days as the test set.
# Stratification uses the month column (X[:, 0]) as the class vector, so each
# month keeps its proportion of days in both subsets.
split = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=SEED)
n_iterations = 0
for train_indices, test_indices in split.split(X, X[:, 0]):
    n_iterations += 1
    # Every pass overwrites the previous one: after the loop only the LAST
    # shuffle's subsets (and index arrays) remain available.
    X_train_raw, X_test_raw = X[train_indices, :], X[test_indices, :]
    y_rule1_train, y_rule1_test = y_rule1[train_indices], y_rule1[test_indices]

**(?)** Did you notice that the syntax for the `split.split` above is somewhat different from what we are used to?

In [16]:
n_iterations

10

In [None]:
np.array_equal(X_train_raw, X[train_indices, :])

In [None]:
test_indices

In [None]:
test_indices.shape

In [None]:
int(365 * 0.2)

In [None]:
X_train_raw.shape, X_test_raw.shape

In [None]:
import pandas as pd

In [None]:
df_X_test_raw = pd.DataFrame(X_test_raw, columns=["month", "day",])
df_X_test_raw

In [None]:
df_X_test_raw["month"].value_counts()

In [None]:
df_X = pd.DataFrame(X, columns=["month", "day"])
df_X["month"].value_counts()

In [None]:
df_Xy_test_raw = pd.DataFrame(np.c_[X_test_raw, y_rule1_test], columns=["month", "day", "season_id"])
df_Xy_test_raw

## First Dataset, First Model
Just take `X_train_raw` and `X_test_raw` to train a few ML models and see what that gives.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score

In [None]:
# Three off-the-shelf classifiers with default hyper-parameters.
log_clf = LogisticRegression()
# Seed the tree: scikit-learn permutes the candidate features at every split,
# so an unseeded tree may differ from one run to the next.
tree_clf = DecisionTreeClassifier(random_state=SEED)
svm_clf = SVC()
# Disabled alternatives:
#svm_clf = SVC(probability=True)
#rnd_clf = RandomForestClassifier()
T_classifiers = (log_clf, tree_clf, svm_clf)

In [None]:
# Fit each classifier on the raw [month, day] features (rule-1 labels) and
# score it on the held-out test set.
for clf in T_classifiers:
    y_pred = clf.fit(X_train_raw, y_rule1_train).predict(X_test_raw)

    acc = accuracy_score(y_true=y_rule1_test, y_pred=y_pred)

    # The target is multiclass, so `average` has to be chosen explicitly:
    #   * the default average="binary" raises
    #     "ValueError: Target is multiclass but average='binary'. Please
    #     choose another average setting, one of [None, 'micro', 'macro',
    #     'weighted']."
    #   * average=None yields an ndarray, which the ":.2f" format spec in
    #     the print below rejects with a TypeError.
    precision = precision_score(y_true=y_rule1_test, y_pred=y_pred, average="weighted")
    recall = recall_score(y_true=y_rule1_test, y_pred=y_pred, average="weighted")

    print(f"({clf.__class__.__name__})\nacc = {acc:.2f}, precision = {precision:.2f}, recall = {recall:.2f}\n")


In [None]:
# What about performance on the training data?
for clf in T_classifiers:
    clf.fit(X_train_raw, y_rule1_train)
    y_pred = clf.predict(X_train_raw)
    acc = accuracy_score(y_rule1_train, y_pred)
    precision = precision_score(y_rule1_train, y_pred, average="micro")
    recall = recall_score(y_rule1_train, y_pred, average="micro")    
    print(f"({clf.__class__.__name__})\nacc = {acc:.2f}, precision = {precision:.2f}, recall = {recall:.2f}\n")

We see that

- Random forest performs perfectly. So does the decision tree.

Maybe the task is too simple. Even without taking the cyclic nature into consideration, some of the classifiers
can already reach near perfect performance.


# Season Dataset (Version 3)
In this version, we will have exactly the same features as in version 2.<br>
The only change is that we set a different rule to increase the difficulty:

- Spring: 15th March to 14th June (inclusive)
- Summer: 15th June to 14th September
- Autumn: 15th September to 14th December
- Winter: 15th December to 14th March

**N.B.** Note that we just need to create a new label, `y_rule2`. No need to modify `X`.

In [None]:
# Rule 2: every season begins on the 15th of its first month and ends on the
# 14th three months later. Python lists compare lexicographically, so the
# [month, day] pairs can be range-checked directly against the boundaries.
start, end = 15, 14
y_rule2 = [
    SPRING if [MAR, start] <= month_day <= [JUN, end]
    else SUMMER if [JUN, start] <= month_day <= [SEP, end]
    else AUTUMN if [SEP, start] <= month_day <= [DEC, end]
    else WINTER
    for month_day in L_month_day
]

# Spot check: every 31st label (the last element is excluded by the slice).
y_rule2[:-1:31]

### Let's verify the correctness

In [None]:
np.c_[L_month_day, y_rule2][30:-1]

In [None]:
np.c_[L_month_day, y_rule2][31:-1]

In [None]:
y_rule2 = np.array(y_rule2)

In [None]:
y_rule2_train = y_rule2[train_indices]
y_rule2_test = y_rule2[test_indices]

In [None]:
# Refit every classifier on the rule-2 labels and score it on the test set.
# Averaging note: with average="micro", precision and recall of a single-label
# multiclass problem both collapse to plain accuracy, so the three printed
# numbers would always be identical. Use the support-weighted average instead,
# as the first evaluation cell of this notebook does (weighted recall still
# equals accuracy by definition; precision is the figure that becomes
# informative).
for clf in T_classifiers:
    clf.fit(X_train_raw, y_rule2_train)
    y_pred = clf.predict(X_test_raw)
    acc = accuracy_score(y_rule2_test, y_pred)
    precision = precision_score(y_rule2_test, y_pred, average="weighted")
    recall = recall_score(y_rule2_test, y_pred, average="weighted")
    print(f"({clf.__class__.__name__})\nacc = {acc:.4f}, precision = {precision:.4f}, recall = {recall:.4f}\n")

In this case, our <s>random forest</s> and decision tree classifiers are no longer perfect (even though they still perform with high accuracy).

In [None]:
# On training set
for clf in T_classifiers:
    y_pred = clf.predict(X_train_raw)
    acc = accuracy_score(y_rule2_train, y_pred)
    precision = precision_score(y_rule2_train, y_pred, average="micro")
    recall = recall_score(y_rule2_train, y_pred, average="micro")    
    print(f"({clf.__class__.__name__})\nacc = {acc:.4f}, precision = {precision:.4f}, recall = {recall:.4f}\n")

Let's see whether incorporating the cyclic nature improves the situation.

**(?)** It seems that `ageron` once said in his book that `svm_clf` and a few other classifiers cannot do multiclass classification. But if they really
cannot, and we have misused them here, why are the metrics shown above not low?<br>
**(R)** If you carefully read the section on multiclass classification in Chapter 3, `ageron` explains that these classifiers are indeed only capable of binary classification; however, the reason why we can still use these classes in `sklearn` for multiclass classification is that, behind the scenes, `sklearn` applies a One-vs-Rest (a.k.a. One-vs-All) or One-vs-One strategy for us.

## Cyclic Nature

In [None]:
!cat utils.py

In [None]:
from utils import *

In [None]:
cyclicize_series(X[:, 0], max_=12, min_=0)

In [None]:
# Build the "cyclic" feature matrix: the transformed month column (X[:, 0])
# and the transformed day column (X[:, 1]) placed side by side with np.c_.
# NOTE(review): `cyclicize_series` comes from utils.py, which is not shown
# here -- presumably a periodic (sin/cos-style) encoding over [min_, max_];
# confirm its output shape and the intended meaning of min_=0 there.
# NOTE(review): the day period is fixed at 31 although months have 28/30/31
# days -- confirm this is intended.
X_cyclic = np.c_[
    cyclicize_series(X[:, 0], max_=12, min_=0),
    cyclicize_series(X[:, 1], max_=31, min_=0),
]
X_cyclic

In [None]:
X_train_cyclic = X_cyclic[train_indices]
X_test_cyclic = X_cyclic[test_indices]

In [None]:
# Refit every classifier on the cyclic features (rule-2 labels) and score it
# on the test set.
# Averaging note: with average="micro", precision and recall of a single-label
# multiclass problem both collapse to plain accuracy, so the three printed
# numbers would always be identical. Use the support-weighted average instead,
# as the first evaluation cell of this notebook does (weighted recall still
# equals accuracy by definition; precision is the figure that becomes
# informative).
for clf in T_classifiers:
    clf.fit(X_train_cyclic, y_rule2_train)
    y_pred = clf.predict(X_test_cyclic)
    acc = accuracy_score(y_rule2_test, y_pred)
    precision = precision_score(y_rule2_test, y_pred, average="weighted")
    recall = recall_score(y_rule2_test, y_pred, average="weighted")
    print(f"({clf.__class__.__name__})\nacc = {acc:.4f}, precision = {precision:.4f}, recall = {recall:.4f}\n")

In [None]:
# On training set
for clf in T_classifiers:
    y_pred = clf.predict(X_train_cyclic)
    acc = accuracy_score(y_rule2_train, y_pred)
    precision = precision_score(y_rule2_train, y_pred, average="micro")
    recall = recall_score(y_rule2_train, y_pred, average="micro")    
    print(f"({clf.__class__.__name__})\nacc = {acc:.4f}, precision = {precision:.4f}, recall = {recall:.4f}\n")