In [1]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

In [2]:
DARK_READER = True
if DARK_READER:
    plt.rcParams.update({
        "lines.color": "white",
        "patch.edgecolor": "white",
        "text.color": "black",
        "axes.facecolor": "black",
        "axes.edgecolor": "lightgray",
        "axes.labelcolor": "white",
        "xtick.color": "white",
        "ytick.color": "white",
        "grid.color": "lightgray",
        "figure.facecolor": "black",
        "figure.edgecolor": "black",
        "savefig.facecolor": "black",
        "savefig.edgecolor": "black",
    })

# Season Dataset (Version 2)
In this version, we will have the `month` feature like in version 1. In addition to that, we have added a new `day` feature.
It is still a model for the seasons in Northern semisphere, and
we set the following rule:

- Spring: 1st March to end May (inclusive)
- Summer: 1st June to end August
- Autumn: 1st September to end November
- Winter: 1st December to end February

In [3]:
import numpy as np

In [4]:
np.array([3, 31]) > np.array([7, 1])

array([False,  True])

In [5]:
[3, 31] > [7, 1]

False

Let's make a list named `L_month_day` as follows:

```python
print(L_month_day)

[[1,1],
 [1,2],
 ...,
 [1,31],
 [2,1],
 ...,
 [2,28],
 ...,
 [12,31],
]
```

In [6]:
JAN = 1
FEB = 2
MAR = 3
APR = 4
MAY = 5
JUN = 6
JUL = 7
AUG = 8
SEP = 9
OCT = 10
NOV = 11
DEC = 12

SPRING = 0
SUMMER = 1
AUTUMN = 2
WINTER = 3

season_to_id = {
    "spring": 0,
    "summer": 1,
    "autumn": 2,
    "winter": 3,
}

SEED = 20



In [7]:
D_month_BigMonth = {
    JAN: 1,
    FEB: -1,
    MAR: 1,
    APR: 0,
    MAY: 1,
    JUN: 0,
    JUL: 1,
    AUG: 1, 
    SEP: 0,
    OCT: 1,
    NOV: 0,
    DEC: 1,
}

# Verify if this gives 365 days in one year.
somme = 0
for indicator in D_month_BigMonth.values():
    if indicator == 1:
        somme += 31
    elif indicator == 0:
        somme += 30
    elif indicator == -1:
        somme += 28
somme

365

In [8]:
def n_days_in(month):
    indicator = D_month_BigMonth[month]
    if indicator == 1:
        return 31
    elif indicator == 0:
        return 30
    elif indicator == -1:
        return 28

In [9]:
L_month_day = []
for month in (JAN, FEB, MAR, APR, MAY, JUN, JUL, AUG, SEP, OCT, NOV, DEC,):
    L = [[month, i] for i in range(1, n_days_in(month)+1)]
    L_month_day.extend(L)

In [10]:
#i = np.random.randint(JAN, DEC+1)
#L_month_day[-31:]
L_month_day[:31]

[[1, 1],
 [1, 2],
 [1, 3],
 [1, 4],
 [1, 5],
 [1, 6],
 [1, 7],
 [1, 8],
 [1, 9],
 [1, 10],
 [1, 11],
 [1, 12],
 [1, 13],
 [1, 14],
 [1, 15],
 [1, 16],
 [1, 17],
 [1, 18],
 [1, 19],
 [1, 20],
 [1, 21],
 [1, 22],
 [1, 23],
 [1, 24],
 [1, 25],
 [1, 26],
 [1, 27],
 [1, 28],
 [1, 29],
 [1, 30],
 [1, 31]]

In [11]:
len(L_month_day)

365

In [12]:
y_rule1 = []
for month, _ in L_month_day:
    if MAR <= month <= MAY:
        y_rule1.append(SPRING)
    elif JUN <= month <= AUG:
        y_rule1.append(SUMMER)
    elif SEP <= month <= NOV:
        y_rule1.append(AUTUMN)
    else:
        y_rule1.append(WINTER)
y_rule1[0:-1:31]

[3, 3, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3]

In [13]:
X = np.array(L_month_day)
X.dtype, X.shape

(dtype('int64'), (365, 2))

## How to Split Train/Test sets?

In [14]:
from sklearn.model_selection import StratifiedShuffleSplit

**(?)** What is the `n_splits` for?<br>

In [19]:
y_rule1 = np.array(y_rule1)

In [20]:
split = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=SEED)
for train_indices, test_indices in split.split(X, X[:, 0]):
    X_train_raw = X[train_indices, :]
    X_test_raw = X[test_indices, :]
    y_rule1_train = y_rule1[train_indices]
    y_rule1_test = y_rule1[test_indices]

**(?)** Did you notice that the syntax for the `split.split` above is somewhat diff from what we are used to?

In [30]:
test_indices

array([ 95, 255, 146, 339, 152,  93, 349,  27, 264, 127,  87, 136, 260,
        36, 156, 230,  50, 253, 257, 205, 222,  73, 211, 238,  19, 128,
       358, 319, 299, 101, 111, 302,   5, 213, 310, 151, 174,  45, 321,
       138, 282,  43,  20, 194, 229, 361, 326, 242, 313, 274, 108, 208,
        57,  76, 132, 116, 311, 297, 240, 179, 280,  10, 243, 209, 184,
        79, 173, 350,  37,  63,  74, 344,   6])

In [31]:
test_indices.shape

(73,)

In [32]:
int(365 * 0.2)

73

In [22]:
X_train_raw.shape, X_test_raw.shape

((292, 2), (73, 2))

In [23]:
import pandas as pd

In [26]:
df_X_test_raw = pd.DataFrame(X_test_raw, columns=["month", "day",])
df_X_test_raw

Unnamed: 0,month,day
0,4,6
1,9,13
2,5,27
3,12,6
4,6,2
...,...,...
68,2,7
69,3,5
70,3,16
71,12,11


In [27]:
df_X_test_raw["month"].value_counts()

8     7
1     6
2     6
3     6
4     6
5     6
6     6
7     6
9     6
10    6
11    6
12    6
Name: month, dtype: int64

In [28]:
df_X = pd.DataFrame(X, columns=["month", "day"])
df_X["month"].value_counts()

1     31
3     31
5     31
7     31
8     31
10    31
12    31
4     30
6     30
9     30
11    30
2     28
Name: month, dtype: int64

In [29]:
df_Xy_test_raw = pd.DataFrame(np.c_[X_test_raw, y_rule1_test], columns=["month", "day", "season_id"])
df_Xy_test_raw

Unnamed: 0,month,day,season_id
0,4,6,0
1,9,13,2
2,5,27,0
3,12,6,3
4,6,2,1
...,...,...,...
68,2,7,3
69,3,5,0
70,3,16,0
71,12,11,3


## First Dataset, First Model
Just take `X_train_raw` and `X_test_raw` to train a few ML models and see what that gives.

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score

In [34]:
log_clf = LogisticRegression()
svm_clf = SVC(probability=True)
rnd_clf = RandomForestClassifier()

In [35]:
voting_clf = VotingClassifier(
    estimators=[("lr", log_clf), ("svm", svm_clf), ("rf", rnd_clf)],
    voting="soft",
)

In [36]:
for clf in (log_clf, svm_clf, rnd_clf, voting_clf):
    clf.fit(X_train_raw, y_rule1_train)
    y_pred = clf.predict(X_test_raw)
    acc = accuracy_score(y_rule1_test, y_pred)
    precision = precision_score(y_rule1_test, y_pred, average="micro")
    recall = recall_score(y_rule1_test, y_pred, average="micro")    
    print(f"({clf.__class__.__name__})\nacc = {acc:.2f}, precision = {precision:.2f}, recall = {recall:.2f}\n")

(LogisticRegression)
acc = 0.64, precision = 0.64, recall = 0.64

(SVC)
acc = 0.82, precision = 0.82, recall = 0.82

(RandomForestClassifier)
acc = 1.00, precision = 1.00, recall = 1.00

(VotingClassifier)
acc = 0.92, precision = 0.92, recall = 0.92



In [42]:
for clf in (log_clf, svm_clf, rnd_clf, voting_clf):
    clf.fit(X_train_raw, y_rule1_train)
    y_pred = clf.predict(X_train_raw)
    acc = accuracy_score(y_rule1_train, y_pred)
    precision = precision_score(y_rule1_train, y_pred, average="micro")
    recall = recall_score(y_rule1_train, y_pred, average="micro")    
    print(f"({clf.__class__.__name__})\nacc = {acc:.2f}, precision = {precision:.2f}, recall = {recall:.2f}\n")

(LogisticRegression)
acc = 0.64, precision = 0.64, recall = 0.64

(SVC)
acc = 0.88, precision = 0.88, recall = 0.88

(RandomForestClassifier)
acc = 1.00, precision = 1.00, recall = 1.00

(VotingClassifier)
acc = 0.91, precision = 0.91, recall = 0.91



We see that

- Random forest performances perfectly
- Voting classifier's performance has been averaged and thus a little lower than random forest

Maybe the task is too simple. Even without taking the cyclic nature into consideration, some of the classifiers
can already reach near perfect performance.


## Second Dataset, Same Models
Let's standardize the features and then just apply the same model to see if things will get better.

> The situation after standardization is **not getting better**.

**(?)** How come `acc, precision, recall` all have the same value for each classifier? Did you do it wrong or sth?

## Third Dataset, Same Models
Use sine and cosine to try to make the points more separable. This is actually more like increasing the number of features to increase the possibility to separate the data points geometrically!

In [52]:
def sinusoid(X_batch, period):
    """
    args
        theta, float
            e.g. DEC gives 2pi and JAN gives 2pi / 12
    """
    theta = 2 * np.pi * (X_batch / period)
    return np.c_[np.cos(theta), np.sin(theta)]

In [55]:
X_month_sinusoid = sinusoid(X[:,0], period=12)
X_month_sinusoid.shape, X_month_sinusoid.dtype

((365, 2), dtype('float64'))

In [56]:
X_day_sinusoid = np.empty_like(X_month_sinusoid)
X_day_sinusoid.shape, X_day_sinusoid.dtype

((365, 2), dtype('float64'))

In [60]:
start = 0
for month in (JAN, FEB, MAR, APR, MAY, JUN, JUL, AUG, SEP, OCT, NOV, DEC):
    n_days = n_days_in(month)
    X_day_sinusoid[start:start+n_days, :] = sinusoid(X[start:start+n_days, 1], period=n_days)
    start += n_days

X_day_sinusoid[:n_days_in(JAN)]

array([[ 9.79529941e-01,  2.01298520e-01],
       [ 9.18957812e-01,  3.94355855e-01],
       [ 8.20763441e-01,  5.71268215e-01],
       [ 6.88966919e-01,  7.24792787e-01],
       [ 5.28964010e-01,  8.48644257e-01],
       [ 3.47305253e-01,  9.37752132e-01],
       [ 1.51427778e-01,  9.88468324e-01],
       [-5.06491688e-02,  9.98716507e-01],
       [-2.50652532e-01,  9.68077119e-01],
       [-4.40394152e-01,  8.97804540e-01],
       [-6.12105983e-01,  7.90775737e-01],
       [-7.58758123e-01,  6.51372483e-01],
       [-8.74346616e-01,  4.85301963e-01],
       [-9.54139256e-01,  2.99363123e-01],
       [-9.94869323e-01,  1.01168322e-01],
       [-9.94869323e-01, -1.01168322e-01],
       [-9.54139256e-01, -2.99363123e-01],
       [-8.74346616e-01, -4.85301963e-01],
       [-7.58758123e-01, -6.51372483e-01],
       [-6.12105983e-01, -7.90775737e-01],
       [-4.40394152e-01, -8.97804540e-01],
       [-2.50652532e-01, -9.68077119e-01],
       [-5.06491688e-02, -9.98716507e-01],
       [ 1.

In [63]:
print(np.c_[X_day_sinusoid[:n_days_in(JAN)], X_day_sinusoid[31+28:31+28+n_days_in(MAR)]])

[[ 9.79529941e-01  2.01298520e-01  9.79529941e-01  2.01298520e-01]
 [ 9.18957812e-01  3.94355855e-01  9.18957812e-01  3.94355855e-01]
 [ 8.20763441e-01  5.71268215e-01  8.20763441e-01  5.71268215e-01]
 [ 6.88966919e-01  7.24792787e-01  6.88966919e-01  7.24792787e-01]
 [ 5.28964010e-01  8.48644257e-01  5.28964010e-01  8.48644257e-01]
 [ 3.47305253e-01  9.37752132e-01  3.47305253e-01  9.37752132e-01]
 [ 1.51427778e-01  9.88468324e-01  1.51427778e-01  9.88468324e-01]
 [-5.06491688e-02  9.98716507e-01 -5.06491688e-02  9.98716507e-01]
 [-2.50652532e-01  9.68077119e-01 -2.50652532e-01  9.68077119e-01]
 [-4.40394152e-01  8.97804540e-01 -4.40394152e-01  8.97804540e-01]
 [-6.12105983e-01  7.90775737e-01 -6.12105983e-01  7.90775737e-01]
 [-7.58758123e-01  6.51372483e-01 -7.58758123e-01  6.51372483e-01]
 [-8.74346616e-01  4.85301963e-01 -8.74346616e-01  4.85301963e-01]
 [-9.54139256e-01  2.99363123e-01 -9.54139256e-01  2.99363123e-01]
 [-9.94869323e-01  1.01168322e-01 -9.94869323e-01  1.01168322e

In [71]:
np.array_equal(X_day_sinusoid[:n_days_in(JAN)], X_day_sinusoid[31+28:31+28+n_days_in(MAR)])

True

In [72]:
[s for s in dir(np) if s.startswith("array_")]

['array_equal', 'array_equiv', 'array_repr', 'array_split', 'array_str']

In [73]:
np.allclose(X_day_sinusoid[:n_days_in(JAN)], X_day_sinusoid[31+28:31+28+n_days_in(MAR)])

True

In [74]:
X_sinusoid = np.c_[X_month_sinusoid, X_day_sinusoid]
X_sinusoid_train = X_sinusoid[train_indices]
X_sinusoid_test = X_sinusoid[test_indices]
X_sinusoid_train.shape, X_sinusoid_test.shape

((292, 4), (73, 4))

In [75]:
for clf in (log_clf, svm_clf, rnd_clf, voting_clf):
    clf.fit(X_sinusoid_train, y_rule2_train)
    y_pred = clf.predict(X_sinusoid_test)
    acc = accuracy_score(y_rule2_test, y_pred)
    precision = precision_score(y_rule2_test, y_pred, average="micro")
    recall = recall_score(y_rule2_test, y_pred, average="micro")    
    print(f"({clf.__class__.__name__})\nacc = {acc:.2f}, precision = {precision:.2f}, recall = {recall:.2f}\n")

(LogisticRegression)
acc = 0.81, precision = 0.81, recall = 0.81

(SVC)
acc = 0.95, precision = 0.95, recall = 0.95

(RandomForestClassifier)
acc = 0.99, precision = 0.99, recall = 0.99

(VotingClassifier)
acc = 0.97, precision = 0.97, recall = 0.97



In [None]:
X_train

In [None]:
X_train_sinusoid = sinusoid(X_train)
X_train_sinusoid

In [None]:
plt.plot(X_train_sinusoid[:, 0], X_train_sinusoid[:, 1], "ro");
t = np.linspace(0, 2*np.pi, 100)
plt.plot(np.cos(t), np.sin(t))
plt.gca().set_aspect('equal', adjustable='box')
plt.grid(True);

In [None]:
X_test_sinusoid = sinusoid(X_test)
X_test_sinusoid

In [None]:
for clf in (log_clf, svm_clf, rnd_clf, voting_clf):
    clf.fit(X_train_sinusoid, y_train)
    y_pred = clf.predict(X_test_sinusoid)
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="micro")
    recall = recall_score(y_test, y_pred, average="micro")
    print(f"({clf.__class__.__name__})\nacc = {acc:.2f}, precision = {precision:.2f}, recall = {recall:.2f}\n")

What? Even worse than the naive and worst dataset from the beginning?

## Maybe...
There are just too little data for this to make sense. Too little data to do ML, maybe.

This is actually funny: I thought this was going to be the easiest among the notebooks I was going to create, the models' performance all really good, etc. etc. But quite the contrary.

We have only `8` training instances and would like to predict `4` test instances, doing a multiclass classification. I think of a stupid test which I saw a few years ago:

> Given a sequence `2, 3, 5`, please guess what numbers will follow.

This is not a good test because there can be many, many interesting answers and might not have **the right answer**. For example, it could be the sequence of primes, or the subsequence of fibonacci numbers starting from `2`. So I was so stupid as not to have designed this notebook for testing whether or not the method of spreading the cyclic data on a circle would be effective. In some sense, data science needs a sufficient amount of data to extract some kind of pattern that human might judge as useful to them.


# Season Dataset (Version 3)
In this version, we will have exactly the same features like in version 2.<br>
Only that we set a diff rule to increase the difficulty:

- Spring: 15th March to 14th June (inclusive)
- Summer: 15th June to 14th September
- Autumn: 15th September to 14th December
- Winter: 15th December to 14th March


In [43]:
y_rule2 = []
start = 15
end = 14
for month_day in L_month_day:
    if [MAR,start] <= month_day <= [JUN,end]:
        y_rule2.append(SPRING)
    elif [JUN,start] <= month_day <= [SEP,end]:
        y_rule2.append(SUMMER)
    elif [SEP,start] <= month_day <= [DEC,end]:
        y_rule2.append(AUTUMN)
    else:
        y_rule2.append(WINTER)

y_rule2[0:-1:31]

[3, 3, 3, 0, 0, 0, 1, 1, 1, 2, 2, 2]

### Let's verify the correctness

In [44]:
np.c_[L_month_day, y_rule2][30:-1]

array([[ 1, 31,  3],
       [ 2,  1,  3],
       [ 2,  2,  3],
       ...,
       [12, 28,  3],
       [12, 29,  3],
       [12, 30,  3]])

In [45]:
np.c_[L_month_day, y_rule2][31:-1]

array([[ 2,  1,  3],
       [ 2,  2,  3],
       [ 2,  3,  3],
       [ 2,  4,  3],
       [ 2,  5,  3],
       [ 2,  6,  3],
       [ 2,  7,  3],
       [ 2,  8,  3],
       [ 2,  9,  3],
       [ 2, 10,  3],
       [ 2, 11,  3],
       [ 2, 12,  3],
       [ 2, 13,  3],
       [ 2, 14,  3],
       [ 2, 15,  3],
       [ 2, 16,  3],
       [ 2, 17,  3],
       [ 2, 18,  3],
       [ 2, 19,  3],
       [ 2, 20,  3],
       [ 2, 21,  3],
       [ 2, 22,  3],
       [ 2, 23,  3],
       [ 2, 24,  3],
       [ 2, 25,  3],
       [ 2, 26,  3],
       [ 2, 27,  3],
       [ 2, 28,  3],
       [ 3,  1,  3],
       [ 3,  2,  3],
       [ 3,  3,  3],
       [ 3,  4,  3],
       [ 3,  5,  3],
       [ 3,  6,  3],
       [ 3,  7,  3],
       [ 3,  8,  3],
       [ 3,  9,  3],
       [ 3, 10,  3],
       [ 3, 11,  3],
       [ 3, 12,  3],
       [ 3, 13,  3],
       [ 3, 14,  3],
       [ 3, 15,  0],
       [ 3, 16,  0],
       [ 3, 17,  0],
       [ 3, 18,  0],
       [ 3, 19,  0],
       [ 3, 2

In [47]:
y_rule2 = np.array(y_rule2)

In [48]:
y_rule2_train = y_rule2[train_indices]
y_rule2_test = y_rule2[test_indices]

In [49]:
for clf in (log_clf, svm_clf, rnd_clf, voting_clf):
    clf.fit(X_train_raw, y_rule2_train)
    y_pred = clf.predict(X_test_raw)
    acc = accuracy_score(y_rule2_test, y_pred)
    precision = precision_score(y_rule2_test, y_pred, average="micro")
    recall = recall_score(y_rule2_test, y_pred, average="micro")    
    print(f"({clf.__class__.__name__})\nacc = {acc:.2f}, precision = {precision:.2f}, recall = {recall:.2f}\n")

(LogisticRegression)
acc = 0.71, precision = 0.71, recall = 0.71

(SVC)
acc = 0.86, precision = 0.86, recall = 0.86

(RandomForestClassifier)
acc = 0.97, precision = 0.97, recall = 0.97

(VotingClassifier)
acc = 0.92, precision = 0.92, recall = 0.92



In this case, our random forest classifier is no longer perfect.

Let's see whether incorporating the cyclic nature improves the situation.

## Cyclic Nature