In [1]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
# Dark-theme toggle: when True, Matplotlib's defaults are overridden so that
# figures are drawn with light foreground elements on black backgrounds.
DARK_READER = True
if DARK_READER:
    plt.rcParams.update({
        "lines.color": "white",
        "patch.edgecolor": "white",
        # Free-standing text (titles, legends, annotations) has to contrast with
        # the black axes/figure backgrounds set below; "black" would be invisible.
        "text.color": "white",
        "axes.facecolor": "black",
        "axes.edgecolor": "lightgray",
        "axes.labelcolor": "white",
        "xtick.color": "white",
        "ytick.color": "white",
        "grid.color": "lightgray",
        "figure.facecolor": "black",
        "figure.edgecolor": "black",
        # Keep saved images consistent with what is shown inline.
        "savefig.facecolor": "black",
        "savefig.edgecolor": "black",
    })

# Season Dataset (Version 4)
In this version, we will use a `month, day` rule similar to the one in version 3 and in `y_rule2`. That is,

- Spring: 15th March to 14th June (inclusive)
- Summer: 15th June to 14th September
- Autumn: 15th September to 14th December
- Winter: 15th December to 14th March

In addition to that, we have added `latitude, longitude` features to separate the Earth into Northern/Southern hemispheres,
whose seasons should be the opposite of each other.

Besides, from this notebook on, I have decided not to use ensemble-method classifiers (e.g. random forest), to better see what single classifiers can achieve and to make comparisons easier.

In [3]:
from constants import *
from utils import *

We need a few tools to build this larger dataset with the new features `latitude, longitude`.

In [4]:
import collections

In [5]:
# zip pairs the three ranges position-wise: one tuple per index, 3 tuples in total.
list(zip(range(3), range(4, 4+3), range(-10-3, -10)))

[(0, 4, -13), (1, 5, -12), (2, 6, -11)]

In [6]:
import itertools

In [7]:
# itertools.product yields every combination of the three ranges
# (3 * 3 * 3 = 27 tuples), with the last range varying fastest.
list(itertools.product(range(3), range(4, 4+3), range(-10-3, -10)))

[(0, 4, -13),
 (0, 4, -12),
 (0, 4, -11),
 (0, 5, -13),
 (0, 5, -12),
 (0, 5, -11),
 (0, 6, -13),
 (0, 6, -12),
 (0, 6, -11),
 (1, 4, -13),
 (1, 4, -12),
 (1, 4, -11),
 (1, 5, -13),
 (1, 5, -12),
 (1, 5, -11),
 (1, 6, -13),
 (1, 6, -12),
 (1, 6, -11),
 (2, 4, -13),
 (2, 4, -12),
 (2, 4, -11),
 (2, 5, -13),
 (2, 5, -12),
 (2, 5, -11),
 (2, 6, -13),
 (2, 6, -12),
 (2, 6, -11)]

In [8]:
import pandas as pd

In [9]:
# Date table with one (month, day) row per entry of L_month_day.
# NOTE(review): L_month_day is not defined in the visible cells; presumably it
# comes from one of the star-imports above -- confirm.
df_date = pd.DataFrame(L_month_day, columns=["month", "day"])
df_date.head()

Unnamed: 0,month,day
0,1,1
1,1,2
2,1,3
3,1,4
4,1,5


In [10]:
# Location table: the Cartesian product of the latitude and longitude grids,
# i.e. one row per (latitude, longitude) pair.
geo_pairs = list(itertools.product(latitudes, longitudes))
df_geo = pd.DataFrame(geo_pairs, columns=["latitude", "longitude"])
df_geo

Unnamed: 0,latitude,longitude
0,-89,-179
1,-89,-178
2,-89,-177
3,-89,-176
4,-89,-175
...,...,...
64435,89,176
64436,89,177
64437,89,178
64438,89,179


View or copy? (Pandas)

In [11]:
# Element-wise comparison: do bracket access and .loc[:, ...] return the same values?
np.all(df_date["month"] == df_date.loc[:, "month"])

True

In [12]:
# `is` compares object identity, not contents: True here means both expressions
# handed back the very same Series object.
# NOTE(review): identity alone does not establish whether that Series is a view
# on the frame's data or a copy; something like np.shares_memory would test
# that directly -- confirm before relying on the conclusion below.
df_date["month"] is df_date.loc[:, "month"]

True

It seems that both of them give a **view**.

In [13]:
# Independent stratified 80/20 splits of the date table (stratified on month)
# and of the location table (stratified on latitude).
date_split = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=SEED)
geo_split = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=SEED)

# Each splitter produces three splits; only the last one is kept.
date_train_indices, date_test_indices = list(
    date_split.split(df_date, df_date["month"])
)[-1]
# Alternative kept for reference: stratify on df_geo["longitude"] instead.
geo_train_indices, geo_test_indices = list(
    geo_split.split(df_geo, df_geo["latitude"])
)[-1]

print(f"date_test_indices = {date_test_indices}")
print(f"geo_test_indices = {geo_test_indices}")

date_test_indices = [ 95 255 146 339 152  93 349  27 264 127  87 136 260  36 156 230  50 253
 257 205 222  73 211 238  19 128 358 319 299 101 111 302   5 213 310 151
 174  45 321 138 282  43  20 194 229 361 326 242 313 274 108 208  57  76
 132 116 311 297 240 179 280  10 243 209 184  79 173 350  37  63  74 344
   6]
geo_test_indices = [  550 44755 15477 ... 51936   457 58417]


In [14]:
# Check which container type the splitter hands back for the index sets.
type(geo_test_indices)

numpy.ndarray

In [15]:
# Positional row indices of df_date that ended up in the test split.
date_test_indices

array([ 95, 255, 146, 339, 152,  93, 349,  27, 264, 127,  87, 136, 260,
        36, 156, 230,  50, 253, 257, 205, 222,  73, 211, 238,  19, 128,
       358, 319, 299, 101, 111, 302,   5, 213, 310, 151, 174,  45, 321,
       138, 282,  43,  20, 194, 229, 361, 326, 242, 313, 274, 108, 208,
        57,  76, 132, 116, 311, 297, 240, 179, 280,  10, 243, 209, 184,
        79, 173, 350,  37,  63,  74, 344,   6])

In [16]:
# Sanity-check the stratification: class counts inside each test split.
test_month_counts = df_date.iloc[date_test_indices]['month'].value_counts()
test_latitude_counts = df_geo.iloc[geo_test_indices]['latitude'].value_counts()

print("df_date.iloc[date_test_indices]['month'].value_counts() =")
print(test_month_counts)
print()
print("df_geo.iloc[geo_test_indices]['latitude'].value_counts() =")
print(test_latitude_counts)

df_date.iloc[date_test_indices]['month'].value_counts() =
8     7
1     6
2     6
3     6
4     6
5     6
6     6
7     6
9     6
10    6
11    6
12    6
Name: month, dtype: int64

df_geo.iloc[geo_test_indices]['latitude'].value_counts() =
 0     72
-2     72
-11    72
 13    72
-19    72
       ..
-62    72
 66    72
-70    72
 74    72
-89    72
Name: latitude, Length: 179, dtype: int64


In [17]:
# Toy example of a cross merge: every row of df1 is paired with every row of df2.
df1 = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
df2 = pd.DataFrame({"col3": [5, 6]})

pd.merge(df1, df2, how="cross")

Unnamed: 0,col1,col2,col3
0,1,3,5
1,1,3,6
2,2,4,5
3,2,4,6


In [18]:
# Training set: every training date crossed with every training location.
train_dates = df_date.iloc[date_train_indices]
train_places = df_geo.iloc[geo_train_indices]
df_train = train_dates.merge(train_places, how="cross")
df_train

Unnamed: 0,month,day,latitude,longitude
0,11,25,-87,16
1,11,25,24,60
2,11,25,-84,165
3,11,25,83,8
4,11,25,-55,71
...,...,...,...,...
15053179,3,12,-89,97
15053180,3,12,25,103
15053181,3,12,78,34
15053182,3,12,-46,107


In [19]:
# Test set: every held-out date crossed with every held-out location.
test_dates = df_date.iloc[date_test_indices]
test_places = df_geo.iloc[geo_test_indices]
df_test = test_dates.merge(test_places, how="cross")
df_test

Unnamed: 0,month,day,latitude,longitude
0,4,6,-88,11
1,4,6,35,-64
2,4,6,-47,178
3,4,6,9,-139
4,4,6,-73,-60
...,...,...,...,...
940819,1,7,-71,-144
940820,1,7,39,-130
940821,1,7,55,-83
940822,1,7,-88,-82


In [20]:
# Small price list used below to try out row-wise DataFrame.apply.
df3 = pd.DataFrame(
    [
        ('refrigerator', 10000),
        ('fan', 5000),
        ('laptop', 15000),
        ('light_bulb', 2000),
    ],
    columns=['item', 'price'],
)
df3

Unnamed: 0,item,price
0,refrigerator,10000
1,fan,5000
2,laptop,15000
3,light_bulb,2000


In [21]:
discount = 0.1


def _discounted_price(row):
    """Return the row's price with `discount` applied, truncated to an int."""
    return int(row.price * (1 - discount))


# axis=1 makes apply call the function once per row.
df3['discounted'] = df3.apply(_discounted_price, axis=1)
df3

Unnamed: 0,item,price,discounted
0,refrigerator,10000,9000
1,fan,5000,4500
2,laptop,15000,13500
3,light_bulb,2000,1800


In [22]:
%%time
# Label the training rows: rule3 is called once per row (axis=1) and the
# resulting Series is unwrapped into a NumPy array.
# NOTE(review): rule3 is not defined in the visible cells; presumably it comes
# from the star-imports above -- confirm what it expects from each row.
y_rule3_train = df_train.apply(rule3, axis=1).values
y_rule3_train

CPU times: user 5min 29s, sys: 1.55 s, total: 5min 30s
Wall time: 5min 33s


array([0, 2, 0, ..., 3, 1, 1])

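**(?)** The above cell takes a long time to execute. Is there any way to accelerate it? (Row-wise `DataFrame.apply(..., axis=1)` calls the Python function once per row, so a vectorized version of `rule3` built from column-wise NumPy/pandas operations would be the first thing to try.)<br>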

In [23]:
%%time
# Label the test rows the same way as the training rows: one rule3 call per row.
y_rule3_test = df_test.apply(rule3, axis=1).values
y_rule3_test

CPU times: user 21.4 s, sys: 75.2 ms, total: 21.5 s
Wall time: 21.6 s


array([2, 0, 2, ..., 3, 1, 3])

In [24]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, recall_score

In [25]:
# Single (non-ensemble) classifiers compared in this notebook.
log_clf = LogisticRegression()
# Seeded like sgd_clf: decision-tree fitting involves randomness, so
# random_state must be fixed for the reported scores to be reproducible.
tree_clf = DecisionTreeClassifier(random_state=SEED)
#svm_clf = SVC(probability=True)
# Instantiated but deliberately left out of T_classifiers below.
svm_clf = SVC()
#rnd_clf = RandomForestClassifier()
sgd_clf = SGDClassifier(random_state=SEED)
T_classifiers = (
    log_clf,
    tree_clf,
    #svm_clf,
    sgd_clf
)

I have commented out `svm_clf` because it takes too long to train (something like more than 2 hours on Aero).

In [28]:
from tqdm.notebook import tqdm

In [31]:
%%time
for clf in T_classifiers:
    clf.fit(df_train, y_rule3_train)
    #clf.fit(df_train, df_y_rule3_train)
    y_pred = clf.predict(df_test)

    acc = accuracy_score(y_rule3_test, y_pred)
    precision = precision_score(y_rule3_test, y_pred, average="weighted")
    recall = recall_score(y_rule3_test, y_pred, average="weighted")

    print(f"({clf.__class__.__name__})\nacc = {acc:.4f}, precision = {precision:.4f}, recall = {recall:.4f}\n")

(LogisticRegression)
acc = 0.3090, precision = 0.2980, recall = 0.3090

(DecisionTreeClassifier)
acc = 0.9726, precision = 0.9740, recall = 0.9726

(SGDClassifier)
acc = 0.2664, precision = 0.2253, recall = 0.2664

CPU times: user 15min 40s, sys: 59.6 s, total: 16min 40s
Wall time: 12min 38s


In [32]:
# Same metrics, measured on the training data (the classifiers are not refitted here).
average = "weighted"
for clf in T_classifiers:
    y_pred = clf.predict(df_train)
    acc = accuracy_score(y_rule3_train, y_pred)
    precision, recall = (
        score(y_rule3_train, y_pred, average=average)
        for score in (precision_score, recall_score)
    )
    print(f"({clf.__class__.__name__})\nacc = {acc:.4f}, precision = {precision:.4f}, recall = {recall:.4f}\n")

  0%|          | 0/3 [00:00<?, ?it/s]

(LogisticRegression)
acc = 0.3540, precision = 0.3601, recall = 0.3540

(DecisionTreeClassifier)
acc = 1.0000, precision = 1.0000, recall = 1.0000

(SGDClassifier)
acc = 0.2738, precision = 0.2331, recall = 0.2738



## To What Extent Can Considering Cyclic Nature Help?

In [None]:
# Display the feature matrix used by the version-2/3 experiments below.
# NOTE(review): X is not assigned in any visible cell of this notebook;
# presumably it comes from the star-imports or an earlier notebook version --
# confirm, otherwise this cell fails on a fresh kernel.
X

In [None]:
def _season_by_month(month):
    """Rule 1: the season is decided by the month alone (whole-month seasons)."""
    if MAR <= month <= MAY:
        return SPRING
    if JUN <= month <= AUG:
        return SUMMER
    if SEP <= month <= NOV:
        return AUTUMN
    return WINTER


# The day component of each entry is ignored under rule 1.
y_rule1 = [_season_by_month(month) for month, _day in L_month_day]
# Peek at every 31st label (the final entry is excluded by the -1 stop).
y_rule1[0:-1:31]

## How to Split Train/Test sets?

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Convert the label list to an ndarray so it can be indexed with index arrays below.
y_rule1 = np.array(y_rule1)

In [None]:
# Stratify on the first column of X; of the three generated splits only the
# last one is kept.
split = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=SEED)
train_indices, test_indices = list(split.split(X, X[:, 0]))[-1]

X_train_raw, X_test_raw = X[train_indices, :], X[test_indices, :]
y_rule1_train, y_rule1_test = y_rule1[train_indices], y_rule1[test_indices]

## First Dataset, First Model
Just take `X_train_raw` and `X_test_raw` to train a few ML models and see what that gives.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, recall_score

In [None]:
# Classifier line-up for the version-2/3 experiments: logistic regression,
# a decision tree and an SVM, all with default settings.
T_classifiers = (LogisticRegression(), DecisionTreeClassifier(), SVC())
log_clf, tree_clf, svm_clf = T_classifiers

In [None]:
# Fit on the raw (month, day) features with rule-1 labels and score on the test split.
for clf in T_classifiers:
    y_pred = clf.fit(X_train_raw, y_rule1_train).predict(X_test_raw)

    acc = accuracy_score(y_rule1_test, y_pred)

    # Why average="weighted" for precision/recall on this multiclass target:
    #   - omitting `average` raised
    #     "ValueError: Target is multiclass but average='binary'. Please choose
    #     another average setting, one of [None, 'micro', 'macro', 'weighted']."
    #   - average=None made the f-string below fail with
    #     "TypeError: unsupported format string passed to numpy.ndarray.__format__"
    precision = precision_score(y_rule1_test, y_pred, average="weighted")
    recall = recall_score(y_rule1_test, y_pred, average="weighted")

    print(f"({clf.__class__.__name__})\nacc = {acc:.2f}, precision = {precision:.2f}, recall = {recall:.2f}\n")



In [None]:
# What about performance on the training data?
# Precision and recall use average="weighted", the same averaging as the
# test-set evaluation cell, so train and test numbers are directly comparable.
for clf in T_classifiers:
    # Refit so this cell does not depend on which labels the classifiers last saw.
    clf.fit(X_train_raw, y_rule1_train)
    y_pred = clf.predict(X_train_raw)
    acc = accuracy_score(y_rule1_train, y_pred)
    precision = precision_score(y_rule1_train, y_pred, average="weighted")
    recall = recall_score(y_rule1_train, y_pred, average="weighted")
    print(f"({clf.__class__.__name__})\nacc = {acc:.2f}, precision = {precision:.2f}, recall = {recall:.2f}\n")

We see that

- Random forest performs perfectly. So does the decision tree.

Maybe the task is too simple. Even without taking the cyclic nature into consideration, some of the classifiers
can already reach near perfect performance.


# Season Dataset (Version 3)
In this version, we will have exactly the same features as in version 2.<br>
The only change is that we set a different rule to increase the difficulty:

- Spring: 15th March to 14th June (inclusive)
- Summer: 15th June to 14th September
- Autumn: 15th September to 14th December
- Winter: 15th December to 14th March

**N.B.** Note that we just need to create a new label, `y_rule2`. No need to modify `X`.

In [None]:
# Rule 2: seasons run from the 15th of one month to the 14th of a later month.
start = 15
end = 14

# (first day, last day, season) windows, checked in order; both bounds inclusive.
_RULE2_WINDOWS = (
    ([MAR, start], [JUN, end], SPRING),
    ([JUN, start], [SEP, end], SUMMER),
    ([SEP, start], [DEC, end], AUTUMN),
)


def _season_rule2(month_day):
    """Return the season of a [month, day] entry; WINTER when no window matches."""
    for first_day, last_day, season in _RULE2_WINDOWS:
        if first_day <= month_day <= last_day:
            return season
    return WINTER


y_rule2 = [_season_rule2(month_day) for month_day in L_month_day]

# Peek at every 31st label (the final entry is excluded by the -1 stop).
y_rule2[0:-1:31]

In [None]:
# Convert the label list to an ndarray so it can be indexed with index arrays below.
y_rule2 = np.array(y_rule2)

In [None]:
# Reuse the existing train/test indices so rule-2 labels line up with X_train_raw / X_test_raw.
y_rule2_train, y_rule2_test = y_rule2[train_indices], y_rule2[test_indices]

In [None]:
# Refit every classifier on the raw features with rule-2 labels; score on the test split.
# NOTE(review): with average="micro" on a single-label multiclass target,
# precision and recall are expected to coincide with accuracy -- confirm
# whether "weighted" (as used for rule 1) was intended.
for clf in T_classifiers:
    y_pred = clf.fit(X_train_raw, y_rule2_train).predict(X_test_raw)
    acc = accuracy_score(y_rule2_test, y_pred)
    precision, recall = (
        score(y_rule2_test, y_pred, average="micro")
        for score in (precision_score, recall_score)
    )
    print(f"({clf.__class__.__name__})\nacc = {acc:.4f}, precision = {precision:.4f}, recall = {recall:.4f}\n")

In this case, our <s>random forest</s> and decision tree classifiers are no longer perfect (even though they still perform with high accuracy).

In [None]:
# On training set
for clf in T_classifiers:
    y_pred = clf.predict(X_train_raw)
    acc = accuracy_score(y_rule2_train, y_pred)
    precision = precision_score(y_rule2_train, y_pred, average="micro")
    recall = recall_score(y_rule2_train, y_pred, average="micro")    
    print(f"({clf.__class__.__name__})\nacc = {acc:.4f}, precision = {precision:.4f}, recall = {recall:.4f}\n")

Let's see whether incorporating the cyclic nature improves the situation.

## Cyclic Nature

In [None]:
from utils import *

In [None]:
# Build the cyclic feature matrix by passing each column of X through
# cyclicize_series and concatenating the results column-wise.
# NOTE(review): cyclicize_series is not defined in the visible cells; presumably
# it comes from `utils` and encodes a periodic value over [min_, max_] --
# confirm its output shape and whether min_=0 is right for 1-based months/days.
X_cyclic = np.c_[
    cyclicize_series(X[:, 0], max_=12, min_=0),
    cyclicize_series(X[:, 1], max_=31, min_=0),
]

In [None]:
# Same train/test rows as the raw features, taken from the cyclic encoding.
X_train_cyclic, X_test_cyclic = X_cyclic[train_indices], X_cyclic[test_indices]

In [None]:
# Refit every classifier on the cyclic features with rule-2 labels; score on the test split.
for clf in T_classifiers:
    y_pred = clf.fit(X_train_cyclic, y_rule2_train).predict(X_test_cyclic)
    acc = accuracy_score(y_rule2_test, y_pred)
    precision, recall = (
        score(y_rule2_test, y_pred, average="micro")
        for score in (precision_score, recall_score)
    )
    print(f"({clf.__class__.__name__})\nacc = {acc:.4f}, precision = {precision:.4f}, recall = {recall:.4f}\n")

In [None]:
# Training-set scores for the rule-2 / cyclic-feature models (no refit in this cell).
for clf in T_classifiers:
    y_pred = clf.predict(X_train_cyclic)
    acc = accuracy_score(y_rule2_train, y_pred)
    precision, recall = (
        score(y_rule2_train, y_pred, average="micro")
        for score in (precision_score, recall_score)
    )
    print(f"({clf.__class__.__name__})\nacc = {acc:.4f}, precision = {precision:.4f}, recall = {recall:.4f}\n")

## Putting Together for Comparison

In [None]:
# Rule 2, non-cyclic: raw (month, day) features, scored on the test split.
for clf in T_classifiers:
    y_pred = clf.fit(X_train_raw, y_rule2_train).predict(X_test_raw)
    acc = accuracy_score(y_rule2_test, y_pred)
    precision, recall = (
        score(y_rule2_test, y_pred, average="micro")
        for score in (precision_score, recall_score)
    )
    print(f"({clf.__class__.__name__})\nacc = {acc:.4f}, precision = {precision:.4f}, recall = {recall:.4f}\n")

In [None]:
# Rule 2, cyclic: cyclic-encoded features, scored on the test split.
for clf in T_classifiers:
    y_pred = clf.fit(X_train_cyclic, y_rule2_train).predict(X_test_cyclic)
    acc = accuracy_score(y_rule2_test, y_pred)
    precision, recall = (
        score(y_rule2_test, y_pred, average="micro")
        for score in (precision_score, recall_score)
    )
    print(f"({clf.__class__.__name__})\nacc = {acc:.4f}, precision = {precision:.4f}, recall = {recall:.4f}\n")