In [48]:
import pandas as pd
import numpy as np
import random
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [49]:
# Load the long-format series and the per-item static metadata.
df = pd.read_csv("../possible_datasets/M4/train.csv")
static_features_df = pd.read_csv("../possible_datasets/M4/metadata.csv")

# SMOTE interpolates between a sample and its nearest same-class neighbours
# (imbalanced-learn default: k_neighbors=5), so a class needs more members
# than k_neighbors. Items of the "Macro" domain are removed from both frames
# for that reason (presumably too few items -- verify against metadata.csv).
is_macro = static_features_df.domain == "Macro"
drop_ids = static_features_df.loc[is_macro, "item_id"]
static_features_df = static_features_df.loc[~static_features_df.item_id.isin(drop_ids)]
df = df.loc[~df.item_id.isin(drop_ids)]

# pandas weekday numbering is Monday=0 ... Sunday=6, so 5 and 6 are Sat/Sun.
WEEKEND_INDICES = [5, 6]
timestamps = df["timestamp"].astype("datetime64[ns]")
df["weekend"] = timestamps.dt.weekday.isin(WEEKEND_INDICES)

# Attach the static features (e.g. domain) to every observation of an item.
df_merged = df.merge(static_features_df, on="item_id")

# Oversample the minority domains with SMOTE

In [134]:
# Work on a copy so df_merged itself is not modified by the following cells.
m4_data = df_merged.copy()
# Series per domain: take the first domain label of every item, then count labels.
m4_data.groupby("item_id")["domain"].first().value_counts()

domain
Micro       43
Finance     36
Industry    11
Other        8
Name: count, dtype: int64

Bring the under-represented domains `Other` and `Industry` up to 30 series each (after pivoting, one series is one data point for SMOTE).

In [135]:
# Cut every series down to a fixed-length window and number its rows with a
# per-series `time_index` (0, 1, 2, ...).
#
# The window start is shifted by (7 - weekday of the first row). Assuming one
# row per consecutive calendar day, that makes every window start on a Monday,
# so a given time_index refers to the same weekday in every series. A series
# that already starts on a Monday skips a full week.
# NOTE(review): a series with fewer rows than start + WINDOW_LENGTH yields a
# shorter window, which becomes NaN columns once the data is pivoted -- confirm
# that every series is long enough.
DAYS_PER_WEEK = 7
WINDOW_LENGTH = 100

aligned_windows = []
# groupby splits the frame in a single pass (instead of one boolean-mask scan
# per item); sort=False keeps items in order of first appearance, matching
# iteration over `item_id.unique()`.
for _, series_rows in m4_data.groupby("item_id", sort=False):
    # Positional access: the group keeps the original row labels.
    first_day = pd.to_datetime(series_rows["timestamp"]).dt.weekday.iloc[0]
    start = DAYS_PER_WEEK - first_day
    window = series_rows.iloc[start:start + WINDOW_LENGTH, :]
    # Drop the old labels, then expose the fresh 0..k-1 index as `time_index`.
    aligned_windows.append(
        window.reset_index(drop=True).reset_index(names="time_index")
    )

# Concatenate once at the end: growing a DataFrame inside the loop copies all
# previously collected rows on every iteration.
m4_reduced = pd.concat(aligned_windows, axis=0) if aligned_windows else pd.DataFrame()
m4_reduced

Unnamed: 0,time_index,item_id,timestamp,target,weekend,domain
0,0,D1737,1995-05-29,1862.0,False,Industry
1,1,D1737,1995-05-30,1863.0,False,Industry
2,2,D1737,1995-05-31,1858.0,False,Industry
3,3,D1737,1995-06-01,1858.0,False,Industry
4,4,D1737,1995-06-02,1856.0,False,Industry
...,...,...,...,...,...,...
95,95,D2345,2004-01-09,4873.3,False,Finance
96,96,D2345,2004-01-10,4822.0,True,Finance
97,97,D2345,2004-01-11,5143.7,True,Finance
98,98,D2345,2004-01-12,4846.0,False,Finance


In [136]:
# Convert `timestamp` to Unix epoch seconds (float).
#
# Subtracting the epoch and dividing by a one-second Timedelta gives seconds
# regardless of the datetime resolution pandas picked when parsing; casting to
# int64 and dividing by 1e9 is only correct for nanosecond resolution.
# Assumes the timestamps are timezone-naive (plain dates in the data shown).
#
# The dtype guard makes the cell safe to re-run: without it a second run would
# interpret the already-converted float seconds as nanoseconds since the epoch
# and silently overwrite the column with values near zero.
if not pd.api.types.is_numeric_dtype(m4_reduced["timestamp"]):
    m4_reduced["timestamp"] = (
        pd.to_datetime(m4_reduced["timestamp"]) - pd.Timestamp("1970-01-01")
    ) / pd.Timedelta(seconds=1)

In [137]:
# Display the frame again to check the `timestamp` column after its conversion
# to numeric epoch values in the previous cell.
m4_reduced

Unnamed: 0,time_index,item_id,timestamp,target,weekend,domain
0,0,D1737,8.017056e+08,1862.0,False,Industry
1,1,D1737,8.017920e+08,1863.0,False,Industry
2,2,D1737,8.018784e+08,1858.0,False,Industry
3,3,D1737,8.019648e+08,1858.0,False,Industry
4,4,D1737,8.020512e+08,1856.0,False,Industry
...,...,...,...,...,...,...
95,95,D2345,1.073606e+09,4873.3,False,Finance
96,96,D2345,1.073693e+09,4822.0,True,Finance
97,97,D2345,1.073779e+09,5143.7,True,Finance
98,98,D2345,1.073866e+09,4846.0,False,Finance


In [138]:
# Reshape to wide format: one row per series, one column per time_index holding
# the target value, plus the `domain` label. `item_id` is dropped afterwards so
# that only the numeric time-step columns and the label remain.
# NOTE: this overwrites the long-format `m4_reduced`; re-running this cell
# requires re-running the cells that build the long format first.
m4_reduced = (
    m4_reduced
    .pivot(index=["item_id", "domain"], columns="time_index", values="target")
    .reset_index()
    .drop("item_id", axis=1)
)

# NOTE: despite its name, `m4_scaled` holds the unscaled values. SMOTE picks
# neighbours by distance in this raw feature space, so series with large
# absolute levels presumably dominate the neighbour search.
m4_scaled = m4_reduced.copy()

# Oversample only the domains that have fewer than the target number of series,
# and only up to that target. SMOTE's default sampling_strategy="auto" would
# instead raise every non-majority domain to the size of the largest domain.
TARGET_SERIES_PER_DOMAIN = 30
domain_counts = m4_scaled["domain"].value_counts()
sampling_strategy = {
    domain: TARGET_SERIES_PER_DOMAIN
    for domain, count in domain_counts.items()
    if count < TARGET_SERIES_PER_DOMAIN
}

sm = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
X_res, y_res = sm.fit_resample(m4_scaled.drop("domain", axis=1), m4_scaled["domain"])

In [144]:
# Re-attach the domain labels to the resampled features as an extra column.
df_synth = pd.concat(objs=(X_res, y_res), axis="columns")

In [155]:
# Back to long format: one row per (series, time step).
# The row position in the resampled frame is used as `item_id`; the original
# M4 item ids were dropped before resampling, so these ids are new.
df_long = (
    df_synth
    .rename_axis("item_id")
    .reset_index()
    .melt(
        id_vars=["item_id", "domain"],
        var_name="TimeIndex",
        value_name="Target",
    )
)

In [159]:
# Order rows by series and then by time step, with a fresh 0..n-1 row index.
df_long = df_long.sort_values(["item_id", "TimeIndex"], ignore_index=True)

In [162]:
# Plot every series (original and SMOTE-generated) as one line, colored by domain.

# fixed colors for four domains; any other domain falls back to grey
color_map = {
    "Industry": "#1f77b4",
    "Finance":  "#ff7f0e",
    "Micro":    "#2ca02c",
    "Other":    "#d62728"
}
FALLBACK_COLOR = "#7f7f7f"

fig = go.Figure()

# groupby(sort=False) visits domains and items in order of first appearance,
# and splits the frame once instead of re-filtering it for every domain/item.
for dom, df_domain in df_long.groupby("domain", sort=False):
    for idx, (_, df_id) in enumerate(df_domain.groupby("item_id", sort=False)):
        df_id = df_id.sort_values(by="TimeIndex")
        fig.add_trace(
            go.Scatter(
                x=df_id.TimeIndex,
                y=df_id.Target,
                mode='lines',
                name=dom,
                # All traces of a domain share one legend group, so they toggle
                # together; only the first trace of each domain gets an entry.
                legendgroup=dom,
                showlegend=(idx == 0),
                line=dict(color=color_map.get(dom, FALLBACK_COLOR))
            )
        )

# Title and axis labels so the figure can be read on its own.
fig.update_layout(
    title="M4 series after SMOTE oversampling, by domain",
    xaxis_title="Time index",
    yaxis_title="Target",
    legend_title_text="Domain",
)

fig.show()