In [1]:
'''Linear algebra'''
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

'''Data visualisation'''
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.gofplots import qqplot # cf. 5.2

'''Preprocessing'''
from sklearn.impute import SimpleImputer # cf. 5.2
from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler,LabelEncoder,OneHotEncoder # cf. 5.2

'''Deep learning'''
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, LSTM, RNN, SimpleRNN
from tensorflow.keras.preprocessing.sequence import pad_sequences, TimeseriesGenerator
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, LambdaCallback

'''Pipeline'''
from sklearn.pipeline import Pipeline,FeatureUnion # cf. 5.6
from sklearn.compose import ColumnTransformer,make_column_selector # cf. 5.6
from sklearn.preprocessing import FunctionTransformer # cf. 5.6
from sklearn.base import TransformerMixin,BaseEstimator # cf. 5.6

# Data preprocessing

## Loading dataset

In [2]:
dataset = pd.read_csv('../raw_data/ytrain_raw.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
dataset_cleaned = dataset.fillna('Down')
dataset_cleaned['timestamp'] = pd.to_datetime(dataset_cleaned['timestamp'])
dataset_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31232 entries, 0 to 31231
Columns: 274 entries, timestamp to S99-T2
dtypes: datetime64[ns, UTC](1), object(273)
memory usage: 65.3+ MB


In [4]:
dataset_cleaned = dataset.set_index('timestamp').fillna('Down')
dataset_cleaned = dataset_cleaned[:10000]

In [5]:
for col in dataset_cleaned.columns:
    dataset_cleaned[col] = dataset_cleaned[col].map({'Available':0, 'Charging':1, 'Down':0, 'Offline':0, 'Passive':0})

In [6]:
# Rename columns
borns_name = list(dataset_cleaned.columns)
stations_name = list(dataset_cleaned.columns.map(lambda x:x[:-3]))
cols_dict = {borns_name[i]: stations_name[i] for i in range(len(borns_name))}

df_stations = dataset_cleaned.rename(columns=cols_dict)

In [7]:
df_stations = df_stations.groupby(level=0,axis=1).sum()

In [8]:
df_stations.head()

Unnamed: 0_level_0,S1,S10,S11,S12,S13,S14,S15,S16,S17,S18,S19,S2,S20,S21,S22,S23,S24,S25,S26,S27,S28,S29,S3,S30,S31,S32,S33,S34,S35,S36,S37,S38,S39,S4,S41,S42,S45,S46,S47,S48,S49,S5,S50,S51,S52,S53,S56,S57,S58,S59,S6,S60,S62,S63,S64,S65,S66,S67,S68,S69,S7,S70,S71,S72,S74,S75,S76,S77,S78,S79,S8,S80,S81,S82,S83,S84,S85,S86,S87,S88,S89,S9,S91,S92,S93,S94,S95,S96,S97,S98,S99
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1
2019-11-25T00:00:00+00:00,0,0,0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,3,0,1,0,0,0,2,0,0,0,0,0,2,0,0,1,0,0,0,0,0,1,1,0,0,0,0,2,0,0,0,2,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0
2019-11-25T00:15:00+00:00,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,3,0,0,0,0,0,2,0,0,0,0,0,2,0,0,1,0,0,1,0,0,1,1,0,0,0,1,2,0,0,0,2,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2019-11-25T00:30:00+00:00,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,2,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,2,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2019-11-25T00:45:00+00:00,0,0,0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,2,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2019-11-25T01:00:00+00:00,0,0,0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,3,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,2,1,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


## Clustering

In [9]:
X_train = np.expand_dims(df_stations.to_numpy().T,axis=-1)

In [10]:
print(X_train.shape)

(91, 10000, 1)


In [None]:
%%time
from tslearn.clustering import TimeSeriesKMeans
km = TimeSeriesKMeans(n_clusters=5, metric="softdtw")
labels = km.fit_predict(X_train)
labels

In [None]:
df_clustering = df_stations.T
df_clustering

In [None]:
df_clustering['cluster_group'] = labels
df_clustering

In [None]:
df_clustering.groupby('cluster_group').mean().T.plot()
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()

In [None]:
df_clustering.groupby('cluster_group').count()