In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

import os
import re

In [3]:
# Enable matplotlib's interactive GUI integration. No backend is named here, so
# IPython picks the default one (the recorded output of this cell shows MacOSX).
%matplotlib

Using matplotlib backend: MacOSX


In [4]:
# Load files
def load_file(path):
    """Read one semicolon-separated CSV file into a time-indexed DataFrame.

    Parameters
    ----------
    path : str
        Location of the CSV file. Its first row is the header and it must
        contain a 'timestamp' column.

    Returns
    -------
    pandas.DataFrame
        The file content, indexed by the parsed 'timestamp' column.
    """
    # `infer_datetime_format` is deprecated since pandas 2.0 (format inference
    # is now the default behaviour), so it is no longer passed.
    return pd.read_csv(path, sep=";", header=0, parse_dates=['timestamp'], index_col=['timestamp'])

In [5]:
# Define dataset to import
# Ids that get_selected_ids always removes from its result, whatever the selection.
ignore_ids = [223, 45, 19, 105, 75, 63, 58, 59]
# Folder holding metadata.csv and the "<id>.csv" data files; the path is
# relative, so it depends on the notebook's working directory.
directory = "../data/processed/batiments/"
def get_selected_ids(selection_):
    """Return the 'bat_id' values of the metadata rows matching a selection.

    The metadata table is read from ``directory + "metadata.csv"``. Each key
    of ``selection_`` adds one filter on a metadata column:

    - ``max_<col>``: keep rows where ``<col> <= value`` (skipped when None);
    - ``min_<col>``: keep rows where ``<col> >= value`` (skipped when None);
    - ``in_<col>``: keep rows whose ``<col>`` is in the given collection
      (skipped when the collection is empty/falsy);
    - any other key: keep rows where that column equals the value.

    Ids listed in the notebook-level ``ignore_ids`` are dropped at the end.

    Parameters
    ----------
    selection_ : dict
        Mapping from filter key to filter value, as described above.

    Returns
    -------
    numpy.ndarray
        The remaining 'bat_id' values.
    """
    table = pd.read_csv(directory + "metadata.csv", sep=';')

    for key, wanted in selection_.items():
        if key.startswith("max_"):
            # Upper bound on the column named after the prefix
            if wanted is not None:
                table = table[table[key[4:]] <= wanted]
        elif key.startswith("min_"):
            # Lower bound on the column named after the prefix
            if wanted is not None:
                table = table[table[key[4:]] >= wanted]
        elif key.startswith("in_"):
            # Membership test; an empty collection means "no restriction"
            if wanted:
                table = table[table[key[3:]].isin(wanted)]
        else:
            # Plain key: exact match on that column
            table = table[table[key] == wanted]

    # Manual exclusions are applied last
    kept = table[~table["bat_id"].isin(ignore_ids)]
    return kept["bat_id"].values

def get_list_of_datasets(ids_):
    """Load every "<id>.csv" file of ``directory`` whose id is requested.

    Parameters
    ----------
    ids_ : iterable of int
        Ids to load (for instance the array returned by get_selected_ids).

    Returns
    -------
    (list, list)
        ``list_of_datasets_`` holds what ``load_file`` returned for each
        matching file and ``list_of_ids_`` the corresponding ids, at the same
        positions. Both are ordered by increasing id so the result does not
        depend on the arbitrary order of ``os.listdir``.
    """
    wanted_ids = set(ids_)  # constant-time membership test for each file
    matches = []
    for filename_ in os.listdir(directory):
        # The dot is escaped so that only real "<digits>.csv" names match;
        # the unescaped pattern also accepted names such as "3xcsv", which
        # then failed in int().
        found = re.fullmatch(r"([0-9]+)\.csv", filename_)
        if found is None:
            continue
        file_id = int(found.group(1))
        if file_id in wanted_ids:
            matches.append((file_id, filename_))
    matches.sort()

    list_of_datasets_ = [load_file(directory + filename_) for _, filename_ in matches]
    list_of_ids_ = [file_id for file_id, _ in matches]
    return list_of_datasets_, list_of_ids_

##### Les ids suivants ont été relevés manuellement après affichage de tous les datasets ; ils pourront donc être exclus si nécessaire.

Ids classés en fonction du pas de temps (`time_step`, en minutes) :
- 15
    - à traiter
        - 173, 89, 149, 298, 115, 261, 287, 119, 108, 109, 232, 190, 41, 6
    - bizarres
        - 204, 11, 302, 33, 34, 233, 92, 191, 46, 221, 194, 5, 197, 141, 237
    - courts
        - 65, 205, 238, 48, 305, 11, 246, 34, 257, 23, 44, 225, 194, 5, 43, 142
- 60
    - à traiter
        - 198, 199, 60, 100, 16, 27, 26, 134, 20, 93, 186, 152, 53, 185, 253, 209, 96
    - bizarres
        - 98, 77, 203, 259, 112, 39, 117, 250, 297, 192, 195, 234, 196
    - courts
        - 159, 239, 177, 264, 266, 263, 288, 101, 278, 251, 297, 192, 51, 224, 85, 236, 196


In [None]:
# Selection criteria passed to get_selected_ids. A plain key is the name of a
# metadata column whose value must match; put min_, max_ or in_ before the
# column name to make a condition (None or an empty list disables it).
selection = {'time_step': 60,
             'min_bat_id': None,
             'max_bat_id': None,
             'in_bat_id': []}

# Ids flagged by hand for time step 60: the "à traiter", "bizarres" and
# "courts" lists of the markdown cell above, one list per row.
_ignore_60_ids = [198, 199, 60, 100, 16, 27, 26, 134, 20, 93, 186, 152, 53, 185, 253, 209, 96,
                  98, 77, 203, 259, 112, 39, 117, 250, 297, 192, 195, 234, 196,
                  159, 239, 177, 264, 266, 263, 288, 101, 278, 251, 297, 192, 51, 224, 85, 236, 196]

# Same three manual lists for time step 15.
_ignore_15_ids = [173, 89, 149, 298, 115, 261, 287, 119, 108, 109, 232, 190, 41, 6,
                  204, 11, 302, 33, 34, 233, 92, 191, 46, 221, 194, 5, 197, 141, 237,
                  65, 205, 238, 48, 305, 11, 246, 34, 257, 23, 44, 225, 194, 5, 43, 142]

# Ids of the metadata rows matching `selection`
ids = get_selected_ids(selection)

# If you want to ignore certain ids uncomment next line
# (it uses the 60 list; switch to _ignore_15_ids when selecting time step 15)
#ids = [id_ for id_ in ids if id_ not in _ignore_60_ids]

# Load the matching files; list_of_ids[k] is the id of list_of_datasets[k]
list_of_datasets, list_of_ids = get_list_of_datasets(ids)

In [65]:
# Return last full week that goes monday to sunday
def get_last_full_week(df, step):
    """Extract the last complete Monday-to-Sunday week of a time series.

    Parameters
    ----------
    df : pandas.DataFrame
        Data indexed by chronologically sorted timestamps.
    step : int
        Sampling period in minutes; a week is ``7*24*60 // step`` rows.

    Returns
    -------
    pandas.DataFrame
        The rows of the last week that ends before the Monday 00:00 of the
        week containing the final sample. An empty DataFrame is returned when
        ``df`` is empty or when fewer rows than a whole week are available,
        mirroring get_last_full_year so callers can test ``.empty``.
    """
    if len(df.index) == 0:
        return pd.DataFrame()

    # Monday 00:00 of the (possibly partial) week holding the last sample
    last_df_date = df.index[-1]
    end_selection_date = last_df_date.normalize() - pd.Timedelta(days=last_df_date.weekday())

    rows_per_week = 7 * 24 * 60 // step
    # Slice the *filtered* frame: slicing `df` itself would bring the partial
    # last week back in.
    selection_ = df[df.index < end_selection_date].iloc[-rows_per_week:]
    if len(selection_) != rows_per_week:
        return pd.DataFrame()

    return selection_

# Return last full year 1 jan to 31 dec
def get_last_full_year(df, step):
    """Extract the last complete calendar year of a time series.

    Parameters
    ----------
    df : pandas.DataFrame
        Data indexed by chronologically sorted timestamps.
    step : int
        Sampling period in minutes; a year is ``365*24*60 // step`` rows.

    Returns
    -------
    pandas.DataFrame
        The last 365 days of rows belonging to the year preceding the final
        sample's year, or an empty DataFrame when ``df`` is empty or fewer
        rows than that are available.

    Notes
    -----
    The window is always 365 days long so that every extracted series has the
    same length; for a leap year it therefore presumably starts on 2 January
    (assuming gap-free data).
    """
    if len(df.index) == 0:
        return pd.DataFrame()

    selection_year = df.index[-1].year - 1
    dim = 365 * 24 * 60 // step
    # Slice the *filtered* frame: slicing `df` itself would return the most
    # recent rows, including the current incomplete year.
    selection_ = df[df.index.year <= selection_year].iloc[-dim:]
    if len(selection_) != dim:
        return pd.DataFrame()
    return selection_

In [47]:
# Build one min-max scaled 'active_power' vector per dataset, taken over the
# period named by time_step ("week" or "year")
time_step = "week"

scalers = []        # fitted scaler of each retained dataset
values = []         # scaled vector of each retained dataset
datasets_used = []  # position in list_of_datasets of each retained dataset
for i, dataset_ in enumerate(list_of_datasets):
    if time_step == "year":
        df = get_last_full_year(dataset_, selection['time_step'])
    elif time_step == "week":
        df = get_last_full_week(dataset_, selection['time_step'])
    else:
        raise Exception("time_step is not 'week' or 'year' but {} instead".format(time_step))

    # Datasets for which no window could be extracted are left out
    if df.empty:
        continue

    week_values = df['active_power'].values
    datasets_used.append(i)
    # One scaler per dataset: every series is rescaled to [0, 1] on its own
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_week_values = scaler.fit_transform(week_values.reshape(-1, 1)).flatten()
    scalers.append(scaler)
    values.append(scaled_week_values)

In [87]:
# Run clustering
n_clusters = 8
# random_state fixes the centroid initialisation so that re-running the cell
# gives the same clusters; n_init is set explicitly because its default value
# differs between scikit-learn versions.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(values)

In [88]:
# Group the dataset ids by the cluster label assigned to their series:
# clusters_ids[k] lists the ids that fell into cluster k
clusters_ids = [[] for _ in range(n_clusters)]
for i, label in enumerate(kmeans.labels_):
    # values[i] was built from list_of_datasets[datasets_used[i]]
    clusters_ids[label].append(list_of_ids[datasets_used[i]])
    # Uncomment next two lines to plot every dataset
    #plt.figure("{} - id:{}".format(label, list_of_ids[datasets_used[i]]))
    #plt.plot(values[i])
print(clusters_ids)