In [None]:
The task of time series classification is reduced to a standard supervised classification problem by 
subdividing the data into time windows, and calculating features describing each time window. Classification of each time 
window based on the features is then performed.
 
    Data Preprocessing
- maknuti hand drawn data
- vremenske serija svakih 10 sekundi
    
    Feature engineering
- The choice of time window size is an important hyperparameter as it controls the size of the new dataset 
    and the amount of information in the features
- TSFRESH paket za vremenske serije
- scikitlearn
    
    Feature selection
- Regardless of the method of feature selection, before classification all features should be normalised by 
    subtracting the mean and scaling by the standard deviation of the data
    
    Classification
- The classifier implementations in Scikit-Learn are used, with the Stochastic Gradient Descent optimization technique used for 
    logistic regression and SVC.
- When comparing hyperparameters and model performance it is important to validate correctly to have an unbiased estimate 
    of the classifiers’ performance [14]. To this end 30% of the original events are split into a test set before the work flow.
- The Fl-score is used as the metric to decide the best hyper-parameters and classifier, with the balanced accuracy also 
    reported for additional information.

    
The 3W dataset consists of 1,984 CSV files structured as follows. The subdirectory names are the instances' 
labels. Each file represents one instance. The filename reveals its source. All files are standardized as follow. 
There are one observation per line and one series per column. Columns are separated by commas and decimals are 
separated by periods. The first column contains timestamps, the last one reveals the observations' labels, 
and the other columns are the Multivariate Time Series (MTS) (i.e. the instance itself).


SyntaxError: invalid syntax (<ipython-input-1-aac18f85b863>, line 1)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import glob
import os
import scipy
from pathlib import Path
from multiprocessing.dummy import Pool as ThreadPool
from collections import defaultdict
from natsort import natsorted
#from tsfresh.utilities.dataframe_functions import roll_time_series
from tsfresh import extract_features
from sklearn.model_selection import train_test_split
from tsfresh.feature_extraction import ComprehensiveFCParameters
from tsfresh.feature_extraction import extract_features
from sklearn import preprocessing

  import pandas.util.testing as tm


In [None]:
pip install tsfresh


In [None]:
data_path = Path('/content/drive/MyDrive', 'data')
events_names = {0: 'Normal',
                1: 'Abrupt Increase of BSW',
                2: 'Spurious Closure of DHSV',
                3: 'Severe Slugging',
                4: 'Flow Instability',
                5: 'Rapid Productivity Loss',
                6: 'Quick Restriction in PCK',
                8: 'Hydrate in Production Line'
               }
columns = ['P-PDG',
           'P-TPT',
           'T-TPT',
           'P-MON-CKP',
           'T-JUS-CKP',
           'P-JUS-CKGL',
           'T-JUS-CKGL',
           'QGL',
           'class']
rare_threshold = 0.01

In [None]:
def class_and_file_generator(data_path, real=False, simulated=False, drawn=False):
    for class_path in data_path.iterdir():
        if class_path.is_dir():
            class_code = int(class_path.stem)
            for instance_path in class_path.iterdir():
                if (instance_path.suffix == '.csv'):
                    if (simulated and instance_path.stem.startswith('SIMULATED')) or \
                       (drawn and instance_path.stem.startswith('DRAWN')) or \
                       (real and (not instance_path.stem.startswith('SIMULATED')) and \
                       (not instance_path.stem.startswith('DRAWN'))):
                        yield class_code, instance_path

In [None]:
real_instances = list(class_and_file_generator(data_path, real=True, simulated=False, drawn=False))
simulated_instances = list(class_and_file_generator(data_path, real=False, simulated=True, drawn=False))
drawn_instances = list(class_and_file_generator(data_path, real=False, simulated=False, drawn=True))

In [None]:
instances_class = [{'TYPE OF EVENT': str(c) + ' - ' + events_names[c], 'SOURCE': 'REAL'} for c, p in real_instances] + \
                  [{'TYPE OF EVENT': str(c) + ' - ' + events_names[c], 'SOURCE': 'SIMULATED'} for c, p in simulated_instances] + \
                  [{'TYPE OF EVENT': str(c) + ' - ' + events_names[c], 'SOURCE': 'DRAWN'} for c, p in drawn_instances]
df_class = pd.DataFrame(instances_class)
#df_class.drop(label = ["TYPE OF EVENT': '7 - Scaling in PCK'"], axis=0)
#df_class.head()
print(df_class)
df_class_count = df_class.groupby(['TYPE OF EVENT', 'SOURCE']).size().reset_index().pivot('SOURCE', 'TYPE OF EVENT', 0).fillna(0).astype(int).T
df_class_count = df_class_count.loc[natsorted(df_class_count.index.values)]
df_class_count = df_class_count[['REAL', 'SIMULATED', 'DRAWN']]
df_class_count['TOTAL'] = df_class_count.sum(axis=1)
df_class_count.loc['TOTAL'] = df_class_count.sum(axis=0)
df_class_count

                       TYPE OF EVENT     SOURCE
0                         0 - Normal       REAL
1                         0 - Normal       REAL
2                         0 - Normal       REAL
3                         0 - Normal       REAL
4                         0 - Normal       REAL
...                              ...        ...
1955  8 - Hydrate in Production Line  SIMULATED
1956  8 - Hydrate in Production Line  SIMULATED
1957  8 - Hydrate in Production Line  SIMULATED
1958  8 - Hydrate in Production Line  SIMULATED
1959  8 - Hydrate in Production Line  SIMULATED

[1960 rows x 2 columns]


KeyError: "['DRAWN'] not in index"

In [None]:
th = rare_threshold*df_class_count['REAL'][-1]
df_class_count.loc[df_class_count['REAL'] < th]

SOURCE,REAL,SIMULATED,DRAWN,TOTAL
TYPE OF EVENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 - Abrupt Increase of BSW,5,114,10,129
6 - Quick Restriction in PCK,6,215,0,221
7 - Scaling in PCK,4,0,10,14
8 - Hydrate in Production Line,3,81,0,84


In [None]:
th = rare_threshold*(df_class_count['REAL'][-1]+df_class_count['SIMULATED'][-1])
df_class_count.loc[df_class_count['REAL']+df_class_count['SIMULATED'] < th]

SOURCE,REAL,SIMULATED,DRAWN,TOTAL
TYPE OF EVENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7 - Scaling in PCK,4,0,10,14


In [None]:
th = rare_threshold*(df_class_count['REAL'][-1]+df_class_count['SIMULATED'][-1]+df_class_count['DRAWN'][-1])
df_class_count.loc[df_class_count['REAL']+df_class_count['SIMULATED']+df_class_count['DRAWN'] < th]

SOURCE,REAL,SIMULATED,DRAWN,TOTAL
TYPE OF EVENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7 - Scaling in PCK,4,0,10,14


In [None]:
instances_class = [{'TYPE OF EVENT': str(c) + ' - ' + events_names[c], 'SOURCE': 'REAL'} for c, p in real_instances] + \
                  [{'TYPE OF EVENT': str(c) + ' - ' + events_names[c], 'SOURCE': 'SIMULATED'} for c, p in simulated_instances] 
df_class = pd.DataFrame(instances_class)
df_class_count = df_class.groupby(['TYPE OF EVENT', 'SOURCE']).size().reset_index().pivot('SOURCE', 'TYPE OF EVENT', 0).fillna(0).astype(int).T
df_class_count = df_class_count.loc[natsorted(df_class_count.index.values)]
df_class_count = df_class_count[['REAL', 'SIMULATED']]
df_class_count['TOTAL'] = df_class_count.sum(axis=1)
df_class_count.loc['TOTAL'] = df_class_count.sum(axis=0)
df_class_count

SOURCE,REAL,SIMULATED,TOTAL
TYPE OF EVENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0 - Normal,597,0,597
1 - Abrupt Increase of BSW,5,114,119
2 - Spurious Closure of DHSV,22,16,38
3 - Severe Slugging,32,74,106
4 - Flow Instability,344,0,344
5 - Rapid Productivity Loss,12,439,451
6 - Quick Restriction in PCK,6,215,221
8 - Hydrate in Production Line,3,81,84
TOTAL,1021,939,1960


In [None]:
#data preprocesing dowsampleing micanje potrebnih stupaca i dodavanje stupca y koji oznacava klasu problema
ddf = {}

for i in range(9):
  if i == 7:
      continue   
  li = []
  path = r'/content/drive/MyDrive/data/{}'.format(i)
  all_files = glob.glob(os.path.join(path, "*.csv"))
  for j in range(len(all_files)):
    df = pd.read_csv(all_files[j], parse_dates=["timestamp"])
    df = df.resample('10S', on='timestamp').mean()
    df = df.reset_index()
    df = df.drop(['P-JUS-CKGL', 'T-JUS-CKGL', 'class'], axis=1).fillna(0)
    df['y'] = i
    li.append(df)

  ddf[i] = pd.concat(li, ignore_index=True)

  for (counter, x) in enumerate(li):
    x.to_csv('/content/drive/MyDrive/data2/{}/{}.cvs'.format(i,counter))

In [None]:
df_all2 = pd.concat(ddf)

In [3]:
#features koje tsfresh racuna
fc_parameters = {
    "mean": None,
    "kurtosis": None,
    "skewness": None,
    "variance": None,
    "fft_aggregated": [ {"aggtype": "centroid"}, {"aggtype": "variance"}, {"aggtype": "skew"}, {"aggtype": "kurtosis"}],
    "maximum": None,
    "minimum": None,
    "median": None,
    "quantile": [{'q': 0.1}, {'q': 0.2}, {'q': 0.3}, {'q': 0.4}, {'q': 0.6}, {'q': 0.7}, {'q': 0.8}, {'q': 0.9}],
    "variation_coefficient": None,
    "mean_change": None,
    "mean_second_derivative_central": None,
    "friedrich_coefficients": [{"m": 1, "r": 3, "coeff": 30}],
    "friedrich_coefficients": [{"m": 3, "r": 3, "coeff": 30}]
}

In [None]:
#verzija featrue extractiona sa rolliing time window nekoristim je jer predugo traje
ddfef = {}
for i in range(9):
  if i == 7:
      continue   
  li = []
  path = r'/content/drive/MyDrive/data2/{}'.format(i)
  all_files = glob.glob(os.path.join(path, "*.cvs"))
  for j in range(len(all_files)):
    df = pd.read_csv(all_files[j], parse_dates=["timestamp"], index_col=0)
    df_rolled = roll_time_series(df, column_id='y', column_sort="timestamp", max_timeshift=300, min_timeshift=300)
    df_rolled = df_rolled.drop(['y'], axis=1)
    df_features = extract_features(df_rolled, column_id="id", column_sort="timestamp", default_fc_parameters=fc_parameters)
    df_features = df_features.reset_index()
    df_features = df_features.rename(columns={'level_0': 'klasa', 'level_1': 'timestamp'})
    df_features = df_features.fillna(0)
    li.append(df_features)

  ddfef[i] = pd.concat(li, ignore_index=True)


In [None]:
#featue extraction nakon podjele svakog csv filea na prozore i exctraction featrua iz svakog prozora
ddfef = {}

for i in range(9):
  if i == 7:
      continue   
  li2 = []
  path = r'/content/drive/MyDrive/data2/{}'.format(i)
  all_files = glob.glob(os.path.join(path, "*.cvs"))
  for j in range(len(all_files)):
    df = pd.read_csv(all_files[j], parse_dates=["timestamp"], index_col=0)
    df = df.fillna(0)
    x = len(df)//300
    x = x * 300
    li = []
    for k in range(0,x,300):
      dft = df[k:(k+300)]
      df_features = extract_features(dft, column_id="y", column_sort="timestamp", default_fc_parameters=fc_parameters, disable_progressbar=True)
      df_features = df_features.reset_index()
      df_features = df_features.rename(columns={'level_0': 'klasa', 'level_1': 'timestamp'})
      df_features = df_features.fillna(0)
      df_features['window_id'] = '{}-{}-{}'.format(i,j,k)
      li.append(df_features)
    dftt = pd.concat(li, ignore_index=True)
    li2.append(dftt)

  ddfef[i] = pd.concat(li2, ignore_index=True)

In [None]:
  
li2 = []
path = r'/content/drive/MyDrive/data2/{}'.format(8)
all_files = glob.glob(os.path.join(path, "*.cvs"))
for i in range(len(all_files)):
    df = pd.read_csv(all_files[i], parse_dates=["timestamp"], index_col=0)
    df = df.fillna(0)
    x = len(df)//300
    x = x * 300
    if len(df)<300:
      continue
    li = []
    for k in range(0,x,300):
      dft = df[k:(k+300)]
      df_features = extract_features(dft, column_id="y", column_sort="timestamp", default_fc_parameters=fc_parameters, disable_progressbar=True)
      df_features = df_features.reset_index()
      df_features = df_features.rename(columns={'level_0': 'klasa', 'level_1': 'timestamp'})
      df_features = df_features.fillna(0)
      df_features['window_id'] = '{}-{}-{}'.format(8,i,k)
      li.append(df_features)
    dftt = pd.concat(li, ignore_index=True)
    li2.append(dftt)

df_8 = pd.concat(li2, ignore_index=True)
df_8.to_csv('/content/drive/MyDrive/data2/300-8.csv')

In [84]:
df_6 = pd.read_csv('/content/drive/MyDrive/data2/300-6.csv', index_col=0)
df_6 = df_6.rename(columns={'index': 'klasa'})

df_6['window_id'] = df_6['window_id'].map(lambda x: x.lstrip('0123456789').rstrip('0123456789'))
df_6['window_id'] = df_6['window_id'].map(lambda x: x.lstrip('-').rstrip('-'))
df_6['window_id'] = df_6['klasa'].astype(str) + df_6['window_id']
#df_0['window_id'] = df_0['window_id'].astype(int)

In [85]:

df_300 = pd.concat([df_0, df_1, df_2, df_3, df_4, df_5, df_6, df_8], ignore_index=True)

In [86]:
df_300.to_csv('/content/drive/MyDrive/data2/300.csv')

In [291]:
#podjela na train i test i standardiziranje skupa za ucenje
#svi prozori za jedan dogadaj su sadrzani u train ili test skupu

from sklearn.model_selection import GroupShuffleSplit

gs = GroupShuffleSplit(n_splits=2, test_size=.3, random_state=0)

train_ix, test_ix = next(gs.split(df_300, groups=df_300['window_id']))

X_train = df_300.loc[train_ix]
X_test = df_300.loc[test_ix]
X_train = X_train.drop(['window_id'], axis=1)
X_test = X_test.drop(['window_id'], axis=1)
y_train = X_train[['klasa']]
y_test = X_test[['klasa']]
X_train = X_train.drop(['klasa'], axis=1)
X_test = X_test.drop(['klasa'], axis=1)



In [119]:
#klasifikacija LDA metodom
#koristimo kfold cross validation i standardiziramo podatke on held fold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

scaler = preprocessing.StandardScaler().fit(X_train)
X_train_transformed = scaler.transform(X_train)
clf = LinearDiscriminantAnalysis()
clf.fit(X_train_transformed, np.ravel(y_train))
X_test_transformed = scaler.transform(X_test)
print(clf.score(X_test_transformed, y_test))

y_pred = clf.predict(X_test)
print(f1_score(y_test, y_pred, average='weighted'))
#clf = make_pipeline(preprocessing.StandardScaler(), LinearDiscriminantAnalysis())
#cross_val_score(clf, X_train, np.ravel(y_train), cv=5)




0.9988409828465461
0.0057920691472809135


  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [292]:
#klasifikacija LDA metodom
#koristimo kfold cross validation i standardiziramo podatke on held fold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

clf = make_pipeline(preprocessing.StandardScaler(), LinearDiscriminantAnalysis())
scr = cross_val_score(clf, X_train, np.ravel(y_train), cv=5)

clf.fit(X_train, np.ravel(y_train))
clf.score(X_test, y_test)
y_pred = clf.predict(X_test)
print(f1_score(y_test, y_pred, average='weighted'))

0.9052853136106472


In [None]:
#klasifikacija QDA metodom
#koristimo kfold cross validation i standardiziramo podatke on held fold
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

clf = make_pipeline(preprocessing.StandardScaler(), QuadraticDiscriminantAnalysis())
scr = cross_val_score(clf, X_train, np.ravel(y_train), cv=5)

In [None]:
# djeljenje na prozore od 90s
ddfef2 = {}

for i in range(9):
  if i == 7:
      continue 
  b = 0 
  li2 = []
  path = r'/content/drive/MyDrive/data2/{}'.format(i)
  all_files = glob.glob(os.path.join(path, "*.cvs"))
  for j in range(len(all_files)):
    df = pd.read_csv(all_files[j], parse_dates=["timestamp"], index_col=0)
    x = len(df)//90
    x = x * 90
    li = []
    for k in range(0,x,90):
      b = b + 1
      dft = df.loc[k:(k+90)]
      dft['window_id'] = '{}-{}-{}'.format(i,j,b)
      li.append(dft)
    dfe = pd.concat(li, ignore_index=True)
    li2.append(dfe)
  ddfef2[i] = pd.concat(li2, ignore_index=True)

In [148]:
df_900_windows.to_csv('/content/drive/MyDrive/data2/9w.csv')

In [277]:

df = ddfef2[8]
df = df.drop(['y'], axis=1)
df_features = extract_features(df, column_id="window_id", column_sort="timestamp", default_fc_parameters=fc_parameters, disable_progressbar=True)
df_features['klasa'] = '{}'.format(8)
df_8 = df_features
df_8 = df_8.reset_index()
df_8 = df_8.rename(columns={'index': 'window_id'})

In [278]:
#sredivanje stupaca
df_8['window_id'] = df_8['window_id'].map(lambda x: x.lstrip('0123456789').rstrip('0123456789'))
df_8['window_id'] = df_8['window_id'].map(lambda x: x.lstrip('-').rstrip('-'))
df_8['window_id'] = df_8['klasa'].astype(str) + df_8['window_id']

In [206]:
path = r'/content/drive/MyDrive/data2/9wf/{}.csv'.format(0)
df = pd.read_csv(path, parse_dates=["timestamp"], index_col=0)
df = df.drop(['y'], axis=1)
df_features = extract_features(df, column_id="window_id", column_sort="timestamp", default_fc_parameters=fc_parameters, disable_progressbar=True)
df_features['klasa'] = '{}'.format(0)
df_0 = df_features

In [279]:
df_900 = pd.concat([df_0, df_1, df_2, df_3, df_4, df_5, df_6, df_8], ignore_index=True)
df_900.to_csv('/content/drive/MyDrive/data2/900.csv')

In [295]:
#podjela na train i test i standardiziranje skupa za ucenje
#svi prozori za jedan dogadaj su sadrzani u train ili test skupu

from sklearn.model_selection import GroupShuffleSplit

gs = GroupShuffleSplit(n_splits=2, test_size=.3, random_state=0)

train_ix, test_ix = next(gs.split(df_900, groups=df_900['window_id']))

X_train = df_900.loc[train_ix]
X_test = df_900.loc[test_ix]
X_train = X_train.drop(['window_id'], axis=1)
X_test = X_test.drop(['window_id'], axis=1)
y_train = X_train[['klasa']]
y_test = X_test[['klasa']]
X_train = X_train.drop(['klasa'], axis=1)
X_test = X_test.drop(['klasa'], axis=1)

In [298]:
#klasifikacija logističkom regresijom 
#koristimo kfold cross validation i standardiziramo podatke on held fold
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

X_train = X_train.fillna(0)
X_test = X_test.fillna(0)
clf = make_pipeline(preprocessing.StandardScaler(), OneVsRestClassifier(LogisticRegression(C=1e-4)))
scr = cross_val_score(clf, X_train, np.ravel(y_train), cv=5)

clf.fit(X_train, np.ravel(y_train))
clf.score(X_test, y_test)
y_pred = clf.predict(X_test)
print(f1_score(y_test, y_pred, average='weighted'))

0.753175808894774


In [300]:
#klasifikacija random forest
#koristimo kfold cross validation i standardiziramo podatke on held fold
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

X_train = X_train.fillna(0)
X_test = X_test.fillna(0)
clf = make_pipeline(preprocessing.StandardScaler(), RandomForestClassifier(n_estimators=150, max_depth=10, max_features=5))
scr = cross_val_score(clf, X_train, np.ravel(y_train), cv=5)

clf.fit(X_train, np.ravel(y_train))
clf.score(X_test, y_test)
y_pred = clf.predict(X_test)
print(f1_score(y_test, y_pred, average='weighted'))

0.9809862324841471


In [301]:
print(scr)

[0.98175133 0.96554542 0.95777717 0.91691637 0.82048488]
