In [33]:
import setuptools.dist
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import mne
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import MinMaxScaler
import pickle

# ---------------------- CONSTANTS -----------------------
# Directory of the per-recording signal CSV files read by file_preprocessing.
TRAIN_PATH = "../data/raw/train/"
# Directory of the matching label CSV files (same base names as the signal files).
TRAIN_ONLY_PATH = "../data/raw/y_train_only/"
# Directory where apply_model pickles the fitted models.
MODELS_PATH= "../models/"
# NOTE(review): not referenced in the visible code — presumably a scratch output directory.
MODELS_TEST_PATH= "../models/test/"

# Band-pass configurations iterated by the experiment loop; "l_cut" / "h_cut"
# are forwarded to raw.filter as l_freq / h_freq.
FILTERS =  [{ "name": "std" ,"l_cut": 1, "h_cut": 30 },]
# Sampling frequency handed to mne.create_info (sfreq).
SAMPLING_FREQ = 250
# Name of the stimulus channel that carries the event codes.
STICHANNEL = "STI101"
# Channels kept in the MNE Raw object; order must match CH_TYPES below.
CHANNELS_LIST = ["C3", "Cz", "C4", STICHANNEL]
CH_TYPES=["eeg", "eeg", "eeg", "stim"]
# EEG reference configurations iterated by the experiment loop; each item is
# forwarded to set_eeg_reference(ref_channels=...).
REFERENCE = [["Cz"]]
# Electrode positions applied to every Raw object.
MONTAGE = mne.channels.make_standard_montage("standard_1020")
# NOTE(review): not referenced in the visible code — presumably band limits in Hz.
FREQ_BANDS = {
        "theta": [4.5, 8.5],
        "alpha": [8.5, 11.5],
        "sigma": [11.5, 15.5],
        "beta": [15.5, 30],
    }

""" Sélection des noms de fichiers ne concernant que les runs 3 """
def initFilesNames():
    filesNames = []
    for x in range(100, 1000, 100):
        filesNames.append("B0" + str(x + 3) + "T")
    return filesNames

DATA_FILES = initFilesNames()

# ---------------------- UTILITAIRES -----------------------
""" Etape de preprocessing d'un fichier de tentatives
    qui aboutit à la créations d'epochs
"""
def file_preprocessing(
    fileName, tmin=0, tmax=3.5, lFilter=0, hFilter=30, reference="average"
):
    # Lecture de fichier de caractérisques des tentatives et merge avec le fichier de labels
    df_mne = pd.read_csv(TRAIN_PATH + fileName + ".csv")
    df_event_type = pd.read_csv(TRAIN_ONLY_PATH + fileName + ".csv")
    df_event_start = df_mne.loc[(df_mne.EventStart == 1), ["time"]]
    df_event_start = df_event_start.reset_index(drop=True)
    df_event_start_with_type = pd.merge(
        df_event_start, df_event_type, left_index=True, right_index=True
    )
    df_mne = pd.merge(df_mne, df_event_start_with_type, how="left", on=["time", "time"])
    df_mne.drop(
        ["EOG:ch01", "EOG:ch02", "EOG:ch03", "EventStart"], axis=1, inplace=True
    )
    df_mne = df_mne.rename(columns={"EventType": STICHANNEL})
    
    # Mise à l'échelle des données 
    df_mne.C3 = df_mne.C3 / 1000000
    df_mne.Cz = df_mne.Cz / 1000000
    df_mne.C4 = df_mne.C4 / 1000000
    
    # Labellisation 1 - 2 au lieu de 0 - 1 nécessaire pour MNE
    df_mne.replace({STICHANNEL: 1}, 2, inplace=True)
    df_mne.replace({STICHANNEL: 0}, 1, inplace=True)
    df_mne.fillna({STICHANNEL: 0}, inplace=True)
    data = pd.DataFrame.to_numpy(df_mne[CHANNELS_LIST].transpose(), dtype=np.float64)
    info = mne.create_info(
        ch_names=CHANNELS_LIST,
        sfreq=SAMPLING_FREQ,
        ch_types=CH_TYPES,
    )
    
    # Création des objets MNE avec intégrations stimuli
    raw = mne.io.RawArray(data, info)
    raw.set_montage(MONTAGE)
    events = mne.find_events(raw, stim_channel=STICHANNEL, consecutive=False)
    mapping = {1: "left", 2: "right"}
    annot_from_events = mne.annotations_from_events(
        events=events,
        event_desc=mapping,
        sfreq=raw.info["sfreq"],
        orig_time=raw.info["meas_date"],
    )
    raw.set_annotations(annot_from_events)
    
    # Application d'un filtre passe bande
    raw_filt = raw .filter(l_freq=lFilter, h_freq=hFilter)
    events = mne.find_events(raw_filt, stim_channel=CHANNELS_LIST, consecutive=False)
    
    # Création des epochs avec application d'une correction de baseline
    epochs = mne.Epochs(
        raw_filt, events, tmin=tmin, tmax=tmax, baseline=(-0.2, 0), preload=True
    )

    # Application d'une référence aux epochs
    epochs_ref = epochs.set_eeg_reference(ref_channels=reference)
    return epochs_ref

""" Application d'un model et alimentation création d'une entrée de rapport """
def apply_model(model, title, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    pickle.dump(model, open(MODELS_PATH + title, 'wb'))
    y_pred = model.predict(X_test)
    return {
        "title": title,
        "score": accuracy_score(y_test, y_pred, normalize=True),
        "confusion": confusion_matrix(y_test, y_pred),
        "classification_report": classification_report(y_test, y_pred),
    }

""" Application de différents models et création d'un rapport """
def apply_models(X_train, X_test, y_train, y_test, report_entries):
    report_entries.append(
        apply_model(
            LogisticRegression(max_iter=2500),
            "LogisticRegression",
            X_train,
            y_train,
            X_test,
            y_test,
        )
    )

""" Edition du rapport d'exécution des models """
def print_report(report):
    for r in report:
        print(
            "------------------------------------------------------------------------------------------"
        )
        print(
            "min:",
            r["range"]["min"],
            "max: ",
            r["range"]["max"],
            "lowCut: ",
            r["filter"]["low"],
            "hightCut: ",
            r["filter"]["hight"],
            "reference: ",
            r["reference"],
        )
        print(
            "------------------------------------------------------------------------------------------"
        )
        for e in r["entries"]:
            print(e["title"], "score: ", e["score"])
            print(e["confusion"])
            print(e["classification_report"])

""" Création des jeux d'entrainement et de test
    Les données des candidats 1 à 7 servent à l'entrainement,
    celles des candidats 8 et 9 aux tests
"""
def my_train_test_split(df):
    X_test = df[(df['id'] > 6)]
    X_train = df[(df['id'] < 7 )]
    
    y_test = X_test['eventType']
    y_train = X_train['eventType']
    X_test.drop(["eventType"], axis=1, inplace=True)
    X_train.drop(["eventType"], axis=1, inplace=True)

    X_test.drop(["id", "index"], axis=1, inplace=True)
    X_train.drop(["id", "index"], axis=1, inplace=True)
   
    return X_train, X_test, y_train, y_test

""" extraction de caractéristiques 
    Passage en mode fréquentiel sur C3 C4 sur une durée de 1 à 3s sur chaque epoch
    Calcul de la différence C3-C4
    Mise à L'échelle des données
"""
def extract_features(epocks):
    freqs = np.arange(10.5, 12.5,1)
    dfs = []
    idx = 0
    for e in epocks:
        tfr =  e.compute_tfr(tmin= 1, tmax=3,
                            method="morlet", freqs=freqs, n_cycles=freqs/2, return_itc=False, picks=["C3", "C4"],
                            average=False)
        df = tfr.to_data_frame()
        df["id"] = idx
        df.rename(columns={"condition": "eventType"}, inplace=True)
        df.astype({"eventType": int})
        dfs.append(df)
        idx += 1

    df = pd.concat(dfs)
    df["C3-C4"] = df["C3"] - df["C4"]
    scaler = MinMaxScaler(feature_range=(0,1))
    scaler.fit(df[["C3", "C4","C3-C4"]])
    df[["C3", "C4","C3-C4"]] = scaler.transform(df[["C3", "C4","C3-C4"]])
    # Transformation du dataframe pour obtenir une ligne de caractéristiques labellisée par tentative 
    dfEvent= df[['id', 'epoch', 'eventType']]
    dfEvent.drop_duplicates(inplace=True)
    dfEvent.set_index(['id', 'epoch'], inplace=True)
    df = df.astype({'freq': str, 'time':str})
    df.set_index(['id','epoch','time','freq'], inplace=True)
    df = df.drop(["C3","C4",'eventType'], axis=1)
    df = df.unstack()
    df = df.reset_index()
    df.set_index(['id','time','epoch'], inplace=True)
    df.columns = pd.MultiIndex.from_frame(pd.DataFrame(index=df.columns).reset_index().astype(str))
    df.columns = df.columns.map('_'.join)
    df = df.reset_index()
    df.set_index(['id','epoch', 'time'], inplace=True)
    df = df.unstack()
    df.columns = pd.MultiIndex.from_frame(pd.DataFrame(index=df.columns).reset_index().astype(str))
    df.columns = df.columns.map('_'.join)
    df = df.reset_index()
    dfEvent =  dfEvent.reset_index()
    df = pd.merge(df, dfEvent,on=['id','epoch'])
    df.reset_index(inplace=True)
    return df

In [34]:
# Epoch windows to evaluate; "min" / "max" are forwarded to file_preprocessing
# as tmin / tmax.
ranges = [
    {"min": -0.5, "max": 3.5},
]
report = []
epocks = []

for r in ranges:
    print(
        "***********************************************************************************************"
    )
    for f in FILTERS:
        for ref in REFERENCE:
            # Rebuild the epochs for every configuration so that epochs from a
            # previous range / filter / reference are never mixed in.
            epocks = [
                file_preprocessing(
                    d,
                    tmin=r["min"],
                    tmax=r["max"],
                    lFilter=f["l_cut"],
                    hFilter=f["h_cut"],
                    reference=ref,
                )
                for d in DATA_FILES
            ]
            df = extract_features(epocks)
            X_train, X_test, y_train, y_test = my_train_test_split(df)
            report_entries = []
            apply_models(X_train, X_test, y_train, y_test, report_entries)
            report.append(
                {
                    "range": {"min": r["min"], "max": r["max"]},
                    "filter": {"low": f["l_cut"], "hight": f["h_cut"]},
                    # Store the EEG reference actually used, not the time range.
                    "reference": ref,
                    "entries": report_entries,
                }
            )
print_report(report)

***********************************************************************************************
Creating RawArray with float64 data, n_channels=4, n_times=469011
    Range : 0 ... 469010 =      0.000 ...  1876.040 secs
Ready.
['STI101']
160 events found on stim channel STI101
Event IDs: [1 2]
Filtering raw data in 1 contiguous segment
Setting up band-pass filter from 1 - 30 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandpass filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 1.00
- Lower transition bandwidth: 1.00 Hz (-6 dB cutoff frequency: 0.50 Hz)
- Upper passband edge: 30.00 Hz
- Upper transition bandwidth: 7.50 Hz (-6 dB cutoff frequency: 33.75 Hz)
- Filter length: 825 samples (3.300 s)

['C3', 'Cz', 'C4', 'STI101']
160 events found on stim channel STI101
Event IDs: [1 2]
Not setting metadata
160 matching events found
Applying base

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfEvent.drop_duplicates(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.drop(["eventType"], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.drop(["eventType"], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.drop(["id", "index"], ax

------------------------------------------------------------------------------------------
min: -0.5 max:  3.5 lowCut:  1 hightCut:  30 reference:  {'min': -0.5, 'max': 3.5}
------------------------------------------------------------------------------------------
LogisticRegression score:  0.80625
[[112  48]
 [ 14 146]]
              precision    recall  f1-score   support

           1       0.89      0.70      0.78       160
           2       0.75      0.91      0.82       160

    accuracy                           0.81       320
   macro avg       0.82      0.81      0.80       320
weighted avg       0.82      0.81      0.80       320

