In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import os
import time
import shutil
from fastai.vision.all import *
from scipy.fft import fft, ifft
from scipy import interpolate

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, ShuffleSplit, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from mne.decoding import CSP, SPoC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing

In [2]:
path_data = 'files/MU.txt'

In [38]:
mnist_df = pd.read_csv(path_data, sep='\t', header=None, nrows=6000)
mnist_df.columns = ['id', 'event_id', 'device', 'channel', 'code', 'size', 'data']

resting_df = pd.read_csv(path_data, sep='\t', names=mnist_df.columns, header=None, skiprows=130000, nrows=2000)
df = pd.concat([mnist_df, resting_df])
df["data"] = df["data"].apply(lambda x: [float(i) for i in x.split(",")])

df.head()

Unnamed: 0,id,event_id,device,channel,code,size,data
0,978111,132669,MU,TP9,6,459,"[475.0, 474.0, 477.0, 486.0, 486.0, 476.0, 479.0, 483.0, 489.0, 483.0, 482.0, 485.0, 486.0, 483.0, 474.0, 475.0, 484.0, 481.0, 482.0, 478.0, 476.0, 479.0, 487.0, 481.0, 480.0, 479.0, 487.0, 486.0, 471.0, 487.0, 491.0, 488.0, 491.0, 470.0, 460.0, 485.0, 484.0, 482.0, 485.0, 480.0, 487.0, 488.0, 479.0, 471.0, 479.0, 485.0, 484.0, 484.0, 478.0, 478.0, 479.0, 511.0, 524.0, 529.0, 530.0, 529.0, 525.0, 524.0, 533.0, 520.0, 516.0, 520.0, 525.0, 526.0, 521.0, 519.0, 526.0, 527.0, 529.0, 520.0, 520.0, 514.0, 522.0, 523.0, 514.0, 515.0, 517.0, 523.0, 517.0, 515.0, 528.0, 522.0, 530.0, 520.0, 519.0, ..."
1,978112,132669,MU,FP1,6,459,"[468.0, 487.0, 493.0, 493.0, 498.0, 493.0, 491.0, 490.0, 492.0, 487.0, 483.0, 488.0, 489.0, 493.0, 494.0, 489.0, 488.0, 481.0, 492.0, 489.0, 483.0, 488.0, 491.0, 490.0, 491.0, 487.0, 487.0, 490.0, 481.0, 495.0, 493.0, 495.0, 497.0, 480.0, 476.0, 490.0, 489.0, 490.0, 491.0, 489.0, 492.0, 494.0, 494.0, 492.0, 489.0, 494.0, 492.0, 492.0, 489.0, 491.0, 490.0, 509.0, 523.0, 527.0, 523.0, 528.0, 523.0, 525.0, 527.0, 521.0, 524.0, 520.0, 519.0, 524.0, 521.0, 523.0, 522.0, 519.0, 513.0, 525.0, 528.0, 522.0, 519.0, 520.0, 521.0, 519.0, 515.0, 523.0, 514.0, 516.0, 530.0, 518.0, 528.0, 522.0, 517.0, ..."
2,978113,132669,MU,FP2,6,459,"[482.0, 475.0, 490.0, 500.0, 485.0, 470.0, 470.0, 482.0, 490.0, 484.0, 478.0, 480.0, 486.0, 490.0, 482.0, 473.0, 481.0, 488.0, 477.0, 480.0, 486.0, 487.0, 486.0, 484.0, 483.0, 479.0, 484.0, 483.0, 466.0, 482.0, 486.0, 484.0, 482.0, 476.0, 480.0, 494.0, 486.0, 482.0, 482.0, 480.0, 484.0, 488.0, 482.0, 482.0, 485.0, 482.0, 484.0, 485.0, 478.0, 481.0, 480.0, 515.0, 514.0, 516.0, 515.0, 521.0, 515.0, 508.0, 511.0, 509.0, 510.0, 514.0, 515.0, 513.0, 516.0, 517.0, 513.0, 513.0, 519.0, 516.0, 520.0, 517.0, 513.0, 513.0, 513.0, 520.0, 513.0, 517.0, 505.0, 508.0, 517.0, 509.0, 519.0, 511.0, 509.0, ..."
3,978114,132669,MU,TP10,6,459,"[470.0, 470.0, 478.0, 489.0, 487.0, 475.0, 469.0, 478.0, 488.0, 483.0, 474.0, 480.0, 487.0, 484.0, 474.0, 471.0, 481.0, 484.0, 485.0, 479.0, 461.0, 471.0, 493.0, 485.0, 473.0, 475.0, 481.0, 485.0, 467.0, 468.0, 481.0, 488.0, 484.0, 460.0, 470.0, 473.0, 482.0, 485.0, 472.0, 468.0, 484.0, 486.0, 479.0, 464.0, 471.0, 486.0, 485.0, 471.0, 467.0, 482.0, 488.0, 517.0, 522.0, 526.0, 530.0, 531.0, 506.0, 510.0, 530.0, 525.0, 506.0, 506.0, 520.0, 527.0, 523.0, 516.0, 519.0, 529.0, 527.0, 532.0, 514.0, 515.0, 523.0, 530.0, 515.0, 508.0, 504.0, 530.0, 524.0, 512.0, 525.0, 521.0, 533.0, 511.0, 504.0, ..."
4,978115,132670,MU,TP9,7,493,"[506.0, 499.0, 495.0, 491.0, 492.0, 507.0, 496.0, 500.0, 498.0, 496.0, 499.0, 500.0, 496.0, 502.0, 506.0, 504.0, 500.0, 486.0, 505.0, 500.0, 497.0, 496.0, 497.0, 502.0, 498.0, 497.0, 498.0, 502.0, 501.0, 498.0, 503.0, 499.0, 502.0, 492.0, 493.0, 496.0, 511.0, 508.0, 503.0, 495.0, 497.0, 505.0, 501.0, 490.0, 491.0, 504.0, 503.0, 501.0, 502.0, 500.0, 504.0, 499.0, 506.0, 497.0, 498.0, 507.0, 512.0, 504.0, 509.0, 501.0, 500.0, 507.0, 502.0, 499.0, 501.0, 501.0, 499.0, 497.0, 489.0, 497.0, 496.0, 489.0, 502.0, 488.0, 507.0, 508.0, 493.0, 497.0, 496.0, 500.0, 501.0, 496.0, 492.0, 496.0, 504.0, ..."


In [8]:
len(df)

8000

In [50]:

# Function to resample an array to the target length
def resample_array(array, target_length):
    array = np.array(array)
    # Create an array of indices for the input array
    input_indices = np.linspace(0, len(array)-1, len(array))
    # Create an array of indices for the resampled array
    resampled_indices = np.linspace(0, len(array)-1, target_length) # shape == (target_length,)
    # print(resampled_indices.shape)
    # return

    # Create a linear interpolation function based on the input array
    interpolator = interpolate.interp1d(input_indices, array)

    # Use the interpolator to create the resampled array
    resampled_array = interpolator(resampled_indices)

    return resampled_array.tolist()

median_length=150
# Resample all the data arrays to the median length
df["resampled_data"] = df["data"].apply(lambda x: resample_array(x, median_length))

# Check the length of the resampled arrays
df["resampled_data_length"] = df["resampled_data"].apply(len)

# Display the first few rows of the updated DataFrame
df["resampled_data_length"].shape

data_array = np.array(df["resampled_data"].tolist())
codes = df['code'].tolist()

data_array = np.reshape(data_array, (-1, 4, data_array.shape[1]))
data_array = np.mean(data_array, axis=1)
codes = codes[::4]
data_array.shape, len(codes)

((2000, 150), 2000)

In [51]:
data_array = np.array(df["resampled_data"].tolist())
codes = df['code'].tolist()

In [52]:
data_array.shape, len(codes)

((8000, 150), 8000)

In [53]:
data_array = np.reshape(data_array, (-1, 4, data_array.shape[1]))
data_array = np.mean(data_array, axis=1)
codes = codes[::4]
data_array.shape, len(codes)

((2000, 150), 2000)

In [54]:
X = data_array
# normalize the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)
le = preprocessing.LabelEncoder()
le.fit(codes)
y = le.transform(codes)

In [55]:
shuffle_split = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

In [56]:
models = [
    ('SVM', SVC(), {'model__C': [0.5, 1, 3], 'model__kernel': ['linear']}),
    ('KNN', KNeighborsClassifier(), {'model__n_neighbors': [4,5,6]}),
    ('Random Forest', RandomForestClassifier(), {'model__n_estimators': [50,100]}),
    ('MLP', MLPClassifier(), {'model__hidden_layer_sizes': [(100, 50), (200, 100)]}),
    ('Decision Tree', DecisionTreeClassifier(), {'model__max_depth': [50, 100]}),
]

In [57]:
pipelines = []
for name, model, param_grid in models:
    pipeline = Pipeline([
        ('model', model)
    ])
    pipelines.append((name, pipeline, param_grid))

In [58]:
results = []
for name, pipeline, param_grid in pipelines:
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=shuffle_split, n_jobs=-1)
    grid_search.fit(X, y)
    results.append((name, grid_search))

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mo

In [59]:
res_grid = []
for name, grid_search in results:
    print(f"Model: {name}")
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Best Cross-Validated Accuracy: {grid_search.best_score_:.2f}")
    print("\n")

Model: SVM
Best Parameters: {'model__C': 0.5, 'model__kernel': 'linear'}
Best Cross-Validated Accuracy: 0.23


Model: KNN
Best Parameters: {'model__n_neighbors': 6}
Best Cross-Validated Accuracy: 0.19


Model: Random Forest
Best Parameters: {'model__n_estimators': 100}
Best Cross-Validated Accuracy: 0.25


Model: MLP
Best Parameters: {'model__hidden_layer_sizes': (200, 100)}
Best Cross-Validated Accuracy: 0.23


Model: Decision Tree
Best Parameters: {'model__max_depth': 50}
Best Cross-Validated Accuracy: 0.20




Result is not realy good. We can try to train with binary classification. is digit = 1 or not = 0.

In [60]:
X = data_array
# transform codes -1 to 0 and others to 1
y = np.array([1 if code == -1 else 0 for code in codes])

np.unique(y, return_counts=True)

(array([0, 1]), array([1500,  500]))

In [61]:
results = []
for name, pipeline, param_grid in pipelines:
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=shuffle_split, n_jobs=-1)
    grid_search.fit(X, y)
    results.append((name, grid_search))

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mo

In [62]:
res_grid = []
for name, grid_search in results:
    print(f"Model: {name}")
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Best Cross-Validated Accuracy: {grid_search.best_score_:.2f}")
    print("\n")

Model: SVM
Best Parameters: {'model__C': 3, 'model__kernel': 'linear'}
Best Cross-Validated Accuracy: 0.75


Model: KNN
Best Parameters: {'model__n_neighbors': 5}
Best Cross-Validated Accuracy: 0.82


Model: Random Forest
Best Parameters: {'model__n_estimators': 100}
Best Cross-Validated Accuracy: 0.87


Model: MLP
Best Parameters: {'model__hidden_layer_sizes': (100, 50)}
Best Cross-Validated Accuracy: 0.70


Model: Decision Tree
Best Parameters: {'model__max_depth': 50}
Best Cross-Validated Accuracy: 0.80




Add new models:

In [64]:
models = [
    ('KNN', KNeighborsClassifier(), {'model__n_neighbors': [4,5,6]}),
    ('Random Forest', RandomForestClassifier(), {'model__n_estimators': [50,100]}),
    ('MLP', MLPClassifier(), {'model__hidden_layer_sizes': [(100, 50), (200, 100)]}),
    ('Decision Tree', DecisionTreeClassifier(), {'model__max_depth': [50, 100]}),
    ('Gradient Boosting', GradientBoostingClassifier(), {'model__n_estimators': [15, 25, 50, 100]}),
    ('Linear discriminant analysis', LinearDiscriminantAnalysis(), {'model__solver': ['svd', 'lsqr', 'eigen'], 'model__tol': [0.0001, 0.00001]}),
    ('XGB', XGBClassifier(), {'model__n_estimators': [50, 100, 200, 300], 'model__learning_rate': [0.05, 0.001]})
]

pipelines = []
for name, model, param_grid in models:
    pipeline = Pipeline([
        ('model', model)
    ])
    pipelines.append((name, pipeline, param_grid))

In [65]:
results = []
for name, pipeline, param_grid in pipelines:
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=shuffle_split, n_jobs=-1)
    grid_search.fit(X, y)
    results.append((name, grid_search))

res_grid = []
for name, grid_search in results:
    print(f"Model: {name}")
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Best Cross-Validated Accuracy: {grid_search.best_score_:.2f}")
    print("\n")

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mo

Model: KNN
Best Parameters: {'model__n_neighbors': 5}
Best Cross-Validated Accuracy: 0.82


Model: Random Forest
Best Parameters: {'model__n_estimators': 100}
Best Cross-Validated Accuracy: 0.86


Model: MLP
Best Parameters: {'model__hidden_layer_sizes': (100, 50)}
Best Cross-Validated Accuracy: 0.56


Model: Decision Tree
Best Parameters: {'model__max_depth': 50}
Best Cross-Validated Accuracy: 0.79


Model: Gradient Boosting
Best Parameters: {'model__n_estimators': 100}
Best Cross-Validated Accuracy: 0.85


Model: Linear discriminant analysis
Best Parameters: {'model__solver': 'svd', 'model__tol': 0.0001}
Best Cross-Validated Accuracy: 0.76


Model: XGB
Best Parameters: {'model__learning_rate': 0.05, 'model__n_estimators': 300}
Best Cross-Validated Accuracy: 0.85


