## How to Create Environment first on your terminal 
1. conda create -n EOG_classification
2. conda activate EOG_classification
3. conda install catboost sklearn scipy numpy pywt

# import libraries

In [1]:
#!conda install catboost
import pandas as pd
from scipy import signal
from scipy import stats
from scipy.signal import butter,filtfilt
import matplotlib.pyplot as plt
import numpy as np
# conda install scikit-learn
from sklearn.decomposition import PCA 
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import pywt  #conda install pywavelets

from catboost import CatBoostClassifier

# Helper functions

In [2]:
from scipy.signal import welch
def compute_psd(signal, fs):
    freqs, psd = welch(signal, fs=fs,nperseg=12)
    return psd.tolist()

In [31]:
def butter_bandpass_filter(input,low_cutoff,high_cutoff,sampling_rate,order):
  nyq=0.5*sampling_rate
  low=low_cutoff/nyq
  high=high_cutoff/nyq
  numerator,denominator=butter(order,[low,high],btype='band',output='ba',analog=False,fs=None)
  filterd=filtfilt(numerator,denominator,input)
  return filterd

def mean(data):
    return np.mean(data,axis=-1)    

def var(data):
        return np.var(data,axis=-1)

def skewness(data):
    return stats.skew(data,axis=-1)

def kurtosis(data):
    return stats.kurtosis(data,axis=-1)

def concatenate_features(data):
    return [mean(data), var(data), skewness(data), kurtosis(data)]

# Downsample 250 values per row to 50 values per row  

# def pre_processing_Of_data(data, colss):
#   # data=pd.read_csv(EOG_dataframe)
#   #vertical_data=data.drop(data[(data['polarity']=='h')].index, inplace=False)
#   data_framee = pd.DataFrame(columns = colss)
#   for lenn in range(len(data)):
#     n_row = data.iloc[[lenn]]
#     label_ = str(data.iloc[lenn]['label'])
#     #vertical_horizontal = str(data.iloc[lenn]['polarity'])
#     #n_row=n_row.drop(columns=['id','label']) original
#     n_row=n_row.drop(columns=['label'])
#     n_row = n_row.values.tolist()
#     n_row = [item for sublist in n_row for item in sublist]
#     filterd_sig=butter_bandpass_filter(n_row,low_cutoff=0.2,high_cutoff=40.0,sampling_rate=250,order=2)
    
#     # #Down sampling
#     resampled__sig=signal.resample(filterd_sig,50)
#     flatten_array=resampled__sig.flatten()
#     new_row = pd.DataFrame([[label_]  + list(flatten_array)], columns=colss)

#     # concatenate the dataframes
#     data_framee = pd.concat([data_framee, new_row], ignore_index=True)
#   return data_framee


def pre_processing_Of_data_noDownsample(data, colss):
    data_framee = pd.DataFrame(columns=colss)
    for lenn in range(len(data)):
        n_row = data.iloc[[lenn]]
        label_ = str(data.iloc[lenn]['label'])
        n_row = n_row.drop(columns=['label'])
        n_row = n_row.values.tolist()
        n_row = [item for sublist in n_row for item in sublist]
        filterd_sig = butter_bandpass_filter(n_row, low_cutoff=0.2, high_cutoff=35.0, sampling_rate=250, order=2)
        
        # Removed downsampling step
        resampled__sig = filterd_sig
        flatten_array = resampled__sig.flatten()
        new_row = pd.DataFrame([[label_] + list(flatten_array)], columns=colss)
        
        # Concatenate the dataframes
        data_framee = pd.concat([data_framee, new_row], ignore_index=True)
        
    return data_framee


def create_statistical_features(coeffs, level):  

  feature_row = []
  for i in range(level + 1):
    feature_row = feature_row + concatenate_features(coeffs[i])

  return feature_row

# Select certain columns only for modeling 
# goes through data frame and drops 'id' and 'label' column 
# Uses Fourier method to resample filtered signal from buttterworth bandpass filter 
# Create and returns

def get_features(data, level,col):
  #all_coeffs = [] # each element in this list contains 4 lists for the 4 levels
  #data = pre_processed_data.drop(pre_processed_data[(pre_processed_data['polarity'] == polarity)].index, inplace=False)
  features_dataframe = pd.DataFrame(columns = col)
  for i in range(len(data)):
    row = data.iloc[[i]]
    label_ = str(data.iloc[i]['label'])
    #vertical_horizontal = str(data.iloc[i]['polarity'])
    row = row.drop(columns=['label'])
    row = row.values.tolist()
    row = [item for sublist in row for item in sublist]
    wavelet = 'db4' # Daubechies 4
    coeffs = pywt.wavedec(row, wavelet, level = level)

    frow=create_statistical_features(coeffs,level)
    #psd
    new_row = pd.DataFrame([[label_] + list(frow)], columns=col)
    features_dataframe = pd.concat([features_dataframe, new_row], ignore_index=True)

  return features_dataframe

def create_statistical_features(coeffs, level):  

  feature_row = []
  for i in range(level + 1):
    feature_row = feature_row + concatenate_features(coeffs[i])

  return feature_row

def ploting_signal(dataframe, nrow, data_or_pre):
  n_row = dataframe.iloc[[nrow]]
  #condition = lambda x: n_row.drop(columns=['id','label']) if x == 1 else n_row.drop(columns=['label'])
  condition = lambda x: n_row.drop(columns=['instruction','label']) if x == 1 else n_row.drop(columns=['label'])
  n_row=condition(data_or_pre)
  n_row = n_row.values.tolist()
  n_row = [item for sublist in n_row for item in sublist]
  plt.figure(figsize=(12,6))
  plt.plot(np.arange(0,len(n_row)),n_row)
  plt.xlabel("t")
  plt.ylabel("a")
  plt.show()
  

def featureScalling(X): 
  for column in X.columns:
    X[column] = (X[column] - X[column].min()) /(X[column].max() - X[column].min())
  return X
  

# Models

In [28]:
def LogisticRegressionModel(X_train,y_train,X_test,y_test,iter=10000):
    model =LogisticRegression(max_iter=iter)
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    print('Mean Square Error:', metrics.mean_squared_error(y_test, prediction))
    print('Accuracy :', metrics.accuracy_score(y_test, prediction)) 
    return model,prediction

def cat_boost(X_train,y_train,X_test,y_test):
  cat_clf = CatBoostClassifier()
  cat_clf.fit(X_train,y_train)
  preds = cat_clf.predict(X_test)
  print('Accuracy :', metrics.accuracy_score(y_test, preds)) 
  return cat_clf,preds

def random_forest(X_train,y_train,X_test,y_test):
  rf = RandomForestClassifier()
  rf.fit(X_train,y_train)
  preds = rf.predict(X_test)
  print('Accuracy :', metrics.accuracy_score(y_test, preds)) 
  return rf,preds

# Choose Subject to Process

In [13]:
sub_number = '11'
session_num = 's0'

# Load Data

In [14]:
! pwd ## Fix appropriate folder below

/Users/michellekim/Documents/GitHub/EOG_classification/NoteBook


In [15]:
import pandas as pd
import glob
import os


# Define the folder path
folder_path = 'data_BlinkGaze/sub'+sub_number+session_num+'/blinkGazeStudy/EOG0_reformat'

# Use glob to find all CSV files in the folder
csv_files = glob.glob(os.path.join(folder_path, '*.csv'))

# Initialize an empty list to hold the DataFrames
dataframes = []

# Loop through the list of CSV files and read each one into a DataFrame
for file in csv_files:
    df = pd.read_csv(file)
    df["channel/polarity"] = "EOG0"
    new_column_names = ["label"] + [f'Sample{i}' for i in range(251)]
    df.columns = new_column_names
    dataframes.append(df)


folder_path = 'data_BlinkGaze/sub'+sub_number+ session_num+ '/blinkGazeStudy/EEG0_reformat'
# Use glob to find all CSV files in the folder
csv_files = glob.glob(os.path.join(folder_path, '*.csv'))

# Loop through the list of CSV files and read each one into a DataFrame
for file in csv_files:
    df = pd.read_csv(file)
    df["channel/polarity"] = "EEG0"
    new_column_names = ["label"] + [f'Sample{i}' for i in range(251)]
    df.columns = new_column_names
    dataframes.append(df)


folder_path = 'data_BlinkGaze/sub'+sub_number+ session_num+ '/blinkGazeStudy/EEG1_reformat'
# Use glob to find all CSV files in the folder
csv_files = glob.glob(os.path.join(folder_path, '*.csv'))

# Loop through the list of CSV files and read each one into a DataFrame
for file in csv_files:
    df = pd.read_csv(file)
    df["channel/polarity"] = "EEG1"
    new_column_names = ["label"] + [f'Sample{i}' for i in range(251)]
    df.columns = new_column_names
    dataframes.append(df)

In [16]:
dataframes

[            label        Sample0        Sample1        Sample2        Sample3  \
 0         unknown -671097.010634 -671060.802860 -671070.595086 -671161.387311   
 1         unknown -668334.067036 -668325.859262 -668278.651488 -668280.443713   
 2         unknown -668212.088682 -668247.880907 -668208.673133 -668180.465359   
 3         unknown -668114.110327 -668157.902553 -668170.694779 -668552.487004   
 4         unknown -668070.131973 -668022.924198 -668020.716424 -667970.508650   
 5         unknown -668004.153618 -668000.945844 -667940.738069 -667928.530295   
 6      whiteLabel -667612.175264 -667647.967489 -667798.759715 -668050.551941   
 7         unknown -667436.196909 -667414.989135 -667434.781360 -667505.573586   
 8      blinkLabel -667282.218555 -667246.010780 -667175.803006 -667184.595232   
 9         unknown -667000.240200 -667027.032426 -667199.824651 -667000.616877   
 10     whiteLabel -666712.261846 -666713.054071 -666706.846297 -667087.638522   
 11        unkno

In [17]:
# Concatenate all DataFrames into one big DataFrame
all_dataframe = pd.concat(dataframes, ignore_index=True)


data = all_dataframe
data

Unnamed: 0,label,Sample0,Sample1,Sample2,Sample3,Sample4,Sample5,Sample6,Sample7,Sample8,...,Sample241,Sample242,Sample243,Sample244,Sample245,Sample246,Sample247,Sample248,Sample249,Sample250
0,unknown,-671097.010634,-671060.802860,-671070.595086,-671161.387311,-671507.179537,-671023.971762,-671024.763988,-671290.556214,-671351.348439,...,-668385.937006,-668392.729231,-668529.521457,-668676.313683,-668303.105908,-668332.898134,-668323.690360,-668276.482585,-668295.274811,EOG0
1,unknown,-668334.067036,-668325.859262,-668278.651488,-668280.443713,-668419.235939,-668736.028164,-668245.820390,-668227.612616,-668215.404841,...,-668294.958651,-668257.750877,-668266.543103,-668482.335328,-668726.127554,-668253.919779,-668272.712005,-668241.504231,-668193.296456,EOG0
2,unknown,-668212.088682,-668247.880907,-668208.673133,-668180.465359,-668197.257584,-668282.049810,-668643.842035,-668155.634261,-668104.426487,...,-668290.980297,-668256.772522,-668233.564748,-668248.356974,-668405.149199,-668659.941425,-668183.733650,-668176.525876,-668178.318102,EOG0
3,unknown,-668114.110327,-668157.902553,-668170.694779,-668552.487004,-668289.279230,-668089.071455,-668163.863681,-668559.655907,-668130.448132,...,-668166.001942,-668184.794168,-668544.586394,-668176.378619,-668106.170845,-668114.963070,-668079.755296,-668058.547522,-668081.339747,EOG0
4,unknown,-668070.131973,-668022.924198,-668020.716424,-667970.508650,-668340.300875,-668202.093101,-667930.885326,-667964.677552,-667931.469778,...,-668100.023588,-668080.815813,-668046.608039,-668511.400265,-668178.192490,-668023.984716,-668040.776941,-667997.569167,-667978.361393,EOG0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2524,whiteLabel,10304.604623,10240.858489,10266.112356,10282.366222,10275.620088,10176.873954,10167.127821,10293.381687,10200.635553,...,16171.698828,16046.952694,16069.206561,15984.460427,15953.714293,15908.968159,16152.222026,16132.475892,16061.729758,EEG1
2525,unknown,16052.983625,16017.237491,15982.491357,15922.745223,16107.999090,16054.252956,16075.506822,16041.760688,16072.014555,...,17170.077830,17140.331696,16933.585562,17119.839428,17224.093295,17183.347161,17234.601027,17201.854893,17203.108760,EEG1
2526,gazeLeftLabel,17187.362626,17234.616492,17223.870359,17176.124225,17058.378091,17169.631957,17217.885824,17187.139690,17199.393556,...,19137.456831,19108.710697,19108.964564,19012.218430,19113.472296,19090.726162,19000.980029,19113.233895,19246.487761,EEG1
2527,unknown,19161.741627,19131.995494,19224.249360,19159.503226,19067.757093,18841.010959,19046.264825,19210.518691,19108.772558,...,19295.835833,19249.089699,19278.343565,19236.597431,19159.851298,19329.105164,19359.359030,19362.612896,19360.866763,EEG1


# Choose Labels to Use

In [41]:
data['label'].unique()

array(['unknown', 'whiteLabel', 'blinkLabel', 'gazeLeftLabel',
       'gazeRightLabel'], dtype=object)

In [18]:
# Define the labels of interest
labels_of_interest = ["gazeLRLabel", "white", "blinkLabel"]

# Filter the DataFrame
data_interest = data[data['label'].isin(labels_of_interest)]
data_interest

Unnamed: 0,label,Sample0,Sample1,Sample2,Sample3,Sample4,Sample5,Sample6,Sample7,Sample8,...,Sample241,Sample242,Sample243,Sample244,Sample245,Sample246,Sample247,Sample248,Sample249,Sample250
8,blinkLabel,-667282.218555,-667246.010780,-667175.803006,-667184.595232,-667316.387457,-667657.179683,-667169.971908,-667142.764134,-667157.556360,...,-667596.110170,-667299.902395,-667104.694621,-667273.486847,-667469.279072,-667033.071298,-667042.863523,-667051.655749,-666983.447975,EOG0
25,blinkLabel,-658503.586528,-658458.378753,-658421.170979,-658349.963205,-658630.755430,-658740.547656,-658299.339881,-658324.132107,-658295.924333,...,-657243.478143,-657234.270368,-657195.062594,-657405.854820,-657342.647045,-657222.439271,-657247.231497,-657216.023722,-657190.815948,EOG0
35,blinkLabel,-649152.802983,-649126.595208,-649112.387434,-649075.179659,-649546.971885,-649115.764111,-649026.556336,-649044.348562,-649025.140788,...,-648252.694598,-648276.486823,-648310.279049,-648750.071274,-648214.863500,-648172.655726,-648163.447951,-648119.240177,-648114.032403,EOG0
39,blinkLabel,-644866.889565,-644849.681790,-644794.474016,-644809.266241,-644918.058467,-644977.850693,-644729.642918,-644746.435144,-644732.227369,...,-643868.781180,-643834.573405,-643764.365631,-644092.157856,-644074.950082,-643714.742308,-643745.534533,-643726.326759,-643655.118984,EOG0
44,blinkLabel,-635828.032548,-635833.824774,-635979.617000,-635923.409225,-635723.201451,-635754.993677,-635735.785902,-635669.578128,-635708.370353,...,-634594.924163,-635003.716389,-634769.508615,-634486.300840,-634483.093066,-634469.885292,-634402.677517,-634435.469743,-634438.261968,EOG0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2497,blinkLabel,10122.284020,10116.537886,10096.791753,9950.045619,10124.299485,10218.553351,10152.807218,10084.061084,10149.314950,...,11066.378225,10935.632091,10831.885958,11032.139824,11025.393690,11002.647556,11011.901423,11071.155289,11047.409155,EEG1
2508,blinkLabel,4316.453036,4333.706903,4266.960769,4181.214635,4233.468501,4397.722368,4394.976234,4321.230100,4373.483966,...,9357.547241,9386.801108,9251.054974,9330.308840,9398.562706,9466.816573,9500.070439,9435.324305,9514.578172,EEG1
2511,blinkLabel,16384.590041,16389.843907,16382.097773,16243.351640,16393.605506,16451.859372,16480.113238,16393.367105,16457.620971,...,16528.684246,16456.938112,16349.191978,16576.445845,16632.699711,16650.953577,16635.207443,16621.461310,16558.715176,EEG1
2515,blinkLabel,24550.106047,24592.359913,24597.613779,24627.867645,24589.121512,24645.375378,24568.629244,24352.883110,24514.136977,...,8408.200252,8243.454118,7895.707984,8151.961851,8052.215717,8149.469583,8258.723449,8289.977316,8284.231182,EEG1


In [None]:
data_interest

# Pre Processing

**Create an Empty Data Frame**

In [19]:
cols = ["label"] + [f"sample{i}" for i in range(50)]
cols_noDownsample = ["label"] + [f"sample{i}" for i in range(250)]
data_interest= data_interest.iloc[:, 0:251]

#pre_processed_data=pre_processing_Of_data(data, cols) --> USE THIS IF YOU WANT TO DOWNSAMPLE TO 50 
pre_processed_data_noDownsample = pre_processing_Of_data_noDownsample(data_interest, cols_noDownsample)
pre_processed_data_noDownsample

  data_framee = pd.concat([data_framee, new_row], ignore_index=True)


Unnamed: 0,label,sample0,sample1,sample2,sample3,sample4,sample5,sample6,sample7,sample8,...,sample240,sample241,sample242,sample243,sample244,sample245,sample246,sample247,sample248,sample249
0,blinkLabel,554.819034,588.465541,592.159423,554.090671,501.332999,488.809825,541.438487,625.212035,690.255922,...,-528.576845,-533.960316,-523.959096,-505.780700,-491.463148,-462.093934,-404.268261,-338.895537,-287.587075,-248.590484
1,blinkLabel,-449.858753,-420.467287,-417.614486,-449.164501,-485.107011,-476.303880,-411.505046,-329.145613,-267.694095,...,-268.873888,-241.628881,-247.396759,-277.471208,-309.396865,-318.931369,-303.156090,-277.257950,-252.583065,-230.391625
2,blinkLabel,-817.019283,-819.397864,-841.641552,-879.011496,-894.260059,-858.667689,-792.460545,-735.599703,-703.263756,...,-384.183899,-399.949440,-442.765579,-491.658821,-498.886349,-441.263182,-352.453480,-280.011651,-239.838937,-220.681241
3,blinkLabel,-1090.234120,-1069.757804,-1063.842577,-1075.639683,-1087.508310,-1073.807122,-1030.433753,-977.590110,-932.875337,...,2944.240259,2307.161629,1970.479936,1867.338607,1886.647485,1972.700035,2089.935621,2199.518949,2284.057049,2352.570613
4,blinkLabel,-750.729934,-798.586756,-815.096848,-778.031649,-691.360751,-585.976159,-516.464045,-579.031840,-905.365187,...,-389.494386,-461.568943,-507.213230,-481.856539,-400.007387,-314.466986,-260.184578,-241.732380,-250.444207,-273.559024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
334,blinkLabel,-139.878986,-171.251229,-193.630368,-193.945090,-170.836021,-144.458705,-133.025358,-130.933387,-123.272899,...,-144.286693,-165.300056,-203.280850,-226.510729,-221.722756,-203.136151,-186.973746,-176.703961,-171.557505,-170.646230
335,blinkLabel,-1083.950509,-1125.434885,-1161.307317,-1176.630094,-1161.918573,-1131.051096,-1108.919637,-1105.442288,-1115.136527,...,-31.178093,-48.537412,-76.747731,-100.914961,-105.045706,-89.196536,-69.424320,-59.147712,-57.473428,-57.492463
336,blinkLabel,-37.857148,-58.645122,-74.291228,-72.819369,-48.636514,-15.490946,7.826986,18.023459,23.182109,...,-213.866664,-229.691203,-245.567628,-233.830380,-187.422349,-130.651520,-94.550327,-93.029476,-122.921337,-171.228691
337,blinkLabel,2972.313236,3035.640787,3090.933941,3131.626831,3152.701472,3152.631344,3143.666162,3156.329276,3213.155006,...,-1333.132071,-1295.358083,-1329.369810,-1368.153636,-1351.132649,-1269.905480,-1148.869647,-1019.134010,-903.663711,-804.717276


# Uncomment below if want to explore data more

In [194]:
# def detrend_data_new(data, order): 
#     """
#     Polyfit data with specified order and detrend each channel 
#     """
#     detrended_data = data.copy()
#     indices = np.arange(len(detrended_data))
    
#     for channel in ['eeg0', 'eeg1', 'eog0']:
#         detrended_data[channel] = detrended_data[channel].interpolate(method='linear')
#         coefficients = np.polyfit(indices, detrended_data[channel], order)
#         trend = np.polyval(coefficients, indices)
#         detrended_data[channel] = detrended_data[channel] - trend
#     return detrended_data

# ploting_signal(pre_processed_data,0,0)
# ploting_signal(pre_processed_data_noDownsample,5,0)



# Feature Extraction (Change range 20 --> 10 if downsample etc)

In [20]:
cols_features = ["label"] + [f"col{i}" for i in range(20)]
pre_processed_data_noDownsample

Unnamed: 0,label,sample0,sample1,sample2,sample3,sample4,sample5,sample6,sample7,sample8,...,sample240,sample241,sample242,sample243,sample244,sample245,sample246,sample247,sample248,sample249
0,blinkLabel,554.819034,588.465541,592.159423,554.090671,501.332999,488.809825,541.438487,625.212035,690.255922,...,-528.576845,-533.960316,-523.959096,-505.780700,-491.463148,-462.093934,-404.268261,-338.895537,-287.587075,-248.590484
1,blinkLabel,-449.858753,-420.467287,-417.614486,-449.164501,-485.107011,-476.303880,-411.505046,-329.145613,-267.694095,...,-268.873888,-241.628881,-247.396759,-277.471208,-309.396865,-318.931369,-303.156090,-277.257950,-252.583065,-230.391625
2,blinkLabel,-817.019283,-819.397864,-841.641552,-879.011496,-894.260059,-858.667689,-792.460545,-735.599703,-703.263756,...,-384.183899,-399.949440,-442.765579,-491.658821,-498.886349,-441.263182,-352.453480,-280.011651,-239.838937,-220.681241
3,blinkLabel,-1090.234120,-1069.757804,-1063.842577,-1075.639683,-1087.508310,-1073.807122,-1030.433753,-977.590110,-932.875337,...,2944.240259,2307.161629,1970.479936,1867.338607,1886.647485,1972.700035,2089.935621,2199.518949,2284.057049,2352.570613
4,blinkLabel,-750.729934,-798.586756,-815.096848,-778.031649,-691.360751,-585.976159,-516.464045,-579.031840,-905.365187,...,-389.494386,-461.568943,-507.213230,-481.856539,-400.007387,-314.466986,-260.184578,-241.732380,-250.444207,-273.559024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
334,blinkLabel,-139.878986,-171.251229,-193.630368,-193.945090,-170.836021,-144.458705,-133.025358,-130.933387,-123.272899,...,-144.286693,-165.300056,-203.280850,-226.510729,-221.722756,-203.136151,-186.973746,-176.703961,-171.557505,-170.646230
335,blinkLabel,-1083.950509,-1125.434885,-1161.307317,-1176.630094,-1161.918573,-1131.051096,-1108.919637,-1105.442288,-1115.136527,...,-31.178093,-48.537412,-76.747731,-100.914961,-105.045706,-89.196536,-69.424320,-59.147712,-57.473428,-57.492463
336,blinkLabel,-37.857148,-58.645122,-74.291228,-72.819369,-48.636514,-15.490946,7.826986,18.023459,23.182109,...,-213.866664,-229.691203,-245.567628,-233.830380,-187.422349,-130.651520,-94.550327,-93.029476,-122.921337,-171.228691
337,blinkLabel,2972.313236,3035.640787,3090.933941,3131.626831,3152.701472,3152.631344,3143.666162,3156.329276,3213.155006,...,-1333.132071,-1295.358083,-1329.369810,-1368.153636,-1351.132649,-1269.905480,-1148.869647,-1019.134010,-903.663711,-804.717276


In [38]:
pre_processed_data_noDownsample

Unnamed: 0,label,sample0,sample1,sample2,sample3,sample4,sample5,sample6,sample7,sample8,...,sample240,sample241,sample242,sample243,sample244,sample245,sample246,sample247,sample248,sample249
0,blinkLabel,554.819034,588.465541,592.159423,554.090671,501.332999,488.809825,541.438487,625.212035,690.255922,...,-528.576845,-533.960316,-523.959096,-505.780700,-491.463148,-462.093934,-404.268261,-338.895537,-287.587075,-248.590484
1,blinkLabel,-449.858753,-420.467287,-417.614486,-449.164501,-485.107011,-476.303880,-411.505046,-329.145613,-267.694095,...,-268.873888,-241.628881,-247.396759,-277.471208,-309.396865,-318.931369,-303.156090,-277.257950,-252.583065,-230.391625
2,blinkLabel,-817.019283,-819.397864,-841.641552,-879.011496,-894.260059,-858.667689,-792.460545,-735.599703,-703.263756,...,-384.183899,-399.949440,-442.765579,-491.658821,-498.886349,-441.263182,-352.453480,-280.011651,-239.838937,-220.681241
3,blinkLabel,-1090.234120,-1069.757804,-1063.842577,-1075.639683,-1087.508310,-1073.807122,-1030.433753,-977.590110,-932.875337,...,2944.240259,2307.161629,1970.479936,1867.338607,1886.647485,1972.700035,2089.935621,2199.518949,2284.057049,2352.570613
4,blinkLabel,-750.729934,-798.586756,-815.096848,-778.031649,-691.360751,-585.976159,-516.464045,-579.031840,-905.365187,...,-389.494386,-461.568943,-507.213230,-481.856539,-400.007387,-314.466986,-260.184578,-241.732380,-250.444207,-273.559024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
334,blinkLabel,-139.878986,-171.251229,-193.630368,-193.945090,-170.836021,-144.458705,-133.025358,-130.933387,-123.272899,...,-144.286693,-165.300056,-203.280850,-226.510729,-221.722756,-203.136151,-186.973746,-176.703961,-171.557505,-170.646230
335,blinkLabel,-1083.950509,-1125.434885,-1161.307317,-1176.630094,-1161.918573,-1131.051096,-1108.919637,-1105.442288,-1115.136527,...,-31.178093,-48.537412,-76.747731,-100.914961,-105.045706,-89.196536,-69.424320,-59.147712,-57.473428,-57.492463
336,blinkLabel,-37.857148,-58.645122,-74.291228,-72.819369,-48.636514,-15.490946,7.826986,18.023459,23.182109,...,-213.866664,-229.691203,-245.567628,-233.830380,-187.422349,-130.651520,-94.550327,-93.029476,-122.921337,-171.228691
337,blinkLabel,2972.313236,3035.640787,3090.933941,3131.626831,3152.701472,3152.631344,3143.666162,3156.329276,3213.155006,...,-1333.132071,-1295.358083,-1329.369810,-1368.153636,-1351.132649,-1269.905480,-1148.869647,-1019.134010,-903.663711,-804.717276


In [21]:
#featuered_data=get_featuers(pre_processed_data,4,cols_featuers)
featured_data=get_features(pre_processed_data_noDownsample,4,cols_features)

# labels_of_interest = ["gazeLeftLabel", "gazeRightLabel", "white", "blinkLabel"]
# featured_data = featured_data_pre[featured_data_pre['label'].isin(labels_of_interest)]
# featured_data

  features_dataframe = pd.concat([features_dataframe, new_row], ignore_index=True)


Unnamed: 0,label,col0,col1,col2,col3,col4,col5,col6,col7,col8,...,col10,col11,col12,col13,col14,col15,col16,col17,col18,col19
0,blinkLabel,-1296.466036,1.067450e+07,0.222384,-1.291118,32.592112,1.189482e+05,2.280310,5.395053,-6.205932,...,1.500367,8.228973,-0.758816,13162.649096,-0.831540,13.139023,-0.032388,161.783687,-1.907819,20.537317
1,blinkLabel,-189.842869,7.003066e+06,1.236684,0.698456,29.529061,2.546362e+05,2.199549,8.618868,11.347601,...,-0.691502,7.728076,-0.003364,14066.046043,-0.885858,11.268136,-0.029475,170.227596,-0.636890,16.362992
2,blinkLabel,39.575148,1.170459e+07,0.825663,-0.498510,21.397382,2.215027e+05,0.779026,5.179539,-2.944764,...,1.177342,6.351273,0.106804,15059.190841,-0.625604,9.680538,-0.101996,181.601029,-0.558885,10.736528
3,blinkLabel,3481.669928,4.914058e+07,0.417840,-1.095750,25.255673,7.208186e+04,0.170497,1.432489,9.344556,...,0.324745,6.877371,-2.264089,5638.517871,-0.565492,3.526673,-0.068800,178.481010,-0.576348,15.011622
4,blinkLabel,-5208.333636,1.126404e+07,-0.749231,-0.627901,-115.676147,5.501988e+05,-2.388153,10.169039,7.205881,...,1.043402,6.394120,1.036434,15213.643924,-0.631672,9.696969,-0.035787,179.508427,-0.708947,10.767143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
334,blinkLabel,-701.685301,1.012817e+05,-0.529889,0.411334,-2.781370,6.441756e+03,-0.103606,-0.510929,12.853780,...,-0.121712,-0.219756,1.475485,1025.765223,-0.177265,-0.556350,-0.024229,22.693861,0.196333,-0.358906
335,blinkLabel,-2481.433109,1.156753e+07,-1.086447,2.272646,-87.367679,3.230655e+05,-2.217313,6.631493,-29.209379,...,-2.272609,7.131287,0.358453,1962.169467,0.300762,2.351410,-0.044711,66.847472,2.238905,21.947897
336,blinkLabel,14.157715,2.576248e+05,-0.504344,-0.657163,22.361604,8.902340e+03,0.363547,-0.170898,1.000481,...,0.299097,-0.592113,0.964107,996.251619,0.008125,-0.917281,-0.039450,21.782616,0.319457,-0.165549
337,blinkLabel,7136.202174,1.739675e+08,-0.118979,-1.442889,258.025832,1.269819e+06,1.274142,2.543644,9.376490,...,1.543196,7.644947,-0.866442,40608.455476,-0.520915,12.999949,-0.044616,320.273527,0.142217,12.280348


array(['blinkLabel'], dtype=object)

# Label encoding

In [23]:
#label enconding for y
import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = featured_data
encoded_Data=df

le = LabelEncoder()

df['encoded_label'] = le.fit_transform(df['label'])
df = df.drop(["label"], axis=1)
encoded_Data=df
encoded_Data

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,...,col11,col12,col13,col14,col15,col16,col17,col18,col19,encoded_label
0,-1296.466036,1.067450e+07,0.222384,-1.291118,32.592112,1.189482e+05,2.280310,5.395053,-6.205932,69840.528778,...,8.228973,-0.758816,13162.649096,-0.831540,13.139023,-0.032388,161.783687,-1.907819,20.537317,0
1,-189.842869,7.003066e+06,1.236684,0.698456,29.529061,2.546362e+05,2.199549,8.618868,11.347601,115433.129430,...,7.728076,-0.003364,14066.046043,-0.885858,11.268136,-0.029475,170.227596,-0.636890,16.362992,0
2,39.575148,1.170459e+07,0.825663,-0.498510,21.397382,2.215027e+05,0.779026,5.179539,-2.944764,78338.847404,...,6.351273,0.106804,15059.190841,-0.625604,9.680538,-0.101996,181.601029,-0.558885,10.736528,0
3,3481.669928,4.914058e+07,0.417840,-1.095750,25.255673,7.208186e+04,0.170497,1.432489,9.344556,338144.943423,...,6.877371,-2.264089,5638.517871,-0.565492,3.526673,-0.068800,178.481010,-0.576348,15.011622,0
4,-5208.333636,1.126404e+07,-0.749231,-0.627901,-115.676147,5.501988e+05,-2.388153,10.169039,7.205881,84415.983738,...,6.394120,1.036434,15213.643924,-0.631672,9.696969,-0.035787,179.508427,-0.708947,10.767143,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
334,-701.685301,1.012817e+05,-0.529889,0.411334,-2.781370,6.441756e+03,-0.103606,-0.510929,12.853780,6364.064572,...,-0.219756,1.475485,1025.765223,-0.177265,-0.556350,-0.024229,22.693861,0.196333,-0.358906,0
335,-2481.433109,1.156753e+07,-1.086447,2.272646,-87.367679,3.230655e+05,-2.217313,6.631493,-29.209379,19931.215081,...,7.131287,0.358453,1962.169467,0.300762,2.351410,-0.044711,66.847472,2.238905,21.947897,0
336,14.157715,2.576248e+05,-0.504344,-0.657163,22.361604,8.902340e+03,0.363547,-0.170898,1.000481,5563.293501,...,-0.592113,0.964107,996.251619,0.008125,-0.917281,-0.039450,21.782616,0.319457,-0.165549,0
337,7136.202174,1.739675e+08,-0.118979,-1.442889,258.025832,1.269819e+06,1.274142,2.543644,9.376490,461436.320970,...,7.644947,-0.866442,40608.455476,-0.520915,12.999949,-0.044616,320.273527,0.142217,12.280348,0


In [42]:
!pwd

/Users/michellekim/Documents/GitHub/EOG_classification/NoteBook


# Train Model

In [32]:
X=encoded_Data.drop(["encoded_label"],axis=1)
X=featureScalling(X)
y=encoded_Data[["encoded_label"]]

X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.4, random_state=40,stratify=y)

# convert data to numpy array
#X_np=X.values
#y_np=y.values
#y_np=encoded_labels





## Check how many labels and Train/Val/Test Size

In [36]:
y_train

Unnamed: 0,encoded_label
27,0
145,0
59,0
277,0
133,0
...,...
35,0
90,0
274,0
152,0


In [37]:
y["encoded_label"].unique()

array([0])

In [34]:
print("size of training examples:  "+str(len(y_train)))
print("size of dev examples:  "+str(len(y_dev)))

size of training examples:  203
size of dev examples:  136


In [35]:
X_val, X_test, y_val, y_test = train_test_split(X_dev, y_dev, test_size=0.5, random_state=40,stratify=y_dev)

In [202]:
print("size of validation  examples:  "+str(len(y_val)))
print("size of test examples:  "+str(len(y_test)))

size of validation  examples:  277
size of test examples:  278


## catboost Model

In [157]:
# from catboost import CatBoostClassifier, Pool
# from sklearn.model_selection import GridSearchCV

# # Prepare the data as a CatBoost Pool object
# train_pool = Pool(X_train, y_train)

# # Define the parameter grid for grid search
# param_grid = {
#     'iterations': [500],  # Number of boosting iterations
#     'learning_rate': [0.1, 0.01, 0.001],  # Learning rate
#     'depth': [4,5,6],  # Maximum depth of the trees
#     'l2_leaf_reg': [5, 10, 15 , 20 , 25]  # L2 regularization strength
# }

# # Create an instance of CatBoostClassifier
# model = CatBoostClassifier()

# # Perform grid search using cross-validation
# grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
# grid_search.fit(X_train, y_train)

# # Get the best parameter combination and its corresponding performance
# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

# # Train the final model with the best parametersc
# final_model = CatBoostClassifier(**best_params)
# final_model.fit(X_train, y_train)
# # Make predictions on the test set
# y_pred = final_model.predict(X_test)


In [158]:
# best_params

In [159]:
# from catboost import CatBoostClassifier

# # Create an instance of CatBoostClassifier with regularization parameters
# model = CatBoostClassifier(
#     iterations=500,
#     learning_rate=best_params['learning_rate'],
#     l2_leaf_reg= best_params['l2_leaf_reg'],  # Set the L2 regularization strength
#     depth=5
# )

# # Train the model
# model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=False)

# # Make predictions
# train_preds=model.predict(X_train)
# print('train Accuracy :', metrics.accuracy_score(y_train, train_preds)) 

# test_preds=model.predict(X_val)
# print('val Accuracy :', metrics.accuracy_score(y_val, test_preds)) 

# test_preds=model.predict(X_test)
# print('test Accuracy :', metrics.accuracy_score(y_test, test_preds)) 


In [160]:
# model,preds=cat_boost(X_train,y_train,X_val,y_val)

In [161]:
# import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt
# from sklearn.metrics import confusion_matrix


# # Define the labels for the confusion matrix
# labels = y_test['encoded_label'].unique()

# true_label=y_test

# #prediction=preds 
# prediction= y_pred
# # Create confusion matrix
# cm = confusion_matrix(true_label,prediction, labels=labels)

# # Create a dataframe from the confusion matrix
# cm_df = pd.DataFrame(cm, index=labels, columns=labels)

# # Create heatmap using Seaborn
# sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')

# # Set labels and title
# plt.xlabel('Predicted')
# plt.ylabel('True')
# plt.title('Confusion Matrix')

# # Show the plot
# plt.show()


In [162]:
# model,preds=cat_boost(X_train,y_train,X_test,y_test)

In [461]:
# import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt
# from sklearn.metrics import confusion_matrix


# # Define the labels for the confusion matrix
# labels = y_test['encoded_label'].unique()

# true_label=y_test

# prediction=preds
# # Create confusion matrix
# cm = confusion_matrix(true_label,prediction, labels=labels)

# # Create a dataframe from the confusion matrix
# cm_df = pd.DataFrame(cm, index=labels, columns=labels)

# # Create heatmap using Seaborn
# sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')

# # Set labels and title
# plt.xlabel('Predicted')
# plt.ylabel('True')
# plt.title('Confusion Matrix')

# # Show the plot
# plt.show()


In [208]:
y_train

Unnamed: 0,encoded_label
238,0
1150,0
1256,0
1192,0
307,0
...,...
493,0
782,0
204,0
635,0


## Random Forest

In [203]:
model,preds=random_forest(X_train,y_train,X_val,y_val)

Accuracy : 1.0


  return fit_method(estimator, *args, **kwargs)


In [204]:
# Make predictions
train_preds=model.predict(X_train)
print('train Accuracy :', metrics.accuracy_score(y_train, train_preds)) 

test_preds=model.predict(X_val)
print('val Accuracy :', metrics.accuracy_score(y_val, test_preds)) 

test_preds=model.predict(X_test)
print('test Accuracy :', metrics.accuracy_score(y_test, test_preds)) 


train Accuracy : 1.0
val Accuracy : 1.0
test Accuracy : 1.0


In [205]:
test_preds

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [166]:
!pwd

/Users/michellekim/Documents/GitHub/EOG_classification/NoteBook


In [206]:
X

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14,col15,col16,col17,col18,col19
0,0.641245,0.000338,0.510407,0.122509,0.748946,0.001013,0.125793,0.564362,0.700009,0.004494,0.161982,0.628954,0.651787,0.005464,0.464097,0.163202,0.610384,0.014762,0.267591,0.268599
1,0.640447,0.000299,0.446414,0.123084,0.779884,0.000308,0.782637,0.437269,0.694158,0.002136,0.640188,0.345155,0.620687,0.015750,0.424212,0.536822,0.581365,0.018081,0.187927,0.308911
2,0.643730,0.000370,0.531713,0.089267,0.768464,0.000332,0.739996,0.374290,0.728286,0.001826,0.616667,0.361426,0.583760,0.011474,0.418162,0.507059,0.635665,0.014031,0.089928,0.402604
3,0.645340,0.000344,0.474860,0.062228,0.714258,0.001547,0.130977,0.633416,0.775692,0.001061,0.699124,0.382291,0.621007,0.004069,0.548021,0.171481,0.671909,0.015965,0.225169,0.383527
4,0.647824,0.000235,0.407692,0.215798,0.719748,0.001497,0.000160,0.793614,0.724125,0.001950,0.594823,0.375204,0.604375,0.011592,0.441653,0.463153,0.621034,0.013313,0.225242,0.325023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1381,0.649379,0.000021,0.699413,0.268828,0.757355,0.000041,0.611431,0.189529,0.750009,0.000274,0.479934,0.037367,0.629534,0.005808,0.639662,0.036287,0.691218,0.011753,0.400239,0.019216
1382,0.650562,0.000106,0.400422,0.122953,0.756608,0.000014,0.604385,0.206398,0.740417,0.000332,0.491832,0.053942,0.621068,0.001361,0.629482,0.102296,0.686542,0.003600,0.367617,0.017010
1383,0.653835,0.000017,0.478140,0.116559,0.758352,0.000014,0.450893,0.009393,0.750300,0.000147,0.440848,0.047060,0.626444,0.000819,0.621587,0.044067,0.667772,0.002215,0.378036,0.016485
1384,0.627827,0.000825,0.606250,0.119302,0.774058,0.000606,0.532821,0.423664,0.728169,0.001424,0.560797,0.390612,0.628209,0.003518,0.894602,0.581293,0.636501,0.004103,0.420499,0.233924


In [170]:
predicted_blinks= 0 
predicted_leftgaze = 0
predicted_rightgaze=0 
sub_num = sub_number

if sub_num == '4' or sub_num =='24':
    folder_path = 'data_BlinkGaze/sub' + str(sub_num) + 's1/' + str(sub_num) + '_data_wordBlink.csv'
else:
    folder_path = 'data_BlinkGaze/sub' + str(sub_num) + 's0/' + str(sub_num) + '_data_wordBlink.csv'

df = pd.read_csv(folder_path)

# get input for model
X = pre_processed_data=pre_processing_Of_data_noDownsample(data_interest, cols_noDownsample)
featuered_data=get_features(pre_processed_data,4,cols_features)

featuered_data = featuered_data.drop(columns=['label'])
featuered_data = featuered_data.fillna(0)

# Make predictions
preds=model.predict(featuered_data)
preds = preds.squeeze()
print(preds)
for pred in preds:
    if pred == 0:
        predicted_blinks += 1
    elif pred == 1:
        predicted_leftgaze += 1
    else:
        predicted_rightgaze += 1

print('subject ' + str(sub_num))

print('blinks: ' +  str(predicted_blinks))
print('leftgaze: ' + str(predicted_leftgaze))
print('rightgaze: ' + str(predicted_rightgaze))

#print('----------------------------------------------')

  data_framee = pd.concat([data_framee, new_row], ignore_index=True)
  features_dataframe = pd.concat([features_dataframe, new_row], ignore_index=True)


[0 0 0 ... 2 2 2]
subject 6
blinks: 792
leftgaze: 0
rightgaze: 348


In [172]:
X_train

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14,col15,col16,col17,col18,col19
502,0.031668,2.349323e-07,0.131550,0.789123,0.342874,0.000039,0.547582,0.165223,0.032752,0.000031,0.567538,0.131747,0.963263,0.000030,0.453558,0.020802,0.084528,0.000029,0.436904,0.012299
39,0.033818,0.000000e+00,0.000000,0.000000,0.340872,0.000000,0.000000,0.000000,0.032073,0.000000,0.000000,0.000000,0.952295,0.000000,0.000000,0.000000,0.084323,0.000000,0.000000,0.000000
346,0.033818,0.000000e+00,0.000000,0.000000,0.340872,0.000000,0.000000,0.000000,0.032073,0.000000,0.000000,0.000000,0.952295,0.000000,0.000000,0.000000,0.084323,0.000000,0.000000,0.000000
508,0.029507,3.552536e-06,0.650231,0.123064,0.344614,0.000036,0.830326,0.561259,0.028684,0.000026,0.595888,0.076855,0.963096,0.000034,0.441074,0.010680,0.079490,0.000030,0.419506,0.008555
251,0.033818,0.000000e+00,0.000000,0.000000,0.340872,0.000000,0.000000,0.000000,0.032073,0.000000,0.000000,0.000000,0.952295,0.000000,0.000000,0.000000,0.084323,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
808,0.041029,1.355773e-05,0.777472,0.155407,0.338359,0.000176,0.323166,0.205043,0.049183,0.000209,0.669028,0.022290,0.917697,0.000288,0.425776,0.021120,0.143765,0.000290,0.318531,0.005524
695,0.033421,7.413021e-07,0.623481,0.087525,0.336841,0.000085,0.361575,0.303450,0.025244,0.000017,0.701215,0.039903,0.960143,0.000032,0.449578,0.013544,0.065772,0.000031,0.428750,0.009030
529,0.034013,1.121194e-06,0.435073,0.077092,0.340550,0.000035,0.757522,0.476117,0.025763,0.000021,0.702560,0.009958,0.957868,0.000029,0.464207,0.032293,0.065263,0.000032,0.401469,0.009009
1023,0.046963,1.161031e-05,0.735021,0.136568,0.340957,0.000064,0.469528,0.079204,0.045399,0.000235,0.720143,0.037797,0.921627,0.000315,0.443702,0.011680,0.109050,0.000266,0.327025,0.002996


## Logistic Regression Model

In [180]:
X_train = X_train.fillna(X_train.mean())
X_val = X_val.fillna(X_val.mean()) 
X_test = X_test.fillna(X_test.mean()) 

model,preds=LogisticRegressionModel(X_train.values,y_train.values,X_val.values,y_val.values)

Mean Square Error: 0.9824561403508771
Accuracy : 0.7149122807017544


  y = column_or_1d(y, warn=True)


In [181]:
# Make predictions
train_preds=model.predict(X_train)
print('train Accuracy :', metrics.accuracy_score(y_train, train_preds)) 

test_preds=model.predict(X_val)
print('val Accuracy :', metrics.accuracy_score(y_val, test_preds)) 

test_preds=model.predict(X_test)
print('test Accuracy :', metrics.accuracy_score(y_test, test_preds)) 

train Accuracy : 0.7266081871345029
val Accuracy : 0.7149122807017544
test Accuracy : 0.7192982456140351




## KNN classfier

In [182]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Create a KNN classifier with k=5
knn = KNeighborsClassifier(n_neighbors=5)

# Train the classifier
knn.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn.predict(X_val)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.6666666666666666


  return self._fit(X, y)


### PCA

In [183]:
import numpy as np
from sklearn.decomposition import PCA
# create a PCA object with two components
pca = PCA(n_components=10)

# fit the PCA model to the data and transform the data
Xtrain_pca = pca.fit_transform(X_train)
Xtest_pca=pca.transform(X_test)
# print the explained variance ratio of the two principal components
print(pca.explained_variance_ratio_)

[0.76266755 0.0664058  0.05695934 0.03417072 0.02774405 0.01709985
 0.01628572 0.00743057 0.00613078 0.00297449]


In [184]:
model,preds=LogisticRegressionModel(Xtrain_pca,y_train,Xtest_pca,y_test)

Mean Square Error: 0.9780701754385965
Accuracy : 0.7192982456140351


  y = column_or_1d(y, warn=True)


In [185]:
#after applaying pca
train_preds=model.predict(Xtrain_pca)
print('train Accuracy :', metrics.accuracy_score(y_train, train_preds)) 
test_preds=model.predict(Xtest_pca)
print('test Accuracy :', metrics.accuracy_score(y_test, test_preds)) 

train Accuracy : 0.7251461988304093
test Accuracy : 0.7192982456140351


#Bouns PSD feature extraction

In [431]:
#cols_featuers = ["label"] + [f"col{i}" for i in range(7)]

In [432]:
#featuered_data=get_featuers_psd(pre_processed_data,2,cols_featuers)


In [433]:
#featuered_data.head()

In [434]:
#len(featuered_data)

# Label encoding

In [435]:
# #df=featuered_data
# encoded_Data=df

In [436]:
# df.head(5)

# Models

 ## data prepration for models

In [437]:
# X=encoded_Data.drop(["label"],axis=1)
# X=FeatureScalling(X)
# y=encoded_Data[["label"]]
# # convert data to numpy array
# #X_np=X.values
# #y_np=y.values
# #y_np=encoded_labels

In [438]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42,stratify=y)

In [81]:
# print("size of trining examples:  "+str(len(y_train)))
# print("size of dev examples:  "+str(len(y_test)))

In [82]:
# y_test

## USING ALL THE DATA

### Load all data

In [53]:
# # task 1
# import pandas as pd
# import glob
# import os
# import pandas as pd
# from sklearn.preprocessing import LabelEncoder

# predicted_blinks = 0
# predicted_leftgaze = 0
# predicted_rightgaze = 0

# #sub_nums = [4,5,6,7,11,12,13,14,15,17,18,20,21,22,24]
# sub_nums = [4,5,7,11,12,15,17, 20,21,22,24]

# dataframes = []
# for sub_num in sub_nums:
#     # Define the folder path
#     if sub_num == 4 or sub_num ==24:
#         folder_path = 'data_analysis/data_BlinkGaze/sub' + str(sub_num) + 's1/blinkGazeStudy/EEG0_reformat'
#     else:
#         folder_path = 'data_analysis/data_BlinkGaze/sub' + str(sub_num) + 's0/blinkGazeStudy/EEG0_reformat'

#     # Use glob to find all CSV files in the folder
#     csv_files = glob.glob(os.path.join(folder_path, '*.csv'))

#     # Initialize an empty list to hold the DataFrames

#     # Loop through the list of CSV files and read each one into a DataFrame
#     for file in csv_files:
#         df = pd.read_csv(file)
#         df["channel/polarity"] = "EOG0"
#         new_column_names = ["label"] + [f'Sample{i}' for i in range(251)]
#         df.columns = new_column_names
#         dataframes.append(df)

#     if sub_num == 4 or sub_num ==24:
#         folder_path = 'data_analysis/data_BlinkGaze/sub' + str(sub_num) + 's1/blinkGazeStudy/EEG1_reformat'
#     else:
#         folder_path = 'data_analysis/data_BlinkGaze/sub' + str(sub_num) + 's0/blinkGazeStudy/EEG1_reformat'
#     # Use glob to find all CSV files in the folder
#     csv_files = glob.glob(os.path.join(folder_path, '*.csv'))

#     # Loop through the list of CSV files and read each one into a DataFrame
#     for file in csv_files:
#         df = pd.read_csv(file)
#         df["channel/polarity"] = "EEG0"
#         new_column_names = ["label"] + [f'Sample{i}' for i in range(251)]
#         df.columns = new_column_names
#         dataframes.append(df)

#     if sub_num == 4 or sub_num ==24:
#         folder_path = 'data_analysis/data_BlinkGaze/sub' + str(sub_num) + 's1/blinkGazeStudy/EOG0_reformat'
#     else:
#         folder_path = 'data_analysisdata_BlinkGaze/sub' + str(sub_num) + 's0/blinkGazeStudy/EOG0_reformat'
#     # Use glob to find all CSV files in the folder
#     csv_files = glob.glob(os.path.join(folder_path, '*.csv'))

#     # Loop through the list of CSV files and read each one into a DataFrame
#     for file in csv_files:
#         df = pd.read_csv(file)
#         df["channel/polarity"] = "EEG1"
#         new_column_names = ["label"] + [f'Sample{i}' for i in range(251)]
#         df.columns = new_column_names
#         dataframes.append(df)

# # Concatenate all DataFrames into one big DataFrame
# all_dataframe = pd.concat(dataframes, ignore_index=True)


# data = all_dataframe

# # pre processing
# cols = ["label"] + [f"sample{i}" for i in range(50)]
# data= data.iloc[:, 0:251]
# pre_processed_data=pre_processing_Of_data_noDownsample(data, cols_noDownsample)

# # feature extraction
# cols_features = ["label"] + [f"col{i}" for i in range(20)]
# featuered_data=get_features(pre_processed_data,4,cols_features)

# # label encoding
# df=featuered_data
# encoded_Data=df

# #label enconding for y

# df = featuered_data

# le = LabelEncoder()

# df['encoded_label'] = le.fit_transform(df['label'])
# df = df.drop(["label"], axis=1)
# encoded_Data=df

# # preparing data
# X=encoded_Data.drop(["encoded_label"],axis=1)
# X=FeatureScalling(X)
# y=encoded_Data[["encoded_label"]]
# X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.4, random_state=40,stratify=y)
# X_val, X_test, y_val, y_test = train_test_split(X_dev, y_dev, test_size=0.5, random_state=40,stratify=y_dev)

# model, preds=random_forest(X_train,y_train,X_val,y_val)



In [186]:
# # task 2 predicting for wordBlink
# sub_nums = [4,5,7,11,12,15,17, 20,21,22,24]

# for sub_num in sub_nums:
#     predicted_blinks= 0 
#     predicted_leftgaze = 0
#     predicted_rightgaze=0 

#     if sub_num == 4 or sub_num ==24:
#         folder_path = 'data_BlinkGaze/sub' + str(sub_num) + 's1/' + str(sub_num) + '_data_wordBlink.csv'
#     else:
#         folder_path = 'data_BlinkGaze/sub' + str(sub_num) + 's0/' + str(sub_num) + '_data_wordBlink.csv'
    
#     df = pd.read_csv(folder_path)
   
#     # get input for model
#     X = pre_processed_data=pre_processing_Of_data_noDownsample(data, cols_noDownsample)
#     featuered_data=get_features(pre_processed_data,4,cols_features)
    
#     # Make predictions
#     preds=model.predict(X)
#     preds = preds.squeeze()
#     print(preds)
#     for pred in preds:
#         if pred == 0:
#             predicted_blinks += 1
#         elif pred == 1:
#             predicted_leftgaze += 1
#         else:
#             predicted_rightgaze += 1

#     print('subject ' + str(sub_num))

#     print('blinks: ' +  str(predicted_blinks))
#     print('leftgaze: ' + str(predicted_leftgaze))
#     print('rightgaze: ' + str(predicted_rightgaze))

#     print('----------------------------------------------')

In [None]:
# #pre_processed_data=pre_processing_Of_data(data, cols)
# pre_processed_data_noDownsample = pre_processing_Of_data_noDownsample(data, cols_noDownsample)