In [2]:
import random as rnd
import pandas as pd
import numpy as np


from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import normalize

# First, import data and structure them appropriately

### Define the function to import the data, structure and statistically reduce them 

In [3]:
def arrays_frame(filenames, n=12):
    ''' Returns a list, to be appended to a DataFrame, 
        with entries vectors of length 2*n with n EAR values and n boolean values describing the blinking.
        
        INPUT:
            - filenames: an iterable containing the names if the .csv files to use.
                         The files must be made of two columns, one with ear values one with 0 or 1 for the blink stamps
            - n: the length of EAR values to use around the blink
            
        OUTPUT:
            - the pd.DataFrame with the arrays of length n+1
    '''
    
    data_list=[]      # appending values to a list and then appending the list to DataFrame is less expensive
    
    for filename in filenames:
        data=pd.read_csv(filename, header=None)
        
        l=data.shape[0]
        for i in range(n//2+1, l-n//2):
            tmp= data[i-n//2: i+n//2]
            tmp= tmp.values
            tmp= tmp.flatten()
            tmp=np.append(tmp[0::2],
                          data.iloc[i, 1])
            tmp= tmp.reshape(n+1)
            
            data_list.append(tmp)
    
    # STATISTICAL TREATMENT
    # Since we want to have the same number of succesful and not succesful rows, 
    # randomly delete rows with last value 0 till I have the same number of 0 and 1.
    # We also normalize the data
    
    data_frame= pd.DataFrame(data_list)
    data_frame.iloc[:, -1]= data_frame.iloc[:, -1].apply(int)
    
    n_blinks= data_frame.iloc[:,-1].sum()
    n_tot= data_frame.shape[0]
    
    to_keep= rnd.sample(set((data_frame.index[data_frame.iloc[:,-1]==0]).values),
                          n_blinks)
    
    mask= pd.Series((range(data_frame.shape[0]))).isin(to_keep) | data_frame.iloc[:, -1]==1
    
    data_frame= data_frame[mask]
    data_frame.iloc[:,:-1]= normalize(data_frame.iloc[:,:-1], copy=True)
    
    return data_frame

### Import data with the function above. Returns the DataFrame to train the models with

In [4]:
n=12
data_all=arrays_frame(['andrea1.csv',
                       'martina.csv',
                       'davide1.csv',
                       'nicolo3.csv',
                       'gaia.csv',
                       'altea.csv',
                       'nicolo1.csv',
                       'martina3.csv',
                       'martina4.csv',
                       'matteo1.csv',
                       'enrico.csv',
                       'umberto.csv',
                       'tosi.csv',
                       'nicolo4.csv',
                       'ale_bella.csv',
                       'maura_base2.csv',
                       'eddie_vedder.csv'
                      ],
                      n) 

print('Data shape: {}'.format(data_all.shape))
print('There are {} blinks and {} non-blinks in the frame'.
      format(data_all.iloc[:,-1].sum(),
             (data_all.iloc[:,-1]==0).sum()))
data_all.head()

Data shape: (722, 13)
There are 361 blinks and 361 non-blinks in the frame


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
3,0.350926,0.339526,0.336249,0.349676,0.338624,0.209653,0.209653,0.199516,0.193708,0.270915,0.271841,0.314706,1
55,0.323041,0.309529,0.297391,0.311573,0.305418,0.153124,0.16554,0.261665,0.261378,0.318798,0.327537,0.352825,1
77,0.351462,0.353965,0.342289,0.34307,0.353965,0.152897,0.152906,0.20324,0.198659,0.282941,0.282941,0.321546,1
93,0.280283,0.291296,0.280868,0.300499,0.293182,0.284459,0.285662,0.285107,0.292601,0.283904,0.283904,0.301366,0
107,0.318569,0.337807,0.328501,0.328501,0.338445,0.328338,0.167913,0.200931,0.223173,0.276548,0.262302,0.287345,0


# Now try some classifiers to test predictions

In [5]:
X= data_all.iloc[:, :-1]
y= data_all.iloc[:, -1]

### First cross validation

#### Bernoulli Naive-Bayes

In [6]:
modelB= BernoulliNB()

cv_scores = cross_val_score(modelB, X, y, cv=10)

print("***** Cross Validation Summary *****")
for v in range(0, len(cv_scores)):
    print("R2 score on fold-{} = [{}]".format(v+1, cv_scores[v]))
    
print('\nAverage R2 score: {}'.format(cv_scores.mean()))

***** Cross Validation Summary *****
R2 score on fold-1 = [0.5]
R2 score on fold-2 = [0.5]
R2 score on fold-3 = [0.5]
R2 score on fold-4 = [0.5]
R2 score on fold-5 = [0.5]
R2 score on fold-6 = [0.5]
R2 score on fold-7 = [0.5]
R2 score on fold-8 = [0.5]
R2 score on fold-9 = [0.5]
R2 score on fold-10 = [0.5]

Average R2 score: 0.5


#### Support Vector Machines

In [7]:
modelS= SVC()

cv_scores = cross_val_score(modelS, X, y, cv=10)

print("***** Cross Validation Summary *****")
for v in range(0, len(cv_scores)):
    print("R2 score on fold-{} = [{}]".format(v+1, cv_scores[v]))
    
print('\nAverage R2 score: {}'.format(cv_scores.mean()))

***** Cross Validation Summary *****
R2 score on fold-1 = [0.8243243243243243]
R2 score on fold-2 = [0.7916666666666666]
R2 score on fold-3 = [0.875]
R2 score on fold-4 = [0.9027777777777778]
R2 score on fold-5 = [0.75]
R2 score on fold-6 = [0.8055555555555556]
R2 score on fold-7 = [0.7222222222222222]
R2 score on fold-8 = [0.6666666666666666]
R2 score on fold-9 = [0.8055555555555556]
R2 score on fold-10 = [0.75]

Average R2 score: 0.789376876876877


#### Decision Tree

In [8]:
modelD= DecisionTreeClassifier()

cv_scores = cross_val_score(modelD, X, y, cv=10)

print("***** Cross Validation Summary *****")
for v in range(0, len(cv_scores)):
    print("R2 score on fold-{} = [{}]".format(v+1, cv_scores[v]))

print('\nAverage R2 score: {}'.format(cv_scores.mean()))

***** Cross Validation Summary *****
R2 score on fold-1 = [0.8108108108108109]
R2 score on fold-2 = [0.7222222222222222]
R2 score on fold-3 = [0.9027777777777778]
R2 score on fold-4 = [0.8611111111111112]
R2 score on fold-5 = [0.875]
R2 score on fold-6 = [0.7361111111111112]
R2 score on fold-7 = [0.7222222222222222]
R2 score on fold-8 = [0.8194444444444444]
R2 score on fold-9 = [0.75]
R2 score on fold-10 = [0.7083333333333334]

Average R2 score: 0.7908033033033033


#### Random Forest

In [9]:
modelF= RandomForestClassifier()

cv_scores = cross_val_score(modelF, X, y, cv=10)

print("***** Cross Validation Summary *****")
for v in range(0, len(cv_scores)):
    print("R2 score on fold-{} = [{}]".format(v+1, cv_scores[v]))

print('\nAverage R2 score: {}'.format(cv_scores.mean()))

***** Cross Validation Summary *****
R2 score on fold-1 = [0.8648648648648649]
R2 score on fold-2 = [0.75]
R2 score on fold-3 = [0.8888888888888888]
R2 score on fold-4 = [0.9444444444444444]
R2 score on fold-5 = [0.8611111111111112]
R2 score on fold-6 = [0.8333333333333334]
R2 score on fold-7 = [0.8611111111111112]
R2 score on fold-8 = [0.8194444444444444]
R2 score on fold-9 = [0.875]
R2 score on fold-10 = [0.8333333333333334]

Average R2 score: 0.8531531531531531


### Now import a data set to test and try all the classifiers above

In [10]:
test= arrays_frame(['andrea1.csv'], n)
test_values= test.iloc[:, :-1]
test_res= test.iloc[:, -1]

print('There are {} blinks'.format(test_res.sum()))

There are 10 blinks


In [11]:
_= modelB.fit(X,y)
_= modelS.fit(X,y)
_= modelD.fit(X,y)
_= modelF.fit(X,y)

predB= modelB.predict(test_values)
predS= modelS.predict(test_values)
predD= modelD.predict(test_values)
predF= modelF.predict(test_values)

## Now import some raw data to make actual predictions

In [12]:
def import_data(filenames, n=12):
    ''' Imports the data and arrange them to be used for the prediction
        
        INPUT:
            - filenames: an iterable containing the names if the .csv files to use, they have to be a double column
            - n: the length of the array of EAR values to use
            
        OUTPUT:
            - the pd.DataFrame with the arrays of length n
    '''
    
    data_list=[]      # appending values to a list and then appending the list to DataFrame is less expensive
    
    for filename in filenames:
        data=pd.read_csv(filename, header=None)
        data= data.iloc[:,1]
        
        l=data.shape[0]
        for i in range(n//2+1, l-n//2):
            tmp= data[i-n//2: i+n//2]
            tmp= tmp.values
            tmp= tmp.flatten()
            
            data_list.append(tmp)
    
    
    data_frame= pd.DataFrame(data_list)
    normalize(data_frame.iloc[:,:-1])
    
    return data_frame

### The number of predictions would be correct if we made a grouping of consecutive 1s.
### Let's make a function to do so 

In [13]:
def process_prediction(array_orig):
    array=array_orig.copy()
    i=0
    while i<len(array)-1:
        if array[i]==1:
            count=0
            j=i
            zero_count=0
            while j<len(array) and (array[j]==1 or (zero_count<=1 and array[j-1]==1)):
                count+=1
                j+=1
                if array[i]==0:
                    zero_count+=1
                
            if count>1:
                array[i+1:i+count]=0
            i=j
        elif array[i]==0:
            i+=1
    return array


pred= modelF.predict(test_values) 
print(pred.sum())
print('Processed prediction \n',
      process_prediction(pred),
      '\nReal prediction \n',
      pred)

11
Processed prediction 
 [1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0] 
Real prediction 
 [1 0 0 0 1 0 1 1 1 1 0 0 1 0 0 1 1 1 0 1]


## A function to plot the blinking frequency, if needed

In [14]:
def frequency(data, fps=28.5, N=60):
    ''' Plot the frequency of the blinks, computed over the last N seconds
        
        INPUT:
            - data: a pd.Series or np 1-dimensional array
            - fps: frames per second of the video, allowas to transfrom from frames to time
            - N: the number of previous frames to compute the frequency over. Makes the frequency more or less sensitive
        OUTPUT:
            - the list with the frequencies on any frame computed over the previous N frames
    '''
    
    
    freq=[]
    
    for i in range(N,len(data)):
        freq.append( data.iloc[i:i+N].mean() *60 *fps )
    
    return freq   

#d=pd.read_csv('eardata.csv').iloc[:, 1]

#_=plt.plot(frequency(d, 28.5, 200)  )

## Make function to make a frame from the ear values array

In [15]:
def arrange_raw(raw_data, n=12):
    '''
        INPUT: a pd.Series with only ear values
        OUTPUT: a pd.DataFrame to feed the classifier
    '''
    tmp= raw_data.values.tolist()
    
    array=[]

    for i in range(n,len(tmp)):
        array.append(tmp[i-n:i])
    
    data_frame=pd.DataFrame(array)
    normalize(data_frame.iloc[:,:-1])
    
    return array

### Try to predict

In [16]:
gaia= pd.read_csv('eddie_vedder.csv')

X=gaia.iloc[:,0]
y=gaia.iloc[:,1]



gaia_pred= process_prediction(modelS.predict(arrange_raw(X,n))) 

predicted_blinks=gaia_pred.sum()
real_blinks=y.sum()

print('Predicted {} blinks in the video'.format(predicted_blinks))
print(gaia_pred)

print('There are {} blink in the video'.format(real_blinks))
print(np.array(y))

print('Total number of frames: {}'.format(len(y)))

print('Error rate:{}'.format(abs(predicted_blinks-real_blinks)/len(y)))

Predicted 39 blinks in the video
[0 0 0 ..., 0 0 0]
There are 41 blink in the video
[0 0 0 ..., 0 0 0]
Total number of frames: 1654
Error rate:0.0012091898428053204


## Now try some video to be compared with the algorithmic approach

In [17]:
def verify_video(filename):
    
    tmp=pd.read_csv(filename, header=None)
    fps=1/(tmp.iloc[1,0])
    print(fps)
    ear_frame= arrange_raw(tmp.iloc[:,1])
    y=process_prediction(modelS.predict(ear_frame))

    
    print('Video length in seconds: {}'.format(len(y)/fps))
    print('The number of blinks predicted is {}'.format(y.sum()))
    print('They have been detected at the following times:')
    print((np.array(range(len(y)))[y==1])/fps)

In [18]:
verify_video('./comparison_videos/altea/eardata_altea_lie.csv')

30.0702963648
Video length in seconds: 124.80754943243971
The number of blinks predicted is 82
They have been detected at the following times:
[   2.09509076    2.49415566    3.82437202    5.45388705    5.88620737
    9.84360102   13.63471763   14.99818939   18.49000732   19.68720204
   20.21928858   22.5471672    22.81321047   25.67317564   27.43571231
   27.96779885   31.09380728   32.291002     34.48585898   36.21514024
   36.94675924   37.97767691   39.90649063   41.33647321   42.10134761
   46.02548586   47.18942517   48.71917397   49.05172806   52.01145945
   53.57446367   55.03770165   56.46768424   57.831156     58.86207367
   59.52718185   59.85973594   60.72437657   61.35622934   61.9548267
   62.52016865   63.28504305   64.11642827   66.31128526   66.64383935
   70.50146677   70.93378709   71.66540608   73.76049684   74.42560502
   76.58720659   79.91274748   81.54226251   83.63735327   84.03641818
   84.40222767   88.69217542   94.71140442   95.27674637   96.47394109
   97.