In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook


In [2]:
data_df = pd.read_csv('./data/NAPS_valence_arousal_2014.csv', sep=';')
data_df.head()

Unnamed: 0,ID,Category,Nr,V_H,Description,Valence,Arousal
0,Animals_001_h,Animals,1,h,Dead Stork,257,644
1,Animals_002_v,Animals,2,v,Lion,624,668
2,Animals_003_h,Animals,3,h,Snake,524,552
3,Animals_004_v,Animals,4,v,Wolf,450,702
4,Animals_005_h,Animals,5,h,Bat,531,582


In [3]:
data_df['Valence_float'] = data_df['Valence'].map(lambda x: float(x.replace(',','.')))
data_df['Arousal_float'] = data_df['Arousal'].map(lambda x: float(x.replace(',','.')))

data_df.describe()

Unnamed: 0,Nr,Valence_float,Arousal_float
count,1356.0,1356.0,1356.0
mean,144.870944,5.391947,5.101881
std,92.42842,1.628732,1.058333
min,1.0,1.33,2.04
25%,68.0,4.06,4.43
50%,136.0,5.66,4.98
75%,208.25,6.7725,5.82
max,372.0,8.54,8.05


In [4]:
def mapping(x, labels, series, percentile_width):
        for i in range(len(labels)):
            if x < series.quantile((i + 1) * percentile_width):
                return labels[i]

def divide_into(labels, series):
    range = series.max() - series.min()
    percentile_width = 1 / len(labels)
    
    return series.map(lambda x: mapping(x, labels, series, percentile_width))

data_df['Valence_class'] = divide_into(['low', 'medium', 'high'], data_df['Valence_float'])
data_df['Arousal_class'] = divide_into(['low', 'medium', 'high'], data_df['Arousal_float'])
data_df = data_df.filter(['ID', 'Valence_float', 'Arousal_float', 'Valence_class', 'Arousal_class'])

data_df.head(10)

Unnamed: 0,ID,Valence_float,Arousal_float,Valence_class,Arousal_class
0,Animals_001_h,2.57,6.44,low,high
1,Animals_002_v,6.24,6.68,medium,high
2,Animals_003_h,5.24,5.52,medium,medium
3,Animals_004_v,4.5,7.02,low,high
4,Animals_005_h,5.31,5.82,medium,high
5,Animals_006_v,5.13,6.23,medium,high
6,Animals_007_h,4.76,7.06,medium,high
7,Animals_008_v,2.63,6.8,low,high
8,Animals_009_v,5.79,5.61,medium,high
9,Animals_010_h,4.59,5.9,low,high


In [5]:
classification_data = data_df.filter(items=['ID', 'Valence_class', 'Arousal_class'])
classification_data.head()

Unnamed: 0,ID,Valence_class,Arousal_class
0,Animals_001_h,low,high
1,Animals_002_v,medium,high
2,Animals_003_h,medium,medium
3,Animals_004_v,low,high
4,Animals_005_h,medium,high


In [6]:
from os import listdir
import time
import datetime
import re
data_base_dir = './data/2018-afcai-spring/'

def read_filenames_regex(regex, dir):
    return list(filter(lambda filename: re.match(regex, filename), listdir(dir)))
    

    
def read_showed_images_file(dir_name):
    dir = data_base_dir + dir_name + '/'
    ts_filename = read_filenames_regex('.*timestamp.csv', dir)
    all_ts_file = pd.read_csv(dir + ts_filename[0], names=['filename', 'datetime'])
    without_trail = all_ts_file.loc[all_ts_file.iloc[:,0].str.contains(r'^((?!trail).)*$')]
    without_trail['image_id'] = without_trail['filename'].map(lambda x: re.search('[a-zA-Z]*_[0-9]*_[a-z]', x).group(0))
    without_trail['timestamp'] = without_trail['datetime'].map(lambda dt: int(datetime.datetime.strptime(dt, "%Y-%m-%d %H:%M:%S.%f").timestamp()*1000))
    images_df = without_trail.filter(['image_id', 'timestamp'])
    images_df['timestamp'] = images_df['timestamp'].map(lambda x: int(x/1000))
    images_df.set_index('timestamp', inplace=True)
    return fill_ts_indexes(images_df).sort_index()

def fill_ts_indexes(df):
    for i in range(min(df.index), max(df.index) + 10):
        if i in df.index:
            x = df.loc[i, 'image_id']
        else:
            df.loc[i] = [x]
    return df.sort_index()    

In [7]:
def read_bitalino_bpm(dir_name):
    dir = data_base_dir + dir_name + '/BITalino/'
    bpm_file = dir + read_filenames_regex('.*BPM.*', dir)[0]
    return pd.read_csv(bpm_file)

def read_bitalino_gsr(dir_name):
    dir = data_base_dir + dir_name + '/BITalino/'
    bpm_file = dir + read_filenames_regex('.*GSR.*', dir)[0]
    return pd.read_csv(bpm_file)

def read_person_data_from(dir_name, process_function):
    if dir_name[0] == 'B':
        gsr = process_function(read_bitalino_gsr(dir_name))
        bpm = process_function(read_bitalino_bpm(dir_name))
    showed_images = read_showed_images_file(dir_name)
    bpm = bpm.rename(columns={'value':'bpm'})
    gsr = gsr.rename(columns={'value':'gsr'})
    person_data = {'images': showed_images, 'bpm': bpm, 'gsr': gsr}
    person_data = drop_redundat_measurements(person_data)
    person_data = person_data['bpm'].join(person_data['gsr']).join(person_data['images'])
    return person_data.join(data_df.set_index('ID'), on='image_id')

    
def drop_redundat_measurements(person_data):
    person_data['bpm'].drop([ts for ts in person_data['bpm'].index.tolist() if ts not in person_data['images'].index.tolist()], inplace=True)
    person_data['gsr'].drop([ts for ts in person_data['gsr'].index.tolist() if ts not in person_data['images'].index.tolist()], inplace=True)
    return person_data

    
def mean_for(millis, ts_val_df):
    ts_val_df['timestamp'] = ts_val_df['timestamp'].map(lambda x: int(x/1000))
    ts_val_df = ts_val_df.groupby(['timestamp']).mean()
    ts_val_df['value'] = ts_val_df[' value'].map(lambda x: int(x))
    return ts_val_df.filter(['value'])
    
def one_sec_mean(ts_val_df):
    return mean_for(1000, ts_val_df)

identity = lambda x: x
    
    
person_data = read_person_data_from('B365', one_sec_mean)

  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [28]:
def normalize(df):
    return (df - df.mean()) / (df.max() - df.min())

# normalize(person_data['bpm']).plot()
normalize(person_data['gsr'].shift(periods=0)).plot()

normalize(person_data['Arousal_float']).plot()
# normalize(person_data['Arousal_float']).plot()

# example data plotted

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1ffb9e2ca20>

In [154]:
# basic learning
person_data['Arousal_low'] = person_data['Arousal_class'].map(lambda x: 1 if (x == 'low') else 0)
person_data['Arousal_medium'] = person_data['Arousal_class'].map(lambda x: 1 if (x == 'medium') else 0)
person_data['Arousal_high'] = person_data['Arousal_class'].map(lambda x: 1 if (x == 'high') else 0)

def process_frame(frame):
    vec = frame['bpm'].tolist()[:10] + frame['gsr'].tolist()[:10]
    return vec


learning_df = person_data.filter(['image_id', 'bpm', 'gsr', 'Arousal_class', 'Valence_class'])

to_predict = 'Valence_class'

X = []
y = []
for image in learning_df['image_id'].unique():
    image_df = learning_df.loc[learning_df['image_id'] == image]
    y.append(image_df.filter([to_predict]).iloc[1][to_predict])
    X.append(process_frame(image_df.filter(['bpm', 'gsr'])))
    
def classify(level):
    if level == 'low':
        return 0
    if level == 'medium':
        return 1
    else:
        return 2
    
y = list(map(classify, y))

In [225]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split, cross_val_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, shuffle=True)

logreg = linear_model.LogisticRegression(C=1e3)

logreg.fit(X_train, y_train)

logreg.score(X_test, y_test)

0.3333333333333333

In [15]:
bitalino_dirs = read_filenames_regex('(B).*', data_base_dir)
bitalino_data = list(map(lambda x: read_person_data_from(x, one_sec_mean) ,bitalino_dirs))

  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [276]:
bitalino_data

Unnamed: 0_level_0,image_id,value
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
1520846027,Landscapes_097_v,507
1520846028,Landscapes_097_v,513
1520846029,Landscapes_097_v,506
1520846030,Landscapes_097_v,510
1520846031,Landscapes_097_v,511
1520846032,Landscapes_097_v,506
1520846033,Landscapes_097_v,509
1520846034,Landscapes_097_v,512
1520846035,Landscapes_097_v,507
1520846036,Landscapes_097_v,513
