# Peripheral nervous system responses to food stimuli: Analysis using data mining

# Data preprocessing

## Data extraction

In [1]:
import pandas as pd
# Load the 'Psychophysio' sheet of the workbook into a DataFrame.
data = pd.read_excel(
    'PsychophysioDataset.xlsx',
    sheet_name='Psychophysio',
)


In [2]:
# Preview the first rows of the raw sheet.
data.head()

Unnamed: 0,Subject,Stimulus,Latency,Rise-Time,Amplitude,Events,Pleasantness
0,1,1,2.980469,3.011719,0.197369,1.0,5
1,1,2,1.886719,6.46875,0.911881,2.0,4
2,1,3,2.085938,4.746094,0.273287,1.0,0
3,1,4,1.699219,4.515625,0.618691,-2.0,5
4,1,5,0.800781,3.4375,0.083031,0.0,3


## Data selection

In [3]:
# Keep only the relevant columns ('Stimulus' is not carried over).
# .copy() makes df independent of the raw `data` frame.
relevant_columns = [
    'Subject',
    'Latency',
    'Rise-Time',
    'Amplitude',
    'Events',
    'Pleasantness',
]
df = data[relevant_columns].copy()

In [4]:
# Keep only the usable rows: unrated trials and rows with missing values are
# removed.

# The ratings are expected to lie between 1 and 5.
# A pleasantness rating equal to 0 means the participant has not scored.
# NOTE(review): the df.head() preview further down shows a rating of 6, so the
# stated 1-5 range should be confirmed against the experiment protocol.
rated = df['Pleasantness'] != 0

# Remove every ROW that has a NaN in any remaining column (no column is
# dropped). The trailing .copy() yields an independent frame so that adding
# columns later does not trigger pandas' SettingWithCopyWarning.
df = df[rated].dropna().copy()


## Discretization

In [5]:
from sklearn.cluster import KMeans

K = 2  # number of wanted clusters

# Class names ordered from the lowest-rated cluster to the highest-rated one.
class_names = ['unpleasant', 'pleasant']

# Per-subject lists of labels, in ascending subject order.
discrete_values = []

# Labels are stored against the row index of df, so each label lands on the
# row it was computed from, however the rows are ordered or grouped.
pleasantness_class = pd.Series(index=df.index, dtype=object)

# A separate 1-D k-means is fitted per subject, so the pleasant/unpleasant
# split is relative to that subject's own ratings. Grouping the filtered df
# (rather than the raw data) skips subjects whose rows were all removed.
for subject, ratings in df.groupby('Subject')['Pleasantness']:
    if ratings.nunique() < K:
        raise ValueError(
            f"Subject {subject} has fewer than {K} distinct pleasantness "
            "ratings; cannot split them into clusters."
        )

    # kmeans (expects a 2-D array: one row per rating, one feature)
    points = ratings.to_numpy().reshape(-1, 1)
    kmeans = KMeans(n_clusters=K, random_state=0).fit(points)
    sol = kmeans.predict(points)

    # labeling: cluster ids are arbitrary, so clusters are named by the rank
    # of their centre (lowest centre -> 'unpleasant').
    ranked_clusters = kmeans.cluster_centers_.ravel().argsort()
    d = {int(cluster): name
         for cluster, name in zip(ranked_clusters, class_names)}
    labels = [d[int(cluster)] for cluster in sol]

    discrete_values.append(labels)
    pleasantness_class.loc[ratings.index] = labels

# add new discrete values to the data (aligned on the row index)
df['Pleasantness_class'] = pleasantness_class

In [6]:
# Preview the filtered data with the new 'Pleasantness_class' column.
df.head()

Unnamed: 0,Subject,Latency,Rise-Time,Amplitude,Events,Pleasantness,Pleasantness_class
0,1,2.980469,3.011719,0.197369,1.0,5,pleasant
1,1,1.886719,6.46875,0.911881,2.0,4,unpleasant
3,1,1.699219,4.515625,0.618691,-2.0,5,pleasant
4,1,0.800781,3.4375,0.083031,0.0,3,unpleasant
5,1,0.0,0.0,0.0,0.0,6,pleasant


## Results

In [7]:
# Check whether the two classes are balanced by counting the rows per label.
import collections
collections.Counter(df['Pleasantness_class'])

# Almost as many "pleasant" elements as "unpleasant" ones.

Counter({'pleasant': 1130, 'unpleasant': 1148})

In [8]:
# Data selection: keep the four response features and the class label;
# 'Subject' and the raw 'Pleasantness' rating are left out of the export.
output_columns = [
    'Latency',
    'Rise-Time',
    'Amplitude',
    'Events',
    'Pleasantness_class',
]
df = df[output_columns]

# Save to a comma-separated file, without writing the row index.
df.to_csv('PsychophysioPreprocessed.csv', sep=',', index=False)

In [9]:
# Preview the table that was written to the CSV file.
df.head()

Unnamed: 0,Latency,Rise-Time,Amplitude,Events,Pleasantness_class
0,2.980469,3.011719,0.197369,1.0,pleasant
1,1.886719,6.46875,0.911881,2.0,unpleasant
3,1.699219,4.515625,0.618691,-2.0,pleasant
4,0.800781,3.4375,0.083031,0.0,unpleasant
5,0.0,0.0,0.0,0.0,pleasant
