# Python Session 1

## We will practice cleaning some Food choice task data

We are going to generate data from 20 individuals to practice our skills. In the task, participants rate 50 foods for healthiness, tastiness and choice. We are simulating this data below.



In [3]:
import pandas as pd # importing python packages
import numpy as np
import random

In [4]:
blocks = ['health', 'taste', 'choice'] # to check what blocks is
blocks

['health', 'taste', 'choice']

In [5]:
# Define blocks and trial structure
blocks = ['health', 'taste', 'choice']
trials_per_block = 75
participants = range(1, 21) # stops at 20

# Generate 50 unique foods
base_foods = [
    'apple', 'banana', 'burger', 'carrot', 'donut', 'eggs', 'fries', 'grapes', 'ice cream', 'kale',
    'pizza', 'yogurt', 'spinach', 'steak', 'candy', 'popcorn', 'mango', 'nuts', 'cheese', 'chicken',
    'broccoli', 'chocolate', 'granola', 'lettuce', 'pasta', 'salmon', 'tofu', 'soda', 'rice', 'beans',
    'cucumber', 'peach', 'bacon', 'cereal', 'toast', 'avocado', 'beef', 'peanut butter', 'cake', 'milk',
    'watermelon', 'pear', 'turkey', 'onion rings', 'oatmeal', 'cranberries', 'syrup', 'waffles', 'cookie', 'shrimp'
]
assert len(base_foods) == 50

# Assign fat and sugar levels randomly
food_properties = {}
for food in base_foods:
    fat = random.choices(['high', 'low'], weights=[0.4, 0.6])[0] # randomly select (40%)high or (60%)low
    sugar = random.choices(['high', 'low'], weights=[0.5, 0.5])[0]
    food_properties[food] = {'fat': fat, 'sugar': sugar}

# Generate trials
all_trials = []

for participant in participants:
    for block in blocks:
        for trial_num in range(1, trials_per_block + 1):
            food = random.choice(base_foods)
            rt_missing = random.random() < 0.02  # 2% chance of missing RT
            reaction_time = None if rt_missing else round(random.uniform(0.5, 4.0), 2) # random reaction time between 0.5 and 4.0, rounded to 2 decimal places
            rating = None if reaction_time is None else random.randint(1, 10) # random selection of rating if they did not miss trial

            fat = food_properties[food]['fat'] # merging data with food
            sugar = food_properties[food]['sugar']

            trial = {
                'participant': participant,
                'block': block,
                'trial_number': trial_num,
                'food': food,
                'reaction_time': reaction_time,
                'rating': rating,
                'fat': fat,
                'sugar': sugar
            }
            all_trials.append(trial) # stacking rows on top of each other

# Create DataFrame
df = pd.DataFrame(all_trials) # converts rows into data frame

# Validate logic: rating is only missing if RT is missing
assert all(df[df['rating'].isna()]['reaction_time'].isna())

The data are stored in a dataframe object, which we have called df
To access items in the dataframe, we need to type "df"

In [6]:
#If we want to see the data, we can just type
df

Unnamed: 0,participant,block,trial_number,food,reaction_time,rating,fat,sugar
0,1,health,1,beans,,,low,low
1,1,health,2,eggs,2.93,3.0,low,high
2,1,health,3,pasta,0.53,8.0,low,low
3,1,health,4,chocolate,3.76,2.0,high,low
4,1,health,5,chocolate,3.13,3.0,high,low
...,...,...,...,...,...,...,...,...
4495,20,choice,71,eggs,1.51,6.0,low,high
4496,20,choice,72,cereal,2.64,1.0,low,high
4497,20,choice,73,toast,1.51,3.0,low,low
4498,20,choice,74,fries,2.34,6.0,low,low


In [7]:
# To see anything in df we will need to reference df first
df.columns

Index(['participant', 'block', 'trial_number', 'food', 'reaction_time',
       'rating', 'fat', 'sugar'],
      dtype='object')

In [8]:
df['block']

Unnamed: 0,block
0,health
1,health
2,health
3,health
4,health
...,...
4495,choice
4496,choice
4497,choice
4498,choice


In [9]:
# We can also look at the values of columns
# All of these will access the food column
df.food
df['food']
df.iloc[0,3]

'beans'

In [10]:
# Try here with RT

In [11]:
# To analyze this data, we will first need to remove any missing trials
# let's find the missing values
df.reaction_time[df.reaction_time.isna()==True]
df.reaction_time.dropna()

Unnamed: 0,reaction_time
1,2.93
2,0.53
3,3.76
4,3.13
5,1.00
...,...
4495,1.51
4496,2.64
4497,1.51
4498,2.34


In [12]:
len(df)

4500

In [13]:
df[df.reaction_time.isna()==False]

Unnamed: 0,participant,block,trial_number,food,reaction_time,rating,fat,sugar
1,1,health,2,eggs,2.93,3.0,low,high
2,1,health,3,pasta,0.53,8.0,low,low
3,1,health,4,chocolate,3.76,2.0,high,low
4,1,health,5,chocolate,3.13,3.0,high,low
5,1,health,6,candy,1.00,6.0,low,high
...,...,...,...,...,...,...,...,...
4495,20,choice,71,eggs,1.51,6.0,low,high
4496,20,choice,72,cereal,2.64,1.0,low,high
4497,20,choice,73,toast,1.51,3.0,low,low
4498,20,choice,74,fries,2.34,6.0,low,low


In [14]:
df.reaction_time[df.reaction_time > 3]

# What would we change to see RTs < 2 only?

Unnamed: 0,reaction_time
3,3.76
4,3.13
9,3.05
10,3.88
12,3.44
...,...
4479,3.09
4482,3.22
4487,3.83
4488,3.51


In [15]:
df.reaction_time[df.reaction_time < 2]


Unnamed: 0,reaction_time
2,0.53
5,1.00
6,0.84
7,1.94
8,1.13
...,...
4481,1.34
4492,1.62
4495,1.51
4497,1.51


In [16]:
# make a new data frame with no missing values
df1 = df[df.reaction_time.isna()==True]

In [17]:
# Now we want to perform some calculations on this data-set
# let's start by summarizing, for one person the health rating

# Filter for participant 1 and the 'health' block
participant_id = 1
health_block = df[(df['participant'] == participant_id) & (df['block'] == 'health')] # taking data frame for one participant, just health data for participant 1

# Remove missing ratings (i.e., where RT was missing)
valid_ratings = health_block['rating'].dropna()

# Calculate the average health rating
average_health_rating = valid_ratings.mean()

print(f"Participant {participant_id}'s average health rating: {average_health_rating:.2f}")


Participant 1's average health rating: 5.45


In [18]:
np.mean(valid_ratings)

np.float64(5.445945945945946)

In [19]:
health_block

Unnamed: 0,participant,block,trial_number,food,reaction_time,rating,fat,sugar
0,1,health,1,beans,,,low,low
1,1,health,2,eggs,2.93,3.0,low,high
2,1,health,3,pasta,0.53,8.0,low,low
3,1,health,4,chocolate,3.76,2.0,high,low
4,1,health,5,chocolate,3.13,3.0,high,low
...,...,...,...,...,...,...,...,...
70,1,health,71,donut,2.81,3.0,low,low
71,1,health,72,milk,1.66,6.0,low,low
72,1,health,73,popcorn,2.92,10.0,low,low
73,1,health,74,pasta,3.82,9.0,low,low


In [20]:
#Try for health only for low and high-fat
for participant_id in range(1, 21):
  for x in ['low', 'high']:
    high_block = df[(df['participant'] == participant_id) & (df['block'] == 'health') & (df['fat'] == x)]
    valid_ratings = high_block['rating'].dropna()
    average_health_rating = valid_ratings.mean()
    print(f"Participant {participant_id}'s average high rating: {average_health_rating:.2f}")

Participant 1's average high rating: 5.69
Participant 1's average high rating: 4.47
Participant 2's average high rating: 5.80
Participant 2's average high rating: 5.79
Participant 3's average high rating: 5.75
Participant 3's average high rating: 4.55
Participant 4's average high rating: 4.62
Participant 4's average high rating: 6.21
Participant 5's average high rating: 5.59
Participant 5's average high rating: 3.88
Participant 6's average high rating: 5.98
Participant 6's average high rating: 7.04
Participant 7's average high rating: 5.05
Participant 7's average high rating: 4.95
Participant 8's average high rating: 5.22
Participant 8's average high rating: 5.63
Participant 9's average high rating: 5.38
Participant 9's average high rating: 5.40
Participant 10's average high rating: 5.33
Participant 10's average high rating: 4.55
Participant 11's average high rating: 6.27
Participant 11's average high rating: 5.53
Participant 12's average high rating: 5.55
Participant 12's average high

In [21]:
#Now let's create a new dataframe and store each persons average RT and rating for high and low fat foods

# Group by participant, block, and fat level
summary_df = (
    df
    .dropna(subset=['rating', 'reaction_time'])  # Exclude trials with missing values
    .groupby(['participant', 'block', 'fat'])
    .agg(
        average_rating=('rating', 'mean'),
        average_reaction_time=('reaction_time', 'mean'),
        trial_count=('rating', 'count')  # Optional: to see how many valid trials per group
    )
    .reset_index()
)

print(summary_df.head()) # head- print first five rows


   participant   block   fat  average_rating  average_reaction_time  \
0            1  choice  high        6.200000               2.158800   
1            1  choice   low        5.312500               2.525833   
2            1  health  high        4.466667               2.012667   
3            1  health   low        5.694915               2.153559   
4            1   taste  high        5.476190               2.063333   

   trial_count  
0           25  
1           48  
2           15  
3           59  
4           21  


In [22]:
# Pivot to wide format - going from long format(six rows) to wide format(one row)
wide_df = summary_df.pivot_table(
    index='participant',
    columns=['block', 'fat'],
    values=['average_rating', 'average_reaction_time']
)

In [23]:
wide_df

Unnamed: 0_level_0,average_rating,average_rating,average_rating,average_rating,average_rating,average_rating,average_reaction_time,average_reaction_time,average_reaction_time,average_reaction_time,average_reaction_time,average_reaction_time
block,choice,choice,health,health,taste,taste,choice,choice,health,health,taste,taste
fat,high,low,high,low,high,low,high,low,high,low,high,low
participant,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
1,6.2,5.3125,4.466667,5.694915,5.47619,6.0,2.1588,2.525833,2.012667,2.153559,2.063333,2.349815
2,5.833333,5.185185,5.791667,5.795918,5.941176,5.685185,1.948889,2.173519,2.200833,2.114286,1.841176,2.339074
3,6.15,5.207547,4.545455,5.75,4.26087,5.173077,2.1075,2.114906,2.470455,2.376731,2.490435,2.263846
4,6.470588,5.333333,6.210526,4.625,5.764706,5.454545,2.108235,2.218947,2.503158,2.183214,2.597647,2.360909
5,6.409091,5.66,3.875,5.586207,4.25,5.296296,2.510455,2.174,2.49375,2.19069,2.5445,2.347778
6,5.8,5.692308,7.041667,5.979592,4.571429,5.310345,2.0565,2.148462,2.394583,2.036122,2.01,2.30431
7,5.0,5.314815,4.95,5.054545,6.12,5.255319,2.183333,2.296111,2.1605,2.140182,2.6192,2.240426
8,5.388889,6.017544,5.62963,5.222222,5.48,5.702128,2.362778,2.247895,2.437778,2.257111,2.1128,2.321277
9,5.863636,5.235294,5.4,5.381818,6.0,5.294118,2.347727,2.262549,2.455,2.142909,2.509545,2.487059
10,6.208333,5.913043,4.545455,5.326923,5.066667,6.517241,2.32375,2.155652,2.296364,2.071731,2.01,2.197586


In [24]:
wide_df.columns # too chaotic, need to condense

MultiIndex([(       'average_rating', 'choice', 'high'),
            (       'average_rating', 'choice',  'low'),
            (       'average_rating', 'health', 'high'),
            (       'average_rating', 'health',  'low'),
            (       'average_rating',  'taste', 'high'),
            (       'average_rating',  'taste',  'low'),
            ('average_reaction_time', 'choice', 'high'),
            ('average_reaction_time', 'choice',  'low'),
            ('average_reaction_time', 'health', 'high'),
            ('average_reaction_time', 'health',  'low'),
            ('average_reaction_time',  'taste', 'high'),
            ('average_reaction_time',  'taste',  'low')],
           names=[None, 'block', 'fat'])

In [25]:
# Step 3: Flatten column names
wide_df.columns = [f'{stat}_{block}_{fat}' for stat, block, fat in wide_df.columns]
wide_df = wide_df.reset_index()

print(wide_df.head())

   participant  average_rating_choice_high  average_rating_choice_low  \
0            1                    6.200000                   5.312500   
1            2                    5.833333                   5.185185   
2            3                    6.150000                   5.207547   
3            4                    6.470588                   5.333333   
4            5                    6.409091                   5.660000   

   average_rating_health_high  average_rating_health_low  \
0                    4.466667                   5.694915   
1                    5.791667                   5.795918   
2                    4.545455                   5.750000   
3                    6.210526                   4.625000   
4                    3.875000                   5.586207   

   average_rating_taste_high  average_rating_taste_low  \
0                   5.476190                  6.000000   
1                   5.941176                  5.685185   
2                   4.2608

In [26]:
wide_df.columns

Index(['participant', 'average_rating_choice_high',
       'average_rating_choice_low', 'average_rating_health_high',
       'average_rating_health_low', 'average_rating_taste_high',
       'average_rating_taste_low', 'average_reaction_time_choice_high',
       'average_reaction_time_choice_low', 'average_reaction_time_health_high',
       'average_reaction_time_health_low', 'average_reaction_time_taste_high',
       'average_reaction_time_taste_low'],
      dtype='object')

In [27]:
# Here try and simulate a different dataset - a monetary choice task where the participant
# selects between an immediate vs delayed reward. Compare the RT between when the participant
# chooses the immediate vs delayed option

In [28]:
# navigate to the directory
data=pd.read_csv("https://raw.githubusercontent.com/CaitlinLloyd/Psychology_Programming2025/refs/heads/main/Data/DelayDisc_example.csv")

In [29]:
data # left is 1 for choice

Unnamed: 0,onset,rt,choice,money_left,delay_left,money_right,delay_right,participant
0,17.009420,4.48,,23.21,131,10.99,51,1
1,28.013655,3.16,1.0,16.43,32,9.99,19,1
2,43.017407,4.14,1.0,38.44,33,32.02,12,1
3,56.021820,4.47,1.0,38.66,100,26.57,24,1
4,71.024792,3.63,1.0,29.54,142,27.76,6,1
...,...,...,...,...,...,...,...,...
115,344.112863,5.42,1.0,30.54,38,37.25,132,2
116,357.117812,4.18,2.0,23.07,82,29.27,140,2
117,368.121754,3.32,1.0,33.18,165,27.61,104,2
118,383.125799,6.00,1.0,6.15,51,14.76,177,2


In [31]:
data['delayed_opt']= "none"
data

Unnamed: 0,onset,rt,choice,money_left,delay_left,money_right,delay_right,participant,delayed_opt_chose,delayed_opt
0,17.009420,4.48,,23.21,131,10.99,51,1,none,none
1,28.013655,3.16,1.0,16.43,32,9.99,19,1,none,none
2,43.017407,4.14,1.0,38.44,33,32.02,12,1,none,none
3,56.021820,4.47,1.0,38.66,100,26.57,24,1,none,none
4,71.024792,3.63,1.0,29.54,142,27.76,6,1,none,none
...,...,...,...,...,...,...,...,...,...,...
115,344.112863,5.42,1.0,30.54,38,37.25,132,2,none,none
116,357.117812,4.18,2.0,23.07,82,29.27,140,2,none,none
117,368.121754,3.32,1.0,33.18,165,27.61,104,2,none,none
118,383.125799,6.00,1.0,6.15,51,14.76,177,2,none,none


In [32]:
# figure out whether left or right column is delayed (1 is left, 2 is right)
data['delayed_opt']= "none"
data.loc[data['delay_left'] < data['delay_right'],'delayed_opt'] =2
data.loc[data['delay_left'] > data['delay_right'],'delayed_opt'] =1

In [33]:
data['delayed_opt_chose']= "none"
data.loc[data['delayed_opt'] == data['choice'],'delayed_opt_chose'] =1
data.loc[data['delayed_opt'] != data['choice'],'delayed_opt_chose'] =0

In [34]:
data[(data['participant'] == 1) & (data['delayed_opt_chose'] == 0)]

Unnamed: 0,onset,rt,choice,money_left,delay_left,money_right,delay_right,participant,delayed_opt_chose,delayed_opt
0,17.00942,4.48,,23.21,131,10.99,51,1,0,1
9,134.04283,5.86,2.0,19.62,150,19.25,27,1,0,1
10,145.047487,4.71,1.0,12.26,37,13.64,75,1,0,2
12,173.055158,5.7,1.0,13.91,11,14.19,94,1,0,2
13,188.061036,3.27,2.0,13.72,66,13.51,9,1,0,1
15,218.071854,3.52,2.0,16.31,132,12.65,21,1,0,1
16,231.077068,4.48,1.0,21.56,16,27.53,154,1,0,2
20,283.094411,4.96,1.0,19.91,84,20.26,159,1,0,2
21,298.098435,5.22,1.0,10.25,42,10.4,178,1,0,2
22,309.101121,4.67,1.0,21.87,89,23.26,168,1,0,2


In [35]:
# Now summarize the RT for each person when they chose delayed vs chose sooner reward
for participant in range(1, 3):
  for x in [1, 0]:
    high_block = data[(data['participant'] == participant) & (data['delayed_opt_chose'] == x)]
    valid_ratings = high_block['rt'].dropna()
    average_RT = valid_ratings.mean()
    print(f"Participant {participant}'s average RT: {average_RT:.2f} for {x} trials")

Participant 1's average RT: 4.42 for 1 trials
Participant 1's average RT: 4.25 for 0 trials
Participant 2's average RT: 4.66 for 1 trials
Participant 2's average RT: 4.91 for 0 trials


In [36]:
## HOMEWORK
## Here calculate the average earnings per person and the number of times they chose delayed vs sooner

## Upload solution to Github

In [45]:
# Create a new column for amount chosen
data['amount_chosen'] = data.apply(
    lambda row: row['money_left'] if row['choice'] == 1 else (row['money_right'] if row['choice'] == 2 else None),
    axis=1
)
display(data.head())

Unnamed: 0,onset,rt,choice,money_left,delay_left,money_right,delay_right,participant,delayed_opt_chose,delayed_opt,amount_chosen
0,17.00942,4.48,,23.21,131,10.99,51,1,0,1,
1,28.013655,3.16,1.0,16.43,32,9.99,19,1,1,1,16.43
2,43.017407,4.14,1.0,38.44,33,32.02,12,1,1,1,38.44
3,56.02182,4.47,1.0,38.66,100,26.57,24,1,1,1,38.66
4,71.024792,3.63,1.0,29.54,142,27.76,6,1,1,1,29.54


In [46]:
for participant in range(1, 3):
    # Calculate average earnings
    average_earnings = data[data['participant'] == participant]['amount_chosen'].dropna().mean()

    # Count the number of times the more vs less delayed option was chosen
    more_delayed_count = data[(data['participant'] == participant) & (data['delayed_opt_chose'] == 1)].shape[0]
    less_delayed_count = data[(data['participant'] == participant) & (data['delayed_opt_chose'] == 0)].shape[0]

    print(f"Participant {participant}: Average earnings = {average_earnings:.2f}, Delayed option chosen = {more_delayed_count}, Sooner option chosen = {less_delayed_count}")

Participant 1: Average earnings = 23.82, Delayed option chosen = 35, Sooner option chosen = 25
Participant 2: Average earnings = 22.11, Delayed option chosen = 33, Sooner option chosen = 27


## Extra
## These are hard exercises - not homework, for extra practice

In [None]:
# Hard

# Here simulate your own Delay Discounting Task and calculate some average metrics

In [None]:
# Very hard
# One outcome of interest is the discount rate, k, which denotes extent to which someone discounts
# value of delayed rewards (higher values = less patient)

# Here you can use chatGPT to get the formula for k - see whether you can calculate for each person
# in your dataset