# Practicals for lecture 1.2

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vigji/python-cimec-2025/blob/main/practicals/Practicals_1.2.ipynb)

#### 1.1.3 

In [None]:
# Go back to the RT data. Assume that w consider outliers the RTs longer than 0.7 seconds. 
# Compute again the mean RTs after having excluded such outliers, but make sure you use only vector operations!

# (Hint: an easy way to exclude outliers without using loops is by setting nans in the matrix)


In [None]:
# Find the index of the subject with the shortest trial reaction time of the whole dataset 
# (not shortest average!)
# (Hint: you will need two operations...)



In [None]:
# Use argmax to find the index of the warmest hour in the (non-reshaped) temperature_array.
# Then, use the index over timestamps_array to read out the corresponding timestamp.
timestamps_array, temperatures_array = download_meteo_data()

In [None]:
# (Bonus: we did not do this in class!)
# We can use the np.argsort() function to produce the indexes array required to
# order an array in ascending or descending values.

# For example:
random_arr = np.array([0.1, 5, 3.4, 2.3])
ordering_idxs = np.argsort(random_arr)
random_arr[ordering_idxs]  # with this index, this is now ordered!

# Let's make a ranking of the 5 warmest hours of 2022! 
# Sort the (non-reshaped) temperature array using the indexes produced by np.argsort.
# so that the first elements are the highest temperatures.
# Then sort the imestamps array with the same indexes, and take the first 5.
#
# Double check you match the result that you have got in the exercises above!

#### 1.2.0 

In [2]:
import numpy as np
import requests
import json


def generate_RT_data(n_subjects=200, n_samples_per_subject=1000):
    """
    Generates Reaction Time data for a given number of subjects, each with their own distribution parameters.
    
    Parameters:
    n_subjects (int): Number of subjects
    n_samples_per_subject (int): Number of samples (RT times) per subject
    
    Returns:
    np.ndarray: A 2D array where each row represents the RT times for a subject
    """
    np.random.seed(0)  # For reproducibility
    shift = 0.500  # Shift of the distribution
    # Initialize an empty array to store the RT times for all subjects
    RT_data = np.empty((n_subjects, n_samples_per_subject))
    
    for i in range(n_subjects):
        # Assuming mu ranges from 90 to 110 and sigma from 10 to 20 for the subjects
        mu = np.random.uniform(0.090, 0.110)
        sigma = np.random.uniform(0.10, 0.20)
        RT_data[i] = np.random.normal(mu, sigma, n_samples_per_subject) + shift
    
    return RT_data


def download_meteo_data(start_date="2022-01-01", end_date="2022-12-31",
                        latitude="45.88204", longitude="11.03647",
                        data="temperature_2m"):
    """Download meteo historical data from open-meteo.com.
    """
    BASE_URL = "https://archive-api.open-meteo.com/v1/"
    query = f"archive?latitude={latitude}&longitude={longitude}&start_date={start_date}&end_date={end_date}&hourly={data}"

    r = requests.get(BASE_URL + query)
    json_dict = json.loads(r.text)
    
    if "hourly" not in json_dict.keys():
        print(json_dict)
        return None, None
    else:
        return (np.array(json_dict["hourly"][k]) for k in ["time", data])

In [30]:
# Find the index of the subject with the shortest trial reaction time of the whole dataset 
# (not shortest average!)
# (Hint: you will need two operations...)
# (You can read a second hint scrolling right in the cell:                                                                                               : you have to do one max and one argmax...)

rt_data = generate_RT_data(n_subjects=10)
rt_data.shape

np.argmin(np.min(rt_data, axis=1))

1

In [37]:
# Use argmax to find the index of the warmest hour in the (non-reshaped) temperature_array.
# Then, use the index over timestamps_array to read out the corresponding timestamp.
timestamps_array, temperatures_array = download_meteo_data()

warmest_index = np.argmax(temperatures_array)
timestamps_array[warmest_index]

'2022-07-22T13:00'

In [43]:
# We can use the np.argsort() function to produce the indexes array required to
# order an array in ascending or descending values.

# Let's make a ranking of the 5 warmest hours of 2022! 
# Sort the (non-reshaped) temperature array using the indexes produced by np.argsort.
# so that the first elements are the highest temperatures.
# Then sort the timestamps array with the same indexes, and take the first 5.
sorted_by_temp_indexes = np.argsort(temperatures_array)
timestamps_array[sorted_by_temp_indexes][-5:]

array(['2022-07-22T11:00', '2022-07-25T13:00', '2022-07-26T12:00',
       '2022-07-22T12:00', '2022-07-22T13:00'], dtype='<U16')

In [None]:
# Build a boolean selector to filter all temperatures above 10 and below 25 degrees:
selector = (temperatures_array > 10) & (temperatures_array < 25)

In [62]:
# (Bonus):
# Let's do the same, but only for the months between january and March, and only for hours between 08 and 18.

# To get the condition on month and hours, you will have to parse the timestamp string, or explore
# the timedate library for more elegant solutions!
from datetime import datetime

timestamps_list = [datetime.fromisoformat(t) for t in timestamps_array]

hours_array = np.array([timestamp.hour for timestamp in timestamps_list])
month_array = np.array([timestamp.month for timestamp in timestamps_list])

selector = (temperatures_array > 10) & (temperatures_array < 25) & \
            (hours_array > 8) & (hours_array < 18) & \
             (month_array < 3)
selector

array([False, False, False, ..., False, False, False])

In [44]:
# (Bonus) Take the array of integer numbers below. Use array boolean operations to filter out the numbers that 
# are greater than 5 AND less than 8, OR that are multiple of 7.
np.random.seed(42)
an_array = np.random.randint(0, 10, 100)

((an_array >  5) & (an_array < 8)) | (an_array % 7 == 0) 

## Introduction to `pandas`

#### 1.2.1 DataFrames

In [45]:
import pandas as pd

In [46]:
# Consider the following dataset with info about subject in an experiment:
np.random.seed(42)
n_subjects = 100
subjects_df = pd.DataFrame({
    'age': np.random.randint(20, 40, n_subjects),
    'weight': np.random.randint(50, 100, n_subjects),
    'height': np.random.randint(150, 200, n_subjects),
    'sex': np.random.choice(['M', 'F'], n_subjects),
    'handedness': np.random.choice(['R', 'L'], n_subjects),
    'group': np.random.choice(['control', 'patient'], n_subjects)})

subjects_df.head()

Unnamed: 0,age,weight,height,sex,handedness,group
0,26,84,175,M,L,patient
1,39,93,181,M,R,patient
2,34,89,155,F,L,patient
3,30,71,181,M,R,patient
4,27,76,153,F,L,control


In [47]:
# Select the first two rows of the dataframe:
subjects_df[:2]

Unnamed: 0,age,weight,height,sex,handedness,group
0,26,84,175,M,L,patient
1,39,93,181,M,R,patient


In [48]:
# Select the column of the dataframe containing the subjects weight using the name of the column:
subjects_df["weight"]

0     84
1     93
2     89
3     71
4     76
      ..
95    82
96    50
97    68
98    51
99    93
Name: weight, Length: 100, dtype: int64

In [51]:
# Select the `weight` column of the dataframe filtering only rows of subjects > 34 years old:
subjects_df.loc[subjects_df["age"] > 34, "weight"]

1     93
6     50
19    64
21    75
24    81
26    98
29    79
33    94
36    78
39    81
47    77
49    93
56    88
62    52
67    58
69    82
82    51
83    52
98    51
Name: weight, dtype: int64

In [55]:
# Count how many males and how many females above age 30 are left-handed or right handed.
# (Hint: use the same element-wise operators we were using for numpy arrays)

for sex in "M", "F":
    for handedness in "L", "R":
        boolean_selector = (subjects_df["sex"] == sex) & (subjects_df["handedness"] == handedness)
        count = sum(boolean_selector)
        print(f"For sex {sex} {count} subjects are of handedness {handedness}")
    

For sex M 27 subjects are of handedness L
For sex M 23 subjects are of handedness R
For sex F 29 subjects are of handedness L
For sex F 21 subjects are of handedness R


In [None]:
# Use .iloc to select one every two rows for the first 3 columns:
subjects_df.iloc[::2, :3]

In [54]:
# Redefine the dataset index to be "subject_n_sex" 
# (where n is progressive number of the subject and sex the sex of the subject)

subjects_df.index = [f"subject_{n}_{sex}" for n, sex in enumerate(subjects_df["sex"])]
subjects_df

Unnamed: 0,age,weight,height,sex,handedness,group
subject_0_M,26,84,175,M,L,patient
subject_1_M,39,93,181,M,R,patient
subject_2_F,34,89,155,F,L,patient
subject_3_M,30,71,181,M,R,patient
subject_4_F,27,76,153,F,L,control
...,...,...,...,...,...,...
subject_95_M,31,82,170,M,L,patient
subject_96_F,21,50,197,F,L,control
subject_97_F,20,68,169,F,R,patient
subject_98_M,35,51,157,M,R,control
