In [434]:
%pip install numpy
%pip install pandas plotly


import pandas as pd
from datetime import datetime

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [435]:
# Import necessary libraries
import pandas as pd
import plotly.express as px

# Load the CSV data assuming 'data.csv' is your filename
data = pd.read_csv('122.csv')

# Step 1: Drop the first row (index 0 since we count from 0)
data = data.drop(index=0).reset_index(drop=True)
data = data.drop(index=0).reset_index(drop=True)

# Task 1: Count the number of participants in different groups
group_count = data['Group'].value_counts().reset_index()
group_count.columns = ['Group', 'count']  # Rename columns for clarity

# Create a bar plot for the number of participants by group
fig = px.bar(group_count, x='Group', y='count', color='Group',
             title="Number of Participants by Group",
             labels={"Group": "Group", "count": "Number of Participants"},
             text='count')  # Add values on top of bars

# Update layout to display values on top of bars
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()


In [436]:


# Task 2: Count how many participants are associated with each chart type
chart_type_count = data['ChartType'].value_counts().reset_index()
chart_type_count.columns = ['ChartType', 'count']  # Rename columns for clarity

# Create a bar plot of the ChartType counts
fig = px.bar(chart_type_count, x='ChartType', y='count', color='ChartType',
             title="Number of Participants by Chart Type",
             labels={"ChartType": "Chart Type", "count": "Number of Participants"},
             text='count')  # Add values on top of bars

# Update layout to display values on top of bars
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()


In [437]:
# Task 3: Count the number of participants by Chart Type and Group
grouped_data = data.groupby(['ChartType', 'Group']).size().reset_index(name='count')

# Define the desired order for the Chart Types
desired_order = ['StackedArea', 'Waterfall', 'ButterflyChart', 'BubbleChart', 
                 'Histogram', 'BarChart', 'Heatmap', 'Sunburst']

# Ensure 'ChartType' in the DataFrame follows the desired order
grouped_data['ChartType'] = pd.Categorical(grouped_data['ChartType'], categories=desired_order, ordered=True)
grouped_data = grouped_data.sort_values('ChartType')

# Create a bar plot, grouping by ChartType and coloring by Group
fig = px.bar(grouped_data, x='ChartType', y='count', color='Group',
             title="Number of Participants by Chart Type and Group",
             labels={"ChartType": "Chart Type", "count": "Number of Participants", "Group": "Group"},
             barmode='group',  # 'group' makes bars side by side for each ChartType
             text='count')  # Add values on top of bars

# Update layout for better appearance of text on top of groups
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()

In [438]:
# Calculate dominant VARK modality for each participant
vark_types = ['V', 'A', 'R', 'K']
vark_columns = [f'VARK {i}' for i in range(1, 17)]

# Function to determine the dominant VARK preference
def find_dominant_vark(row):
    vark_counts = {v: 0 for v in vark_types}
    for col in vark_columns:
        if col in row and pd.notna(row[col]):
            vark_values = str(row[col]).split(',')
            for v in vark_values:
                v = v.strip().upper()
                if v in vark_counts:
                    vark_counts[v] += 1
    max_count = max(vark_counts.values())
    dominant_varks = [v for v, count in vark_counts.items() if count == max_count]
    return dominant_varks[0] if dominant_varks else ''

# Apply the function to each row to compute dominant VARK preference
data['VARK'] = data.apply(find_dominant_vark, axis=1)

# Step 2: Visualization of VARK Learning Preferences

# Count the number of participants for each VARK learning preference
vark_count = data['VARK'].value_counts().reset_index()
vark_count.columns = ['VARK', 'count']  # Rename columns for clarity

# Create a bar plot of the VARK counts
fig = px.bar(vark_count, x='VARK', y='count', color='VARK',
             title="Number of Participants by VARK Learning Preference",
             labels={"VARK": "VARK Learning Preference", "count": "Number of Participants"},
             text='count')  # Add values on top of bars

# Update layout to display values on top of bars
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()

In [439]:
import pandas as pd
import plotly.express as px



# Correctly parse the 'Age' column into numeric values
data['Age'] = pd.to_numeric(data['Age'], errors='coerce')

# Convert the 'Gender' representation from numeric to categorical
data['Gender'] = data['Gender'].replace({'1': 'Male', '2': 'Female'})

# Define new age bins and labels
age_bins = [-float('inf'), 18, 30, 45, float('inf')]
age_labels = ['<18', '19-30', '30-45', '>45']

# Create 'AgeRange' based on the new bins
data['AgeRange'] = pd.cut(data['Age'], bins=age_bins, labels=age_labels)

# Group the data by 'AgeRange' and 'Gender', then count occurrences
grouped_data = data.groupby(['AgeRange', 'Gender']).size().reset_index(name='count')

# Create a bar plot with the updated age ranges
fig = px.bar(grouped_data, x='AgeRange', y='count', color='Gender',
             title="Number of Participants by Age Range and Gender",
             labels={"AgeRange": "Age Range", "count": "Number of Participants", "Gender": "Gender"},
             barmode='group')  # 'group' places bars for Male and Female side by side

# Show the plot
fig.show()





In [440]:
# Ensure the StartDate and EndDate columns are in datetime format
data['StartDate'] = pd.to_datetime(data['StartDate'])
data['EndDate'] = pd.to_datetime(data['EndDate'])

# Calculate the time difference in minutes
data['TimeSpent'] = (data['EndDate'] - data['StartDate']).dt.total_seconds() / 60  # Convert to minutes

# Group participants based on the time spent
def categorize_time(minutes):
    if minutes < 40:
        return '<40 min'
    elif 40 <= minutes <= 60:
        return '40-60 min'
    else:
        return '>60 min'

data['TimeCategory'] = data['TimeSpent'].apply(categorize_time)

# Now, we have a 'TimeCategory' column that groups participants by time spent
# To see the counts of each category:
time_category_counts = data['TimeCategory'].value_counts().reset_index()
time_category_counts.columns = ['TimeCategory', 'Count']

fig = px.bar(time_category_counts, x='TimeCategory', y='Count', 
             title="Time Spent by Participants", 
             labels={'TimeCategory': 'Time Category', 'Count': 'Number of Participants'},
             color='TimeCategory', color_discrete_sequence=px.colors.qualitative.Set1)

# Show the plot
fig.show()

In [441]:
group_1_data = data[data['Group'] == "1"]
group_2_data = data[data['Group'] == "2"]

def count_A_B_per_question(df, columns):
    # Create a dictionary to store counts for each question type
    counts = {col: {'Analogy': 0, 'Baseline': 0} for col in columns}
    
    for col in columns:
        counts[col]['Analogy'] = df[col].str.contains('A', na=False).sum()
        counts[col]['Baseline'] = df[col].str.contains('B', na=False).sum()
    
    return counts

# Columns for Group 1 (Understanding-AB, Effort-AB, Effectiveness-AB, Preference-AB)
group_1_columns = ['Understanding', 'MentalEffort', 'Effectiveness', 'Engagement']

# Count A/B for each question in Group 1
group_1_counts = count_A_B_per_question(group_1_data, group_1_columns)

# Prepare the data for plotting (convert counts into a DataFrame)
plot_data_group_1 = []

for question, counts in group_1_counts.items():
    plot_data_group_1.append({'Question': question, 'Response': 'Analogy', 'Count': counts['Analogy']})
    plot_data_group_1.append({'Question': question, 'Response': 'Baseline', 'Count': counts['Baseline']})

group_1_plot_df = pd.DataFrame(plot_data_group_1)

# Plot the bar chart for Group 1, showing Analogy vs Baseline for each question
fig_group_1 = px.bar(group_1_plot_df, x='Question', y='Count', color='Response', barmode='group',
                     title="Analogy vs Baseline Responses for Group 1",
                     labels={'Question': 'Question Type', 'Count': 'Number of Responses', 'Response': 'Response Type'},
                     color_discrete_map={'Analogy': 'blue', 'Baseline': 'red'})

fig_group_1.update_layout(xaxis_title="Question Type", yaxis_title="Count", barmode='group')

# Show the plot for Group 1
fig_group_1.show()

In [442]:
# Columns for Group 2 (Understanding-BA, Effort-BA, Effectiveness-BA, Preference-BA)
group_2_columns = ['Understanding', 'MentalEffort', 'Effectiveness', 'Engagement']

# Count A/B for each question in Group 2
group_2_counts = count_A_B_per_question(group_2_data, group_2_columns)

# Prepare the data for plotting (convert counts into a DataFrame)
plot_data_group_2 = []

for question, counts in group_2_counts.items():
    plot_data_group_2.append({'Question': question, 'Response': 'Analogy', 'Count': counts['Analogy']})
    plot_data_group_2.append({'Question': question, 'Response': 'Baseline', 'Count': counts['Baseline']})

group_2_plot_df = pd.DataFrame(plot_data_group_2)

# Plot the bar chart for Group 2, showing Analogy vs Baseline for each question
fig_group_2 = px.bar(group_2_plot_df, x='Question', y='Count', color='Response', barmode='group',
                     title="Analogy vs Baseline Responses for Group 2",
                     labels={'Question': 'Question Type', 'Count': 'Number of Responses', 'Response': 'Response Type'},
                     color_discrete_map={'Analogy': 'blue', 'Baseline': 'red'})

fig_group_2.update_layout(xaxis_title="Question Type", yaxis_title="Count", barmode='group')

# Show the plot for Group 2
fig_group_2.show()

In [443]:
# Stacked Bar chart for both groups combined
# Function to count 'A' and 'B' responses for each question
def count_A_B_per_question(df, columns):
    counts = {col: {'Analogy': 0, 'Baseline': 0} for col in columns}
    for col in columns:
        counts[col]['Analogy'] = df[col].str.contains('A', na=False).sum()
        counts[col]['Baseline'] = df[col].str.contains('B', na=False).sum()
    return counts

# Define columns for Group 1 and Group 2 (assuming the same here for simplicity)
columns = ['Understanding', 'MentalEffort', 'Effectiveness', 'Engagement']

# Count A/B for each question in Group 1 and Group 2
group_1_counts = count_A_B_per_question(data[data['Group'] == "1"], columns)
group_2_counts = count_A_B_per_question(data[data['Group'] == "2"], columns)

# Prepare combined data for plotting
plot_data_combined = []

def prepare_plot_data(group_counts, group_name):
    for question, counts in group_counts.items():
        plot_data_combined.append({'Question': question, 'Response': 'Analogy', 'Count': counts['Analogy'], 'Group': group_name})
        plot_data_combined.append({'Question': question, 'Response': 'Baseline', 'Count': counts['Baseline'], 'Group': group_name})

prepare_plot_data(group_1_counts, 'Group 1')
prepare_plot_data(group_2_counts, 'Group 2')

combined_plot_df = pd.DataFrame(plot_data_combined)

# Plot the horizontal stacked bar chart
fig = px.bar(combined_plot_df, y='Question', x='Count', color='Response', orientation='h', 
             title="Analogy vs Baseline Responses for Both Groups",
             labels={'Question': 'Question Type', 'Count': 'Number of Responses', 'Response': 'Response Type'},
             color_discrete_map={'Analogy': 'blue', 'Baseline': 'red'})

# Update layout to make the chart horizontal stacked
fig.update_layout(barmode='stack', yaxis={'categoryorder':'total descending'})

# Show the plot
fig.show()



In [444]:
import plotly.express as px

# Assuming 'data' is your DataFrame containing the ratings
# Ensure the ratings columns are numeric
data['Interpretation_1'] = pd.to_numeric(data['Interpretation_1'], errors='coerce')
data['Context_1'] = pd.to_numeric(data['Context_1'], errors='coerce')
data['DataRelationship_1'] = pd.to_numeric(data['DataRelationship_1'], errors='coerce')

# Define all possible ratings
all_ratings = range(1, 8)  # Ratings from 1 to 7 inclusive

# Calculate counts of each rating for each variable
interpretation_counts = data['Interpretation_1'].value_counts().sort_index().reindex(all_ratings, fill_value=0)
context_counts = data['Context_1'].value_counts().sort_index().reindex(all_ratings, fill_value=0)
data_relationship_counts = data['DataRelationship_1'].value_counts().sort_index().reindex(all_ratings, fill_value=0)

# Create a DataFrame with these counts
counts_df = pd.DataFrame({
    'Interpretation': interpretation_counts,
    'Context': context_counts,
    'Data Relationship': data_relationship_counts
}).transpose().reset_index().rename(columns={'index': 'Variable'})

# Reshape the DataFrame for plotting
long_df = counts_df.melt(id_vars=['Variable'], value_vars=[1, 2, 3, 4, 5, 6, 7],
                         var_name='Rating', value_name='Count')

# Ensure 'Rating' is treated as a categorical variable with the correct order
long_df['Rating'] = long_df['Rating'].astype(int).astype(str)
long_df['Rating'] = pd.Categorical(long_df['Rating'], categories=[str(i) for i in range(1, 8)], ordered=True)

# Define a color sequence with 7 distinct colors
color_sequence = px.colors.qualitative.Pastel  # Or any other color sequence with 7 colors

# Create the stacked bar chart
fig = px.bar(long_df, x='Variable', y='Count', color='Rating',
             title="Strongly Disagree (1) - Strongly Agree (7)",
             labels={'Variable': '', 'Count': 'Number of Ratings'},
             width=800, height=600,
             color_discrete_sequence=color_sequence)

fig.update_layout(barmode='stack')

# Show the figure
fig.show()


In [445]:
import pandas as pd
import plotly.express as px

# Assuming 'data' is your DataFrame already loaded with relevant data

# Step 1: Construct the dataset with the required columns
processed_data = []

for index, row in data.iterrows():
    participant_id = row['PROLIFIC_PID']
    group = row['Group']

    # Ensure numeric conversion with error handling
    comprehension_a = pd.to_numeric(row['Comprehension-A_12'], errors='coerce')
    understanding_a = pd.to_numeric(row['Understanding-A_12'], errors='coerce')
    confidence_a = pd.to_numeric(row['Confidence-A_12'], errors='coerce')
    comprehension_b = pd.to_numeric(row['Comprehension-B_12'], errors='coerce')
    understanding_b = pd.to_numeric(row['Understanding-B_12'], errors='coerce')
    confidence_b = pd.to_numeric(row['Confidence-B_12'], errors='coerce')

    # Analogy data (denoted with 'A')
    analogy_entry = {
        'ParticipantID': participant_id,
        'VisualizationTechnique': 'Analogy',
        'Order': 'first' if (group == '1') else 'second',
        'Comprehension': comprehension_a,
        'Understanding': understanding_a,
        'Confidence': confidence_a
    }
    processed_data.append(analogy_entry)

    # Baseline data (denoted with 'B')
    baseline_entry = {
        'ParticipantID': participant_id,
        'VisualizationTechnique': 'Baseline',
        'Order': 'first' if (group == '2') else 'second',
        'Comprehension': comprehension_b,
        'Understanding': understanding_b,
        'Confidence': confidence_b
    }
    processed_data.append(baseline_entry)

# Create a DataFrame
processed_df = pd.DataFrame(processed_data)

print(processed_df.head())

# Ensure all relevant columns are numeric
processed_df[['Comprehension', 'Understanding', 'Confidence']] = processed_df[['Comprehension', 'Understanding', 'Confidence']].apply(pd.to_numeric)

# Step 2: Calculate mean and standard deviation for each visualization technique
# Manually calculate mean and std for comprehension, understanding, and confidence per technique
mean_std_comprehension = processed_df.groupby('VisualizationTechnique')['Comprehension'].agg(['mean', 'std']).reset_index()
mean_std_understanding = processed_df.groupby('VisualizationTechnique')['Understanding'].agg(['mean', 'std']).reset_index()
mean_std_confidence = processed_df.groupby('VisualizationTechnique')['Confidence'].agg(['mean', 'std']).reset_index()

# Print the results
print("Mean and Standard Deviation for Comprehension by VisualizationTechnique:")
print(mean_std_comprehension)
print("\nMean and Standard Deviation for Understanding by VisualizationTechnique:")
print(mean_std_understanding)
print("\nMean and Standard Deviation for Confidence by VisualizationTechnique:")
print(mean_std_confidence)

# Print the results
print("Mean and Standard Deviation for Each Visualization Technique:")
# print(mean_std_output)

# Step 3: Create boxplots for each of the perceptions, side by side for Analogy and Baseline
attributes = ['Comprehension', 'Understanding', 'Confidence']
for attr in attributes:
    fig = px.box(processed_df, x='VisualizationTechnique', y=attr, color='VisualizationTechnique',
                 title=f'Boxplot for {attr}',
                 labels={attr: attr, 'VisualizationTechnique': 'Technique'},
                 color_discrete_map={'Analogy': 'blue', 'Baseline': 'red'})
    fig.show()

              ParticipantID VisualizationTechnique   Order  Comprehension  \
0  60f47600a931759ef3d1a505                Analogy  second              6   
1  60f47600a931759ef3d1a505               Baseline   first              3   
2  6604674670aaf5b1fd5fa623                Analogy  second              4   
3  6604674670aaf5b1fd5fa623               Baseline   first              5   
4  66d70f8a63fd78797e0776e1                Analogy  second              5   

   Understanding  Confidence  
0              6           6  
1              5           4  
2              4           2  
3              5           5  
4              3           5  
Mean and Standard Deviation for Comprehension by VisualizationTechnique:
  VisualizationTechnique      mean       std
0                Analogy  5.426230  1.184738
1               Baseline  5.106557  1.401342

Mean and Standard Deviation for Understanding by VisualizationTechnique:
  VisualizationTechnique      mean       std
0                Analogy

In [446]:
import pandas as pd
import plotly.express as px

# Assuming 'data' is your DataFrame already loaded with relevant data
# Construct the dataset with the required columns
perception_data = []

for index, row in data.iterrows():
    participant_id = row['PROLIFIC_PID']
    group = row['Group']

    # Collecting perception responses, ensuring they are cast to numeric
    try:
        analogy_entry = {
            'ParticipantID': participant_id,
            'VisualizationTechnique': 'Analogy',
            'Order': 'first' if (group == '1') else 'second',
            'Comprehension': float(row['Comprehension-A_12']),
            'Understanding': float(row['Understanding-A_12']),
            'Confidence': float(row['Confidence-A_12'])
        }
        baseline_entry = {
            'ParticipantID': participant_id,
            'VisualizationTechnique': 'Baseline',
            'Order': 'first' if (group == '2') else 'second',
            'Comprehension': float(row['Comprehension-B_12']),
            'Understanding': float(row['Understanding-B_12']),
            'Confidence': float(row['Confidence-B_12'])
        }
        perception_data.extend([analogy_entry, baseline_entry])
    except ValueError:
        print(f"Non-numeric data detected for participant {participant_id}. Skipping this participant.")

# Create a DataFrame
perception_df = pd.DataFrame(perception_data)

# Ensure all relevant columns are numeric
perception_df[['Comprehension', 'Understanding', 'Confidence']] = perception_df[['Comprehension', 'Understanding', 'Confidence']].apply(pd.to_numeric, errors='coerce')

# Calculate mean and std for each perception factor by technique and order
mean_std_comprehension = perception_df.groupby(['VisualizationTechnique', 'Order'])['Comprehension'].agg(['mean', 'std']).reset_index()
mean_std_understanding = perception_df.groupby(['VisualizationTechnique', 'Order'])['Understanding'].agg(['mean', 'std']).reset_index()
mean_std_confidence = perception_df.groupby(['VisualizationTechnique', 'Order'])['Confidence'].agg(['mean', 'std']).reset_index()

# Print the results
print("Mean and Standard Deviation for Comprehension by Visualization Technique and Order:")
print(mean_std_comprehension)
print("\nMean and Standard Deviation for Understanding by Visualization Technique and Order:")
print(mean_std_understanding)
print("\nMean and Standard Deviation for Confidence by Visualization Technique and Order:")
print(mean_std_confidence)

# Create boxplots for each of the perception metrics, showing both order and technique
perception_attributes = ['Comprehension', 'Understanding', 'Confidence']
for attr in perception_attributes:
    fig = px.box(perception_df, x='Order', y=attr, color='VisualizationTechnique',
                 title=f'Boxplot for {attr} by Visualization Technique and Order',
                 labels={attr: attr, 'Order': 'Order', 'VisualizationTechnique': 'Technique'},
                 color_discrete_map={'Analogy': 'blue', 'Baseline': 'red'},
                 category_orders={'Order': ['first', 'second']})
    fig.show()


Mean and Standard Deviation for Comprehension by Visualization Technique and Order:
  VisualizationTechnique   Order      mean       std
0                Analogy   first  5.612903  1.029958
1                Analogy  second  5.233333  1.306654
2               Baseline   first  4.883333  1.194502
3               Baseline  second  5.322581  1.555258

Mean and Standard Deviation for Understanding by Visualization Technique and Order:
  VisualizationTechnique   Order      mean       std
0                Analogy   first  5.354839  1.391977
1                Analogy  second  4.800000  1.527155
2               Baseline   first  4.850000  1.435919
3               Baseline  second  4.951613  1.430664

Mean and Standard Deviation for Confidence by Visualization Technique and Order:
  VisualizationTechnique   Order      mean       std
0                Analogy   first  5.419355  1.208554
1                Analogy  second  4.900000  1.633339
2               Baseline   first  4.833333  1.250988
3      

In [447]:
%pip install --upgrade numpy scipy
from scipy.stats import ttest_ind

Note: you may need to restart the kernel to use updated packages.


In [None]:

print(len(processed_df))  # Ensure the sample sizes are equal

# Extract data for Phase 1's standalone comparison
group1_baseline = processed_df[(processed_df['Order'] == 'first') & (processed_df['VisualizationTechnique'] == 'Baseline')]
group2_analogy = processed_df[(processed_df['Order'] == 'first') & (processed_df['VisualizationTechnique'] == 'Analogy')]

print(len(group1_baseline), len(group2_analogy))  # Ensure the sample sizes are equal

# phase 1
# Perform t-tests for each cognitive load component
for component in ['Comprehension', 'Understanding', 'Confidence']:
    t_stat, p_val = ttest_ind(group1_baseline[component], group2_analogy[component], equal_var=False)  # assume unequal variances
    print(f'{component} - t-statistic: {t_stat}, p-value: {p_val}')


244
60 62
Comprehension - t-statistic: -3.607909447196638, p-value: 0.00045639395469192575
Understanding - t-statistic: -1.9708235905885036, p-value: 0.051055679869729864
Confidence - t-statistic: -2.6302280839933116, p-value: 0.009656290994651595


In [449]:
# post exposure, phase 2
group1_analogy_after_baseline = processed_df[(processed_df['Order'] == 'second') & (processed_df['VisualizationTechnique'] == 'Analogy')]
group2_baseline_after_analogy = processed_df[(processed_df['Order'] == 'second') & (processed_df['VisualizationTechnique'] == 'Baseline')]


# Perform t-tests for each cognitive load component
for component in ['Comprehension', 'Understanding', 'Confidence']:
    t_stat, p_val = ttest_ind(group1_analogy_after_baseline[component], group2_baseline_after_analogy[component], equal_var=False)  # assume unequal variances
    print(f'{component} - t-statistic: {t_stat}, p-value: {p_val}')

Comprehension - t-statistic: -0.34359180138140577, p-value: 0.7317663022914945
Understanding - t-statistic: -0.5654876934261541, p-value: 0.572808056879488
Confidence - t-statistic: -0.5259722917800581, p-value: 0.5998955503874048


In [450]:
import pandas as pd
import plotly.express as px

# Assuming 'data' is your DataFrame already loaded with relevant data
# Construct the dataset with the required columns
cognitive_load_data = []

for index, row in data.iterrows():
    participant_id = row['PROLIFIC_PID']
    group = row['Group']

    # Collecting cognitive load responses, ensuring they are cast to numeric
    try:
        analogy_entry = {
            'ParticipantID': participant_id,
            'VisualizationTechnique': 'Analogy',
            'Order': 'first' if (group == '1') else 'second',
            'MentalDemand': float(row['Mental Demand-A_12']),
            'TemporalDemand': float(row['Temporal Demand-A_12']),
            'Effort': float(row['Effort-A_12']),
            'Frustration': float(row['Frustration-A_12'])
        }
        baseline_entry = {
            'ParticipantID': participant_id,
            'VisualizationTechnique': 'Baseline',
            'Order': 'first' if (group == '2') else 'second',
            'MentalDemand': float(row['Mental Demand-B_12']),
            'TemporalDemand': float(row['Temporal Demand-B_12']),
            'Effort': float(row['Effort-B_12']),
            'Frustration': float(row['Frustration-B_12'])
        }
        cognitive_load_data.extend([analogy_entry, baseline_entry])
    except ValueError:
        print(f"Non-numeric data detected for participant {participant_id}. Skipping this participant.")

# Create a DataFrame
cognitive_load_df = pd.DataFrame(cognitive_load_data)

# Ensure all relevant columns are numeric
cognitive_load_df[['MentalDemand', 'TemporalDemand', 'Effort', 'Frustration']] = cognitive_load_df[['MentalDemand', 'TemporalDemand', 'Effort', 'Frustration']].apply(pd.to_numeric, errors='coerce')

# Manually calculate mean and std for each cognitive load factor per technique
mean_std_mental_demand = cognitive_load_df.groupby('VisualizationTechnique')['MentalDemand'].agg(['mean', 'std']).reset_index()
mean_std_temporal_demand = cognitive_load_df.groupby('VisualizationTechnique')['TemporalDemand'].agg(['mean', 'std']).reset_index()
mean_std_effort = cognitive_load_df.groupby('VisualizationTechnique')['Effort'].agg(['mean', 'std']).reset_index()
mean_std_frustration = cognitive_load_df.groupby('VisualizationTechnique')['Frustration'].agg(['mean', 'std']).reset_index()

# Print the results
print("Mean and Standard Deviation for Mental Demand by VisualizationTechnique:")
print(mean_std_mental_demand)
print("\nMean and Standard Deviation for Temporal Demand by VisualizationTechnique:")
print(mean_std_temporal_demand)
print("\nMean and Standard Deviation for Effort by VisualizationTechnique:")
print(mean_std_effort)
print("\nMean and Standard Deviation for Frustration by VisualizationTechnique:")
print(mean_std_frustration)

# Step 3: Create boxplots for each of the cognitive load factors, side by side for Analogy and Baseline
cognitive_load_attributes = ['MentalDemand', 'TemporalDemand', 'Effort', 'Frustration']
for attr in cognitive_load_attributes:
    fig = px.box(cognitive_load_df, x='VisualizationTechnique', y=attr, color='VisualizationTechnique',
                 title=f'Boxplot for {attr}',
                 labels={attr: attr, 'VisualizationTechnique': 'Technique'},
                 color_discrete_map={'Analogy': 'blue', 'Baseline': 'red'})
    fig.show()


Mean and Standard Deviation for Mental Demand by VisualizationTechnique:
  VisualizationTechnique      mean       std
0                Analogy  3.647541  1.710068
1               Baseline  4.204918  1.719785

Mean and Standard Deviation for Temporal Demand by VisualizationTechnique:
  VisualizationTechnique      mean       std
0                Analogy  2.483607  1.248031
1               Baseline  2.655738  1.568125

Mean and Standard Deviation for Effort by VisualizationTechnique:
  VisualizationTechnique      mean       std
0                Analogy  4.639344  1.646763
1               Baseline  4.991803  1.496530

Mean and Standard Deviation for Frustration by VisualizationTechnique:
  VisualizationTechnique      mean       std
0                Analogy  2.639344  1.454918
1               Baseline  3.024590  1.746132


In [451]:
import pandas as pd

# Assume your data is in a DataFrame called 'df'
# Columns: 'ParticipantID', 'Group', 'Order', 'Technique', 'MentalDemand', 'TemporalDemand', 'Effort', 'Frustration'

# Calculate descriptive statistics for each phase and technique
cognitive_load_stats = cognitive_load_df.groupby(['Order', 'VisualizationTechnique'])[['MentalDemand', 'TemporalDemand', 'Effort', 'Frustration']].agg(['mean', 'std'])
print(cognitive_load_stats)


                              MentalDemand           TemporalDemand            \
                                      mean       std           mean       std   
Order  VisualizationTechnique                                                   
first  Analogy                    3.887097  1.765840       2.564516  1.236348   
       Baseline                   4.350000  1.783398       2.850000  1.715582   
second Analogy                    3.400000  1.628142       2.400000  1.264911   
       Baseline                   4.064516  1.658273       2.467742  1.399271   

                                 Effort           Frustration            
                                   mean       std        mean       std  
Order  VisualizationTechnique                                            
first  Analogy                 4.790323  1.610794    2.741935  1.447842  
       Baseline                5.316667  1.396019    3.233333  1.807392  
second Analogy                 4.483333  1.682328    2.533333 

In [452]:



from scipy.stats import ttest_ind




In [453]:

print(len(cognitive_load_df))  # Ensure the sample sizes are equal

# Extract data for Phase 1's standalone comparison
group1_baseline = cognitive_load_df[(cognitive_load_df['Order'] == 'first') & (cognitive_load_df['VisualizationTechnique'] == 'Baseline')]
group2_analogy = cognitive_load_df[(cognitive_load_df['Order'] == 'first') & (cognitive_load_df['VisualizationTechnique'] == 'Analogy')]

print(len(group1_baseline), len(group2_analogy))  # Ensure the sample sizes are equal

# Perform t-tests for each cognitive load component
for component in ['MentalDemand', 'TemporalDemand', 'Effort', 'Frustration']:
    t_stat, p_val = ttest_ind(group1_baseline[component], group2_analogy[component], equal_var=False)  # assume unequal variances
    print(f'{component} - t-statistic: {t_stat}, p-value: {p_val}')


244
60 62
MentalDemand - t-statistic: 1.440244032651355, p-value: 0.15240708799324212
TemporalDemand - t-statistic: 1.051537665131091, p-value: 0.2953795406545281
Effort - t-statistic: 1.9305729029494425, p-value: 0.05592341528298032
Frustration - t-statistic: 1.6541083419756102, p-value: 0.10088289094680983


In [454]:
group1_analogy_after_baseline = cognitive_load_df[(cognitive_load_df['Order'] == 'second') & (cognitive_load_df['VisualizationTechnique'] == 'Analogy')]
group2_baseline_after_analogy = cognitive_load_df[(cognitive_load_df['Order'] == 'second') & (cognitive_load_df['VisualizationTechnique'] == 'Baseline')]

# Fill missing values
for component in ['MentalDemand', 'TemporalDemand', 'Effort', 'Frustration']:
    group1_analogy_after_baseline[component].fillna(group1_analogy_after_baseline[component].mean(), inplace=True)
    group2_baseline_after_analogy[component].fillna(group2_baseline_after_analogy[component].mean(), inplace=True)

# Perform t-tests
for component in ['MentalDemand', 'TemporalDemand', 'Effort', 'Frustration']:
    t_stat, p_val = ttest_ind(group1_analogy_after_baseline[component], group2_baseline_after_analogy[component], equal_var=False)
    print(f'{component} (Post-Exposure) - t-statistic: {t_stat}, p-value: {p_val}')


# Extract data for Phase 2's post-exposure comparison
# group1_analogy_after_baseline = cognitive_load_df[(cognitive_load_df['Order'] == 'second') & (cognitive_load_df['VisualizationTechnique'] == 'Analogy')]
# group2_baseline_after_analogy = cognitive_load_df[(cognitive_load_df['Order'] == 'second') & (cognitive_load_df['VisualizationTechnique'] == 'Baseline')]

# # Perform t-tests for each cognitive load component
# for component in ['MentalDemand', 'TemporalDemand', 'Effort', 'Frustration']:
#     t_stat, p_val = ttest_ind(group1_analogy_after_baseline[component], group2_baseline_after_analogy[component], equal_var=False)
#     print(f'{component} (Post-Exposure) - t-statistic: {t_stat}, p-value: {p_val}')

MentalDemand (Post-Exposure) - t-statistic: -2.233323914111357, p-value: 0.027381568136487195
TemporalDemand (Post-Exposure) - t-statistic: -0.2806868958382696, p-value: 0.7794361244533046
Effort (Post-Exposure) - t-statistic: -0.665216253697379, p-value: 0.5072085947859702
Frustration (Post-Exposure) - t-statistic: -1.0157858920157654, p-value: 0.3117949869206018



A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For exampl

In [455]:

# Group data by VisualizationTechnique without considering order
analogy_group = cognitive_load_df[cognitive_load_df['VisualizationTechnique'] == 'Analogy']
baseline_group = cognitive_load_df[cognitive_load_df['VisualizationTechnique'] == 'Baseline']


# Print counts to ensure sample sizes
print(f"Analogy Group Size: {len(analogy_group)}, Baseline Group Size: {len(baseline_group)}")

# Perform t-tests for each cognitive load component
cognitive_load_components = ['MentalDemand', 'TemporalDemand', 'Effort', 'Frustration']

# Check group sizes
print(f"Analogy Group Size: {len(analogy_group)}, Baseline Group Size: {len(baseline_group)}")
if len(analogy_group) == 0 or len(baseline_group) == 0:
    raise ValueError("One of the groups is empty. Check your filtering criteria.")

# Fill missing values and ensure data is numeric
for component in cognitive_load_components:
    analogy_group[component] = pd.to_numeric(analogy_group[component], errors='coerce')
    baseline_group[component] = pd.to_numeric(baseline_group[component], errors='coerce')
    
    analogy_group[component].fillna(analogy_group[component].mean(), inplace=True)
    baseline_group[component].fillna(baseline_group[component].mean(), inplace=True)

# Perform t-tests
for component in cognitive_load_components:
    t_stat, p_val = ttest_ind(analogy_group[component], baseline_group[component], equal_var=False)
    print(f'{component} - t-statistic: {t_stat}, p-value: {p_val}')


Analogy Group Size: 122, Baseline Group Size: 122
Analogy Group Size: 122, Baseline Group Size: 122
MentalDemand - t-statistic: -2.538438800147396, p-value: 0.011762785589865883
TemporalDemand - t-statistic: -0.9486592978658789, p-value: 0.34378791212654447
Effort - t-statistic: -1.74953746543098, p-value: 0.08147728496507291
Frustration - t-statistic: -1.872193293390176, p-value: 0.062425220251500706




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inpl

In [456]:
import pandas as pd
import plotly.express as px

# Assuming 'data' is your DataFrame already loaded with relevant data
# Construct the dataset with the required columns
cognitive_load_data = []

for index, row in data.iterrows():
    participant_id = row['PROLIFIC_PID']
    group = row['Group']

    # Collecting cognitive load responses, ensuring they are cast to numeric
    try:
        analogy_entry = {
            'ParticipantID': participant_id,
            'VisualizationTechnique': 'Analogy',
            'Order': 'first' if (group == '1') else 'second',
            'MentalDemand': float(row['Mental Demand-A_12']),
            'TemporalDemand': float(row['Temporal Demand-A_12']),
            'Effort': float(row['Effort-A_12']),
            'Frustration': float(row['Frustration-A_12'])
        }
        baseline_entry = {
            'ParticipantID': participant_id,
            'VisualizationTechnique': 'Baseline',
            'Order': 'first' if (group == '2') else 'second',
            'MentalDemand': float(row['Mental Demand-B_12']),
            'TemporalDemand': float(row['Temporal Demand-B_12']),
            'Effort': float(row['Effort-B_12']),
            'Frustration': float(row['Frustration-B_12'])
        }
        cognitive_load_data.extend([analogy_entry, baseline_entry])
    except ValueError:
        print(f"Non-numeric data detected for participant {participant_id}. Skipping this participant.")

# Create a DataFrame
cognitive_load_df = pd.DataFrame(cognitive_load_data)

# Ensure all relevant columns are numeric
cognitive_load_df[['MentalDemand', 'TemporalDemand', 'Effort', 'Frustration']] = cognitive_load_df[['MentalDemand', 'TemporalDemand', 'Effort', 'Frustration']].apply(pd.to_numeric, errors='coerce')

# Manually calculate mean and std for each cognitive load factor per technique
mean_std_mental_demand = cognitive_load_df.groupby(['VisualizationTechnique', 'Order'])['MentalDemand'].agg(['mean', 'std']).reset_index()
mean_std_temporal_demand = cognitive_load_df.groupby(['VisualizationTechnique', 'Order'])['TemporalDemand'].agg(['mean', 'std']).reset_index()
mean_std_effort = cognitive_load_df.groupby(['VisualizationTechnique', 'Order'])['Effort'].agg(['mean', 'std']).reset_index()
mean_std_frustration = cognitive_load_df.groupby(['VisualizationTechnique', 'Order'])['Frustration'].agg(['mean', 'std']).reset_index()

# Print the results
print("Mean and Standard Deviation for Mental Demand by Visualization Technique and Order:")
print(mean_std_mental_demand)
print("\nMean and Standard Deviation for Temporal Demand by Visualization Technique and Order:")
print(mean_std_temporal_demand)
print("\nMean and Standard Deviation for Effort by Visualization Technique and Order:")
print(mean_std_effort)
print("\nMean and Standard Deviation for Frustration by Visualization Technique and Order:")
print(mean_std_frustration)

# Step 3: Create boxplots for each of the cognitive load factors, showing both order and technique
cognitive_load_attributes = ['MentalDemand', 'TemporalDemand', 'Effort', 'Frustration']
for attr in cognitive_load_attributes:
    fig = px.box(cognitive_load_df, x='Order', y=attr, color='VisualizationTechnique',
                 title=f'Boxplot for {attr} by Visualization Technique and Order',
                 labels={attr: attr, 'Order': 'Order', 'VisualizationTechnique': 'Technique'},
                 color_discrete_map={'Analogy': 'blue', 'Baseline': 'red'},
                 category_orders={'Order': ['first', 'second']})
    fig.show()


Mean and Standard Deviation for Mental Demand by Visualization Technique and Order:
  VisualizationTechnique   Order      mean       std
0                Analogy   first  3.887097  1.765840
1                Analogy  second  3.400000  1.628142
2               Baseline   first  4.350000  1.783398
3               Baseline  second  4.064516  1.658273

Mean and Standard Deviation for Temporal Demand by Visualization Technique and Order:
  VisualizationTechnique   Order      mean       std
0                Analogy   first  2.564516  1.236348
1                Analogy  second  2.400000  1.264911
2               Baseline   first  2.850000  1.715582
3               Baseline  second  2.467742  1.399271

Mean and Standard Deviation for Effort by Visualization Technique and Order:
  VisualizationTechnique   Order      mean       std
0                Analogy   first  4.790323  1.610794
1                Analogy  second  4.483333  1.682328
2               Baseline   first  5.316667  1.396019
3        