In [141]:
%pip install numpy
%pip install pandas plotly


import pandas as pd
from datetime import datetime

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [168]:
# Import necessary libraries
import pandas as pd
import plotly.express as px

# Load the CSV data assuming 'data.csv' is your filename
data = pd.read_csv('temp.csv')

# Step 1: Drop the first row (index 0 since we count from 0)
data = data.drop(index=0).reset_index(drop=True)
data = data.drop(index=0).reset_index(drop=True)

# Task 1: Count the number of participants in different groups
group_count = data['Group'].value_counts().reset_index()
group_count.columns = ['Group', 'count']  # Rename columns for clarity

# Create a bar plot for the number of participants by group
fig = px.bar(group_count, x='Group', y='count', color='Group',
             title="Number of Participants by Group",
             labels={"Group": "Group", "count": "Number of Participants"},
             text='count')  # Add values on top of bars

# Update layout to display values on top of bars
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()


In [169]:


# Task 2: Count how many participants are associated with each chart type
chart_type_count = data['ChartType'].value_counts().reset_index()
chart_type_count.columns = ['ChartType', 'count']  # Rename columns for clarity

# Create a bar plot of the ChartType counts
fig = px.bar(chart_type_count, x='ChartType', y='count', color='ChartType',
             title="Number of Participants by Chart Type",
             labels={"ChartType": "Chart Type", "count": "Number of Participants"},
             text='count')  # Add values on top of bars

# Update layout to display values on top of bars
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()


In [170]:
# Task 3: Count the number of participants by Chart Type and Group
grouped_data = data.groupby(['ChartType', 'Group']).size().reset_index(name='count')

# Define the desired order for the Chart Types
desired_order = ['StackedArea', 'Waterfall', 'ButterflyChart', 'BubbleChart', 
                 'Histogram', 'BarChart', 'Heatmap', 'Sunburst']

# Ensure 'ChartType' in the DataFrame follows the desired order
grouped_data['ChartType'] = pd.Categorical(grouped_data['ChartType'], categories=desired_order, ordered=True)
grouped_data = grouped_data.sort_values('ChartType')

# Create a bar plot, grouping by ChartType and coloring by Group
fig = px.bar(grouped_data, x='ChartType', y='count', color='Group',
             title="Number of Participants by Chart Type and Group",
             labels={"ChartType": "Chart Type", "count": "Number of Participants", "Group": "Group"},
             barmode='group',  # 'group' makes bars side by side for each ChartType
             text='count')  # Add values on top of bars

# Update layout for better appearance of text on top of groups
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()

In [172]:
# Calculate dominant VARK modality for each participant
vark_types = ['V', 'A', 'R', 'K']
vark_columns = [f'VARK {i}' for i in range(1, 17)]

# Function to determine the dominant VARK preference
def find_dominant_vark(row):
    vark_counts = {v: 0 for v in vark_types}
    for col in vark_columns:
        if col in row and pd.notna(row[col]):
            vark_values = str(row[col]).split(',')
            for v in vark_values:
                v = v.strip().upper()
                if v in vark_counts:
                    vark_counts[v] += 1
    max_count = max(vark_counts.values())
    dominant_varks = [v for v, count in vark_counts.items() if count == max_count]
    return dominant_varks[0] if dominant_varks else ''

# Apply the function to each row to compute dominant VARK preference
data['VARK'] = data.apply(find_dominant_vark, axis=1)

# Step 2: Visualization of VARK Learning Preferences

# Count the number of participants for each VARK learning preference
vark_count = data['VARK'].value_counts().reset_index()
vark_count.columns = ['VARK', 'count']  # Rename columns for clarity

# Create a bar plot of the VARK counts
fig = px.bar(vark_count, x='VARK', y='count', color='VARK',
             title="Number of Participants by VARK Learning Preference",
             labels={"VARK": "VARK Learning Preference", "count": "Number of Participants"},
             text='count')  # Add values on top of bars

# Update layout to display values on top of bars
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()

In [173]:
import pandas as pd
import plotly.express as px



# Correctly parse the 'Age' column into numeric values
data['Age'] = pd.to_numeric(data['Age'], errors='coerce')

# Convert the 'Gender' representation from numeric to categorical
data['Gender'] = data['Gender'].replace({'1': 'Male', '2': 'Female'})

# Define new age bins and labels
age_bins = [-float('inf'), 18, 30, 45, float('inf')]
age_labels = ['<18', '19-30', '30-45', '>45']

# Create 'AgeRange' based on the new bins
data['AgeRange'] = pd.cut(data['Age'], bins=age_bins, labels=age_labels)

# Group the data by 'AgeRange' and 'Gender', then count occurrences
grouped_data = data.groupby(['AgeRange', 'Gender']).size().reset_index(name='count')

# Create a bar plot with the updated age ranges
fig = px.bar(grouped_data, x='AgeRange', y='count', color='Gender',
             title="Number of Participants by Age Range and Gender",
             labels={"AgeRange": "Age Range", "count": "Number of Participants", "Gender": "Gender"},
             barmode='group')  # 'group' places bars for Male and Female side by side

# Show the plot
fig.show()





In [174]:
# Ensure the StartDate and EndDate columns are in datetime format
data['StartDate'] = pd.to_datetime(data['StartDate'])
data['EndDate'] = pd.to_datetime(data['EndDate'])

# Calculate the time difference in minutes
data['TimeSpent'] = (data['EndDate'] - data['StartDate']).dt.total_seconds() / 60  # Convert to minutes

# Group participants based on the time spent
def categorize_time(minutes):
    if minutes < 40:
        return '<40 min'
    elif 40 <= minutes <= 60:
        return '40-60 min'
    else:
        return '>60 min'

data['TimeCategory'] = data['TimeSpent'].apply(categorize_time)

# Now, we have a 'TimeCategory' column that groups participants by time spent
# To see the counts of each category:
time_category_counts = data['TimeCategory'].value_counts().reset_index()
time_category_counts.columns = ['TimeCategory', 'Count']

fig = px.bar(time_category_counts, x='TimeCategory', y='Count', 
             title="Time Spent by Participants", 
             labels={'TimeCategory': 'Time Category', 'Count': 'Number of Participants'},
             color='TimeCategory', color_discrete_sequence=px.colors.qualitative.Set1)

# Show the plot
fig.show()

In [175]:
group_1_data = data[data['Group'] == "1"]
group_2_data = data[data['Group'] == "2"]

def count_A_B_per_question(df, columns):
    # Create a dictionary to store counts for each question type
    counts = {col: {'Analogy': 0, 'Baseline': 0} for col in columns}
    
    for col in columns:
        counts[col]['Analogy'] = df[col].str.contains('A', na=False).sum()
        counts[col]['Baseline'] = df[col].str.contains('B', na=False).sum()
    
    return counts

# Columns for Group 1 (Understanding-AB, Effort-AB, Effectiveness-AB, Preference-AB)
group_1_columns = ['Understanding', 'MentalEffort', 'Effectiveness', 'Engagement']

# Count A/B for each question in Group 1
group_1_counts = count_A_B_per_question(group_1_data, group_1_columns)

# Prepare the data for plotting (convert counts into a DataFrame)
plot_data_group_1 = []

for question, counts in group_1_counts.items():
    plot_data_group_1.append({'Question': question, 'Response': 'Analogy', 'Count': counts['Analogy']})
    plot_data_group_1.append({'Question': question, 'Response': 'Baseline', 'Count': counts['Baseline']})

group_1_plot_df = pd.DataFrame(plot_data_group_1)

# Plot the bar chart for Group 1, showing Analogy vs Baseline for each question
fig_group_1 = px.bar(group_1_plot_df, x='Question', y='Count', color='Response', barmode='group',
                     title="Analogy vs Baseline Responses for Group 1",
                     labels={'Question': 'Question Type', 'Count': 'Number of Responses', 'Response': 'Response Type'},
                     color_discrete_map={'Analogy': 'blue', 'Baseline': 'red'})

fig_group_1.update_layout(xaxis_title="Question Type", yaxis_title="Count", barmode='group')

# Show the plot for Group 1
fig_group_1.show()

In [176]:
# Columns for Group 2 (Understanding-BA, Effort-BA, Effectiveness-BA, Preference-BA)
group_2_columns = ['Understanding', 'MentalEffort', 'Effectiveness', 'Engagement']

# Count A/B for each question in Group 2
group_2_counts = count_A_B_per_question(group_2_data, group_2_columns)

# Prepare the data for plotting (convert counts into a DataFrame)
plot_data_group_2 = []

for question, counts in group_2_counts.items():
    plot_data_group_2.append({'Question': question, 'Response': 'Analogy', 'Count': counts['Analogy']})
    plot_data_group_2.append({'Question': question, 'Response': 'Baseline', 'Count': counts['Baseline']})

group_2_plot_df = pd.DataFrame(plot_data_group_2)

# Plot the bar chart for Group 2, showing Analogy vs Baseline for each question
fig_group_2 = px.bar(group_2_plot_df, x='Question', y='Count', color='Response', barmode='group',
                     title="Analogy vs Baseline Responses for Group 2",
                     labels={'Question': 'Question Type', 'Count': 'Number of Responses', 'Response': 'Response Type'},
                     color_discrete_map={'Analogy': 'blue', 'Baseline': 'red'})

fig_group_2.update_layout(xaxis_title="Question Type", yaxis_title="Count", barmode='group')

# Show the plot for Group 2
fig_group_2.show()

In [177]:
# Stacked Bar chart for both groups combined
# Function to count 'A' and 'B' responses for each question
def count_A_B_per_question(df, columns):
    counts = {col: {'Analogy': 0, 'Baseline': 0} for col in columns}
    for col in columns:
        counts[col]['Analogy'] = df[col].str.contains('A', na=False).sum()
        counts[col]['Baseline'] = df[col].str.contains('B', na=False).sum()
    return counts

# Define columns for Group 1 and Group 2 (assuming the same here for simplicity)
columns = ['Understanding', 'MentalEffort', 'Effectiveness', 'Engagement']

# Count A/B for each question in Group 1 and Group 2
group_1_counts = count_A_B_per_question(data[data['Group'] == "1"], columns)
group_2_counts = count_A_B_per_question(data[data['Group'] == "2"], columns)

# Prepare combined data for plotting
plot_data_combined = []

def prepare_plot_data(group_counts, group_name):
    for question, counts in group_counts.items():
        plot_data_combined.append({'Question': question, 'Response': 'Analogy', 'Count': counts['Analogy'], 'Group': group_name})
        plot_data_combined.append({'Question': question, 'Response': 'Baseline', 'Count': counts['Baseline'], 'Group': group_name})

prepare_plot_data(group_1_counts, 'Group 1')
prepare_plot_data(group_2_counts, 'Group 2')

combined_plot_df = pd.DataFrame(plot_data_combined)

# Plot the horizontal stacked bar chart
fig = px.bar(combined_plot_df, y='Question', x='Count', color='Response', orientation='h', 
             title="Analogy vs Baseline Responses for Both Groups",
             labels={'Question': 'Question Type', 'Count': 'Number of Responses', 'Response': 'Response Type'},
             color_discrete_map={'Analogy': 'blue', 'Baseline': 'red'})

# Update layout to make the chart horizontal stacked
fig.update_layout(barmode='stack', yaxis={'categoryorder':'total descending'})

# Show the plot
fig.show()



In [178]:
import plotly.express as px

# Assuming 'data' is your DataFrame containing the ratings
# Ensure the ratings columns are numeric
data['Interpretation_1'] = pd.to_numeric(data['Interpretation_1'], errors='coerce')
data['Context_1'] = pd.to_numeric(data['Context_1'], errors='coerce')
data['DataRelationship_1'] = pd.to_numeric(data['DataRelationship_1'], errors='coerce')

# Define all possible ratings
all_ratings = range(1, 8)  # Ratings from 1 to 7 inclusive

# Calculate counts of each rating for each variable
interpretation_counts = data['Interpretation_1'].value_counts().sort_index().reindex(all_ratings, fill_value=0)
context_counts = data['Context_1'].value_counts().sort_index().reindex(all_ratings, fill_value=0)
data_relationship_counts = data['DataRelationship_1'].value_counts().sort_index().reindex(all_ratings, fill_value=0)

# Create a DataFrame with these counts
counts_df = pd.DataFrame({
    'Interpretation': interpretation_counts,
    'Context': context_counts,
    'Data Relationship': data_relationship_counts
}).transpose().reset_index().rename(columns={'index': 'Variable'})

# Reshape the DataFrame for plotting
long_df = counts_df.melt(id_vars=['Variable'], value_vars=[1, 2, 3, 4, 5, 6, 7],
                         var_name='Rating', value_name='Count')

# Ensure 'Rating' is treated as a categorical variable with the correct order
long_df['Rating'] = long_df['Rating'].astype(int).astype(str)
long_df['Rating'] = pd.Categorical(long_df['Rating'], categories=[str(i) for i in range(1, 8)], ordered=True)

# Define a color sequence with 7 distinct colors
color_sequence = px.colors.qualitative.Pastel  # Or any other color sequence with 7 colors

# Create the stacked bar chart
fig = px.bar(long_df, x='Variable', y='Count', color='Rating',
             title="Strongly Disagree (1) - Strongly Agree (7)",
             labels={'Variable': '', 'Count': 'Number of Ratings'},
             width=800, height=600,
             color_discrete_sequence=color_sequence)

fig.update_layout(barmode='stack')

# Show the figure
fig.show()


In [179]:
import pandas as pd
import plotly.express as px

# Assuming 'data' is your DataFrame already loaded with relevant data

# Step 1: Construct the dataset with the required columns
processed_data = []

for index, row in data.iterrows():
    participant_id = row['PROLIFIC_PID']
    group = row['Group']

    # Ensure numeric conversion with error handling
    comprehension_a = pd.to_numeric(row['Comprehension-A_12'], errors='coerce')
    understanding_a = pd.to_numeric(row['Understanding-A_12'], errors='coerce')
    confidence_a = pd.to_numeric(row['Confidence-A_12'], errors='coerce')
    comprehension_b = pd.to_numeric(row['Comprehension-B_12'], errors='coerce')
    understanding_b = pd.to_numeric(row['Understanding-B_12'], errors='coerce')
    confidence_b = pd.to_numeric(row['Confidence-B_12'], errors='coerce')

    # Analogy data (denoted with 'A')
    analogy_entry = {
        'ParticipantID': participant_id,
        'VisualizationTechnique': 'Analogy',
        'Order': 'first' if (group == '1') else 'second',
        'Comprehension': comprehension_a,
        'Understanding': understanding_a,
        'Confidence': confidence_a
    }
    processed_data.append(analogy_entry)

    # Baseline data (denoted with 'B')
    baseline_entry = {
        'ParticipantID': participant_id,
        'VisualizationTechnique': 'Baseline',
        'Order': 'first' if (group == '2') else 'second',
        'Comprehension': comprehension_b,
        'Understanding': understanding_b,
        'Confidence': confidence_b
    }
    processed_data.append(baseline_entry)

# Create a DataFrame
processed_df = pd.DataFrame(processed_data)

print(processed_df.head())

# Ensure all relevant columns are numeric
processed_df[['Comprehension', 'Understanding', 'Confidence']] = processed_df[['Comprehension', 'Understanding', 'Confidence']].apply(pd.to_numeric)

# Step 2: Calculate mean and standard deviation for each visualization technique
# Manually calculate mean and std for comprehension, understanding, and confidence per technique
mean_std_comprehension = processed_df.groupby('VisualizationTechnique')['Comprehension'].agg(['mean', 'std']).reset_index()
mean_std_understanding = processed_df.groupby('VisualizationTechnique')['Understanding'].agg(['mean', 'std']).reset_index()
mean_std_confidence = processed_df.groupby('VisualizationTechnique')['Confidence'].agg(['mean', 'std']).reset_index()

# Print the results
print("Mean and Standard Deviation for Comprehension by VisualizationTechnique:")
print(mean_std_comprehension)
print("\nMean and Standard Deviation for Understanding by VisualizationTechnique:")
print(mean_std_understanding)
print("\nMean and Standard Deviation for Confidence by VisualizationTechnique:")
print(mean_std_confidence)

# Print the results
print("Mean and Standard Deviation for Each Visualization Technique:")
# print(mean_std_output)

# Step 3: Create boxplots for each of the perceptions, side by side for Analogy and Baseline
attributes = ['Comprehension', 'Understanding', 'Confidence']
for attr in attributes:
    fig = px.box(processed_df, x='VisualizationTechnique', y=attr, color='VisualizationTechnique',
                 title=f'Boxplot for {attr}',
                 labels={attr: attr, 'VisualizationTechnique': 'Technique'},
                 color_discrete_map={'Analogy': 'blue', 'Baseline': 'red'})
    fig.show()

              ParticipantID VisualizationTechnique   Order  Comprehension  \
0  60f47600a931759ef3d1a505                Analogy  second              6   
1  60f47600a931759ef3d1a505               Baseline   first              4   
2  672a528ae534c28289a223d6                Analogy   first              5   
3  672a528ae534c28289a223d6               Baseline  second              5   
4  647df23a62de26114fdcb29c                Analogy   first              6   

   Understanding  Confidence  
0              6           6  
1              6           4  
2              6           4  
3              4           3  
4              6           6  
Mean and Standard Deviation for Comprehension by VisualizationTechnique:
  VisualizationTechnique      mean       std
0                Analogy  5.101449  1.456678
1               Baseline  5.086957  1.291820

Mean and Standard Deviation for Understanding by VisualizationTechnique:
  VisualizationTechnique      mean       std
0                Analogy

In [180]:
import pandas as pd
import plotly.express as px

# Assuming 'data' is your DataFrame already loaded with relevant data
# Construct the dataset with the required columns
perception_data = []

for index, row in data.iterrows():
    participant_id = row['PROLIFIC_PID']
    group = row['Group']

    # Collecting perception responses, ensuring they are cast to numeric
    try:
        analogy_entry = {
            'ParticipantID': participant_id,
            'VisualizationTechnique': 'Analogy',
            'Order': 'first' if (group == '1') else 'second',
            'Comprehension': float(row['Comprehension-A_12']),
            'Understanding': float(row['Understanding-A_12']),
            'Confidence': float(row['Confidence-A_12'])
        }
        baseline_entry = {
            'ParticipantID': participant_id,
            'VisualizationTechnique': 'Baseline',
            'Order': 'first' if (group == '2') else 'second',
            'Comprehension': float(row['Comprehension-B_12']),
            'Understanding': float(row['Understanding-B_12']),
            'Confidence': float(row['Confidence-B_12'])
        }
        perception_data.extend([analogy_entry, baseline_entry])
    except ValueError:
        print(f"Non-numeric data detected for participant {participant_id}. Skipping this participant.")

# Create a DataFrame
perception_df = pd.DataFrame(perception_data)

# Ensure all relevant columns are numeric
perception_df[['Comprehension', 'Understanding', 'Confidence']] = perception_df[['Comprehension', 'Understanding', 'Confidence']].apply(pd.to_numeric, errors='coerce')

# Calculate mean and std for each perception factor by technique and order
mean_std_comprehension = perception_df.groupby(['VisualizationTechnique', 'Order'])['Comprehension'].agg(['mean', 'std']).reset_index()
mean_std_understanding = perception_df.groupby(['VisualizationTechnique', 'Order'])['Understanding'].agg(['mean', 'std']).reset_index()
mean_std_confidence = perception_df.groupby(['VisualizationTechnique', 'Order'])['Confidence'].agg(['mean', 'std']).reset_index()

# Print the results
print("Mean and Standard Deviation for Comprehension by Visualization Technique and Order:")
print(mean_std_comprehension)
print("\nMean and Standard Deviation for Understanding by Visualization Technique and Order:")
print(mean_std_understanding)
print("\nMean and Standard Deviation for Confidence by Visualization Technique and Order:")
print(mean_std_confidence)

# Create boxplots for each of the perception metrics, showing both order and technique
perception_attributes = ['Comprehension', 'Understanding', 'Confidence']
for attr in perception_attributes:
    fig = px.box(perception_df, x='Order', y=attr, color='VisualizationTechnique',
                 title=f'Boxplot for {attr} by Visualization Technique and Order',
                 labels={attr: attr, 'Order': 'Order', 'VisualizationTechnique': 'Technique'},
                 color_discrete_map={'Analogy': 'blue', 'Baseline': 'red'},
                 category_orders={'Order': ['first', 'second']})
    fig.show()


Mean and Standard Deviation for Comprehension by Visualization Technique and Order:
  VisualizationTechnique   Order      mean       std
0                Analogy   first  5.000000  1.541104
1                Analogy  second  5.194444  1.390158
2               Baseline   first  5.027778  1.298045
3               Baseline  second  5.151515  1.301951

Mean and Standard Deviation for Understanding by Visualization Technique and Order:
  VisualizationTechnique   Order      mean       std
0                Analogy   first  4.787879  1.452532
1                Analogy  second  4.777778  1.532712
2               Baseline   first  4.750000  1.556094
3               Baseline  second  4.787879  1.473889

Mean and Standard Deviation for Confidence by Visualization Technique and Order:
  VisualizationTechnique   Order      mean       std
0                Analogy   first  4.909091  1.331438
1                Analogy  second  4.805556  1.653040
2               Baseline   first  4.805556  1.237958
3      

In [181]:
%pip install --upgrade numpy scipy
from scipy.stats import ttest_ind

Note: you may need to restart the kernel to use updated packages.


In [182]:

print(len(processed_df))  # Ensure the sample sizes are equal

# Extract data for Phase 1's standalone comparison
group1_baseline = processed_df[(processed_df['Order'] == 'first') & (processed_df['VisualizationTechnique'] == 'Baseline')]
group2_analogy = processed_df[(processed_df['Order'] == 'first') & (processed_df['VisualizationTechnique'] == 'Analogy')]

print(len(group1_baseline), len(group2_analogy))  # Ensure the sample sizes are equal

# Perform t-tests for each cognitive load component
for component in ['Comprehension', 'Understanding', 'Confidence']:
    t_stat, p_val = ttest_ind(group1_baseline[component], group2_analogy[component], equal_var=False)  # assume unequal variances
    print(f'{component} - t-statistic: {t_stat}, p-value: {p_val}')


138
36 33
Comprehension - t-statistic: 0.08060065080787171, p-value: 0.9360156549476752
Understanding - t-statistic: -0.10457662104726023, p-value: 0.9170243174139918
Confidence - t-statistic: -0.33365610907797966, p-value: 0.7397079422535164


In [183]:
# Extract data for Phase 1's standalone comparison
group1_analogy_after_baseline = processed_df[(processed_df['Order'] == 'second') & (processed_df['VisualizationTechnique'] == 'Analogy')]
group2_baseline_after_analogy = processed_df[(processed_df['Order'] == 'second') & (processed_df['VisualizationTechnique'] == 'Baseline')]


# Perform t-tests for each cognitive load component
for component in ['Comprehension', 'Understanding', 'Confidence']:
    t_stat, p_val = ttest_ind(group1_analogy_after_baseline[component], group2_baseline_after_analogy[component], equal_var=False)  # assume unequal variances
    print(f'{component} - t-statistic: {t_stat}, p-value: {p_val}')

Comprehension - t-statistic: 0.1324526613797967, p-value: 0.8950234535534981
Understanding - t-statistic: -0.027899034743725022, p-value: 0.9778258134519484
Confidence - t-statistic: -0.6551684666446789, p-value: 0.5146066726434532


In [184]:
import pandas as pd
import plotly.express as px

# Assuming 'data' is your DataFrame already loaded with relevant data
# Construct the dataset with the required columns
cognitive_load_data = []

for index, row in data.iterrows():
    participant_id = row['PROLIFIC_PID']
    group = row['Group']

    # Collecting cognitive load responses, ensuring they are cast to numeric
    try:
        analogy_entry = {
            'ParticipantID': participant_id,
            'VisualizationTechnique': 'Analogy',
            'Order': 'first' if (group == '1') else 'second',
            'MentalDemand': float(row['Mental Demand-A_12']),
            'TemporalDemand': float(row['Temporal Demand-A_12']),
            'Effort': float(row['Effort-A_12']),
            'Frustration': float(row['Frustration-A_12'])
        }
        baseline_entry = {
            'ParticipantID': participant_id,
            'VisualizationTechnique': 'Baseline',
            'Order': 'first' if (group == '2') else 'second',
            'MentalDemand': float(row['Mental Demand-B_12']),
            'TemporalDemand': float(row['Temporal Demand-B_12']),
            'Effort': float(row['Effort-B_12']),
            'Frustration': float(row['Frustration-B_12'])
        }
        cognitive_load_data.extend([analogy_entry, baseline_entry])
    except ValueError:
        print(f"Non-numeric data detected for participant {participant_id}. Skipping this participant.")

# Create a DataFrame
cognitive_load_df = pd.DataFrame(cognitive_load_data)

# Ensure all relevant columns are numeric
cognitive_load_df[['MentalDemand', 'TemporalDemand', 'Effort', 'Frustration']] = cognitive_load_df[['MentalDemand', 'TemporalDemand', 'Effort', 'Frustration']].apply(pd.to_numeric, errors='coerce')

# Manually calculate mean and std for each cognitive load factor per technique
mean_std_mental_demand = cognitive_load_df.groupby('VisualizationTechnique')['MentalDemand'].agg(['mean', 'std']).reset_index()
mean_std_temporal_demand = cognitive_load_df.groupby('VisualizationTechnique')['TemporalDemand'].agg(['mean', 'std']).reset_index()
mean_std_effort = cognitive_load_df.groupby('VisualizationTechnique')['Effort'].agg(['mean', 'std']).reset_index()
mean_std_frustration = cognitive_load_df.groupby('VisualizationTechnique')['Frustration'].agg(['mean', 'std']).reset_index()

# Print the results
print("Mean and Standard Deviation for Mental Demand by VisualizationTechnique:")
print(mean_std_mental_demand)
print("\nMean and Standard Deviation for Temporal Demand by VisualizationTechnique:")
print(mean_std_temporal_demand)
print("\nMean and Standard Deviation for Effort by VisualizationTechnique:")
print(mean_std_effort)
print("\nMean and Standard Deviation for Frustration by VisualizationTechnique:")
print(mean_std_frustration)

# Step 3: Create boxplots for each of the cognitive load factors, side by side for Analogy and Baseline
cognitive_load_attributes = ['MentalDemand', 'TemporalDemand', 'Effort', 'Frustration']
for attr in cognitive_load_attributes:
    fig = px.box(cognitive_load_df, x='VisualizationTechnique', y=attr, color='VisualizationTechnique',
                 title=f'Boxplot for {attr}',
                 labels={attr: attr, 'VisualizationTechnique': 'Technique'},
                 color_discrete_map={'Analogy': 'blue', 'Baseline': 'red'})
    fig.show()


Mean and Standard Deviation for Mental Demand by VisualizationTechnique:
  VisualizationTechnique      mean       std
0                Analogy  4.028986  1.782026
1               Baseline  4.275362  1.773034

Mean and Standard Deviation for Temporal Demand by VisualizationTechnique:
  VisualizationTechnique      mean       std
0                Analogy  2.681159  1.567058
1               Baseline  2.710145  1.572759

Mean and Standard Deviation for Effort by VisualizationTechnique:
  VisualizationTechnique      mean       std
0                Analogy  5.202899  1.471091
1               Baseline  5.057971  1.513508

Mean and Standard Deviation for Frustration by VisualizationTechnique:
  VisualizationTechnique      mean       std
0                Analogy  2.826087  1.740153
1               Baseline  2.594203  1.498223


In [185]:
import pandas as pd

# Assume your data is in a DataFrame called 'df'
# Columns: 'ParticipantID', 'Group', 'Order', 'Technique', 'MentalDemand', 'TemporalDemand', 'Effort', 'Frustration'

# Calculate descriptive statistics for each phase and technique
cognitive_load_stats = cognitive_load_df.groupby(['Order', 'VisualizationTechnique'])[['MentalDemand', 'TemporalDemand', 'Effort', 'Frustration']].agg(['mean', 'std'])
print(cognitive_load_stats)


                              MentalDemand           TemporalDemand            \
                                      mean       std           mean       std   
Order  VisualizationTechnique                                                   
first  Analogy                    3.878788  1.763476       2.333333  1.136515   
       Baseline                   4.472222  1.647268       2.944444  1.620308   
second Analogy                    4.166667  1.812654       3.000000  1.836145   
       Baseline                   4.060606  1.902948       2.454545  1.501893   

                                 Effort           Frustration            
                                   mean       std        mean       std  
Order  VisualizationTechnique                                            
first  Analogy                 5.303030  1.402784    2.242424  1.299767  
       Baseline                5.055556  1.602577    2.527778  1.383290  
second Analogy                 5.111111  1.545090    3.361111 

In [186]:


from scipy.stats import ttest_ind

In [187]:

print(len(cognitive_load_df))  # Ensure the sample sizes are equal

# Extract data for Phase 1's standalone comparison
group1_baseline = cognitive_load_df[(cognitive_load_df['Order'] == 'first') & (cognitive_load_df['VisualizationTechnique'] == 'Baseline')]
group2_analogy = cognitive_load_df[(cognitive_load_df['Order'] == 'first') & (cognitive_load_df['VisualizationTechnique'] == 'Analogy')]

print(len(group1_baseline), len(group2_analogy))  # Ensure the sample sizes are equal

# Perform t-tests for each cognitive load component
for component in ['MentalDemand', 'TemporalDemand', 'Effort', 'Frustration']:
    t_stat, p_val = ttest_ind(group1_baseline[component], group2_analogy[component], equal_var=False)  # assume unequal variances
    print(f'{component} - t-statistic: {t_stat}, p-value: {p_val}')


138
36 33
MentalDemand - t-statistic: 1.4409323930906925, p-value: 0.15437424376314063
TemporalDemand - t-statistic: 1.8254806369222614, p-value: 0.07268041505091231
Effort - t-statistic: -0.6838229591787429, p-value: 0.4964499700448731
Frustration - t-statistic: 0.8833741843780323, p-value: 0.3801950404521689


In [188]:
# Extract data for Phase 2's post-exposure comparison
group1_analogy_after_baseline = cognitive_load_df[(cognitive_load_df['Order'] == 'second') & (cognitive_load_df['VisualizationTechnique'] == 'Analogy')]
group2_baseline_after_analogy = cognitive_load_df[(cognitive_load_df['Order'] == 'second') & (cognitive_load_df['VisualizationTechnique'] == 'Baseline')]

# Perform t-tests for each cognitive load component
for component in ['MentalDemand', 'TemporalDemand', 'Effort', 'Frustration']:
    t_stat, p_val = ttest_ind(group1_analogy_after_baseline[component], group2_baseline_after_analogy[component], equal_var=False)
    print(f'{component} (Post-Exposure) - t-statistic: {t_stat}, p-value: {p_val}')

MentalDemand (Post-Exposure) - t-statistic: 0.23656595790439447, p-value: 0.8137287690091453
TemporalDemand (Post-Exposure) - t-statistic: 1.3551726867530147, p-value: 0.17997028612758986
Effort (Post-Exposure) - t-statistic: 0.14078194873465166, p-value: 0.8884647001414407
Frustration (Post-Exposure) - t-statistic: 1.6179422022400172, p-value: 0.11040224116760197


In [189]:
# Group data by VisualizationTechnique without considering order
analogy_group = cognitive_load_df[cognitive_load_df['VisualizationTechnique'] == 'Analogy']
baseline_group = cognitive_load_df[cognitive_load_df['VisualizationTechnique'] == 'Baseline']

# Print counts to ensure sample sizes
print(f"Analogy Group Size: {len(analogy_group)}, Baseline Group Size: {len(baseline_group)}")

# Perform t-tests for each cognitive load component
cognitive_load_components = ['MentalDemand', 'TemporalDemand', 'Effort', 'Frustration']

for component in cognitive_load_components:
    t_stat, p_val = ttest_ind(analogy_group[component], baseline_group[component], equal_var=False)
    print(f'{component} - t-statistic: {t_stat}, p-value: {p_val}')

Analogy Group Size: 69, Baseline Group Size: 69
MentalDemand - t-statistic: -0.8141248832112583, p-value: 0.4169966397661107
TemporalDemand - t-statistic: -0.10844646816735494, p-value: 0.9138013802071465
Effort - t-statistic: 0.5703750694371392, p-value: 0.5693652235652642
Frustration - t-statistic: 0.838831230622745, p-value: 0.4030684388224701


In [190]:
import pandas as pd
import plotly.express as px

# Assuming 'data' is your DataFrame already loaded with relevant data
# Construct the dataset with the required columns
cognitive_load_data = []

for index, row in data.iterrows():
    participant_id = row['PROLIFIC_PID']
    group = row['Group']

    # Collecting cognitive load responses, ensuring they are cast to numeric
    try:
        analogy_entry = {
            'ParticipantID': participant_id,
            'VisualizationTechnique': 'Analogy',
            'Order': 'first' if (group == '1') else 'second',
            'MentalDemand': float(row['Mental Demand-A_12']),
            'TemporalDemand': float(row['Temporal Demand-A_12']),
            'Effort': float(row['Effort-A_12']),
            'Frustration': float(row['Frustration-A_12'])
        }
        baseline_entry = {
            'ParticipantID': participant_id,
            'VisualizationTechnique': 'Baseline',
            'Order': 'first' if (group == '2') else 'second',
            'MentalDemand': float(row['Mental Demand-B_12']),
            'TemporalDemand': float(row['Temporal Demand-B_12']),
            'Effort': float(row['Effort-B_12']),
            'Frustration': float(row['Frustration-B_12'])
        }
        cognitive_load_data.extend([analogy_entry, baseline_entry])
    except ValueError:
        print(f"Non-numeric data detected for participant {participant_id}. Skipping this participant.")

# Create a DataFrame
cognitive_load_df = pd.DataFrame(cognitive_load_data)

# Ensure all relevant columns are numeric
cognitive_load_df[['MentalDemand', 'TemporalDemand', 'Effort', 'Frustration']] = cognitive_load_df[['MentalDemand', 'TemporalDemand', 'Effort', 'Frustration']].apply(pd.to_numeric, errors='coerce')

# Manually calculate mean and std for each cognitive load factor per technique
mean_std_mental_demand = cognitive_load_df.groupby(['VisualizationTechnique', 'Order'])['MentalDemand'].agg(['mean', 'std']).reset_index()
mean_std_temporal_demand = cognitive_load_df.groupby(['VisualizationTechnique', 'Order'])['TemporalDemand'].agg(['mean', 'std']).reset_index()
mean_std_effort = cognitive_load_df.groupby(['VisualizationTechnique', 'Order'])['Effort'].agg(['mean', 'std']).reset_index()
mean_std_frustration = cognitive_load_df.groupby(['VisualizationTechnique', 'Order'])['Frustration'].agg(['mean', 'std']).reset_index()

# Print the results
print("Mean and Standard Deviation for Mental Demand by Visualization Technique and Order:")
print(mean_std_mental_demand)
print("\nMean and Standard Deviation for Temporal Demand by Visualization Technique and Order:")
print(mean_std_temporal_demand)
print("\nMean and Standard Deviation for Effort by Visualization Technique and Order:")
print(mean_std_effort)
print("\nMean and Standard Deviation for Frustration by Visualization Technique and Order:")
print(mean_std_frustration)

# Step 3: Create boxplots for each of the cognitive load factors, showing both order and technique
cognitive_load_attributes = ['MentalDemand', 'TemporalDemand', 'Effort', 'Frustration']
for attr in cognitive_load_attributes:
    fig = px.box(cognitive_load_df, x='Order', y=attr, color='VisualizationTechnique',
                 title=f'Boxplot for {attr} by Visualization Technique and Order',
                 labels={attr: attr, 'Order': 'Order', 'VisualizationTechnique': 'Technique'},
                 color_discrete_map={'Analogy': 'blue', 'Baseline': 'red'},
                 category_orders={'Order': ['first', 'second']})
    fig.show()


Mean and Standard Deviation for Mental Demand by Visualization Technique and Order:
  VisualizationTechnique   Order      mean       std
0                Analogy   first  3.878788  1.763476
1                Analogy  second  4.166667  1.812654
2               Baseline   first  4.472222  1.647268
3               Baseline  second  4.060606  1.902948

Mean and Standard Deviation for Temporal Demand by Visualization Technique and Order:
  VisualizationTechnique   Order      mean       std
0                Analogy   first  2.333333  1.136515
1                Analogy  second  3.000000  1.836145
2               Baseline   first  2.944444  1.620308
3               Baseline  second  2.454545  1.501893

Mean and Standard Deviation for Effort by Visualization Technique and Order:
  VisualizationTechnique   Order      mean       std
0                Analogy   first  5.303030  1.402784
1                Analogy  second  5.111111  1.545090
2               Baseline   first  5.055556  1.602577
3        