In [22]:
import pandas as pd

# Read the Excel file into a DataFrame
df = pd.read_excel('wrangled_reading_score_data.xlsx')

# Identify all columns that contain "Opinion" but not "Author" in their names
opinion_columns = [col for col in df.columns if 'Opinion' in col and 'Author' not in col and 'Type' not in col]
author_columns = [col for col in df.columns if 'Author' in col]

# Convert these columns to floats
for col in opinion_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')
# Convert author columns to strings
for col in author_columns:
    df[col] = df[col].astype(str)
    df[col] = df[col].str.replace('’', "'")
    df[col] = df[col].str.replace(r'\bGinsbur\b', 'Ginsburg', regex=True)

# Convert the remaining score columns to floats
df['Syllabus F-K Score'] = pd.to_numeric(df['Syllabus F-K Score'], errors='coerce')
df['Syllabus FRE Score'] = pd.to_numeric(df['Syllabus FRE Score'], errors='coerce')

#Strip whitespace from author columns
for col in author_columns:
    df[col] = df[col].str.replace('Justuce', 'Justice').str.strip()
for col in author_columns:
    df[col] = df[col].str.replace('Justicw', 'Justice').str.strip()
for col in author_columns:
    df[col] = df[col].str.replace('Ginsburg', 'Ginsberg').str.strip()
df.head()

Unnamed: 0,Case,Year,Syllabus FRE Score,Syllabus F-K Score,Opinion 1 FRE Score,Opinion 1 F-K Score,Opinion 1 Author(s),Opinion 2 FRE Score,Opinion 2 F-K Score,Opinion 2 Author(s),...,Opinion 7 Author(s),Opinion 7 Type,Opinion 8 FRE Score,Opinion 8 F-K Score,Opinion 8 Author(s),Opinion 8 Type,Opinion 9 FRE Score,Opinion 9 F-K Score,Opinion 9 Author(s),Opinion 9 Type
0,Postal Service v. Gregory,2001,45.356667,11.266667,46.713333,1.966667,Justice O'Connor,47.4975,1.95,Justice Thomas,...,,,,,,,,,,
1,TRW INC. v. ANDREWS,2001,43.84,12.5,5.292377,1.653846,Justice Ginsberg,68.2,7.94,Justice Scalia,...,,,,,,,,,,
2,CORRECTIONAL SERVICES CORP. v. MALESKO,2001,44.19,11.7,49.92,1.366667,Chief Justice Rehnquist,67.76,6.8,Justice Scalia,...,,,,,,,,,,
3,CHICKASAW NATION v. UNITED STATES,2001,49.76,1.233333,5.888333,9.883333,Justice Breyer,5.227143,1.285714,Justice O'Connor,...,,,,,,,,,,
4,"ADARAND CONSTRUCTORS, INC. v. MINETA,_x000D_\n...",2001,56.0,9.25,54.238571,9.314286,Per Curiam,,,,...,,,,,,,,,,


In [23]:
# Identify all columns that contain "Opinion" and "Type" in their names
opinion_type_columns = [col for col in df.columns if 'Opinion' in col and 'Type' in col]
for col in opinion_type_columns:
    df[col] = df[col].astype(str)
df['Opinion 2 Type'].unique()

# Create the "Controversial" flag
df['Controversial'] = df[opinion_type_columns].apply(lambda row: (row == 'dissenting').sum() >= 2, axis=1).astype(int)
df[df['Controversial'] == 1]

Unnamed: 0,Case,Year,Syllabus FRE Score,Syllabus F-K Score,Opinion 1 FRE Score,Opinion 1 F-K Score,Opinion 1 Author(s),Opinion 2 FRE Score,Opinion 2 F-K Score,Opinion 2 Author(s),...,Opinion 7 Type,Opinion 8 FRE Score,Opinion 8 F-K Score,Opinion 8 Author(s),Opinion 8 Type,Opinion 9 FRE Score,Opinion 9 F-K Score,Opinion 9 Author(s),Opinion 9 Type,Controversial
31,"MICKENS v. TAYLOR, WARDEN",2001,47.330000,1.500000,47.515385,11.215385,Justice Scalia,49.277500,1.775000,Justice Kennedy,...,,,,,,,,,,1
36,UNITED STATES v. CRAFT,2001,49.910000,11.550000,58.135714,9.157143,Justice O'Connor,51.780000,1.900000,Justice Scalia,...,,,,,,,,,,1
37,"TAHOE-SIERRA PRESERVATION COUNCIL, INC.,_x000D...",2001,44.750000,11.775000,47.378919,11.600000,Justice Stevens,49.488333,1.516667,Chief Justice Rehnquist,...,,,,,,,,,,1
39,"US AIRWAYS, INC. v. BARNETT",2001,39.136667,12.266667,43.485714,11.242857,Justice Breyer,49.150000,11.900000,Justice Stevens,...,,,,,,,,,,1
50,FEDERAL MARITIME COMMISSION v. SOUTH_x000D_\nC...,2001,41.835000,13.125000,49.227391,11.213435,Justice Thomas,53.496667,9.533333,Justice Stevens,...,,,,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1662,"SAMIA, aka SAMIC v. UNITED STATES",2022,53.368000,1.260000,53.640000,1.394118,Justice Thomas,65.593333,7.633333,Justice Barrett,...,,,,,,,,,,1
1667,COUNTERMAN v. COLORADO,2022,63.295000,7.475000,63.855714,7.542857,Justice Kagan,61.231818,8.260000,Justice Sotomayor,...,,,,,,,,,,1
1669,"STUDENTS FOR FAIR ADMISSIONS, INC. v._x000D_\n...",2022,58.737692,8.576924,6.892895,8.339474,Chief Justice Roberts,54.558718,9.758772,Justice Thomas,...,,,,,,,,,,1
1684,"WILKINSON v. GARLAND, ATTORNEY GENERAL",2023,47.836667,11.000000,53.233571,9.850000,Justice Sotomayor,57.520000,8.700000,Justice Jackson,...,,,,,,,,,,1


In [24]:
unique_justices_opinion1 = df['Opinion 1 Author(s)'].nunique()
unique_justices_opinion1


19

In [25]:
cases_per_year = df.groupby('Year').size()
cases_per_year

dobbs_case = df[df['Case'].str.contains('dobbs', case=False, na=False)]
dobbs_case

Unnamed: 0,Case,Year,Syllabus FRE Score,Syllabus F-K Score,Opinion 1 FRE Score,Opinion 1 F-K Score,Opinion 1 Author(s),Opinion 2 FRE Score,Opinion 2 F-K Score,Opinion 2 Author(s),...,Opinion 7 Type,Opinion 8 FRE Score,Opinion 8 F-K Score,Opinion 8 Author(s),Opinion 8 Type,Opinion 9 FRE Score,Opinion 9 F-K Score,Opinion 9 Author(s),Opinion 9 Type,Controversial
1608,"DOBBS, STATE HEALTH OFFICER OF THE MISSISSIPPI...",2021,61.967692,8.538462,6.768533,8.729333,Justice Alito,61.896667,7.666667,Justice Thomas,...,,,,,,,,,,0


In [26]:
# Create line chart for F-K scores
import pandas as pd
import altair as alt

avg_scores = df.groupby('Year').agg({'Opinion 1 F-K Score': 'mean', 'Opinion 1 FRE Score': 'mean'}).reset_index()

# Create the nearest selection
nearest = alt.selection_point(on='mouseover', nearest=True, empty='none', fields=['Year'])

# Calculate the y-axis range for F-K scores
fk_min = avg_scores['Opinion 1 F-K Score'].min()
fk_max = avg_scores['Opinion 1 F-K Score'].max()
fk_range = [fk_min - (fk_max - fk_min) * 0.1, fk_max + (fk_max - fk_min) * 0.1]

# Create line chart for F-K scores
fk_line = alt.Chart(avg_scores).mark_line(strokeWidth=3).encode(
    x=alt.X('Year:O', title='Year'),
    y=alt.Y('Opinion 1 F-K Score:Q', title='Average F-K Score', scale=alt.Scale(domain=fk_range))
)

# Create scatter plot for F-K scores with points
fk_points = alt.Chart(avg_scores).mark_point(size=100).encode(
    x=alt.X('Year:O', title='Year'),
    y=alt.Y('Opinion 1 F-K Score:Q', title='Average F-K Score', scale=alt.Scale(domain=fk_range)),
    tooltip=[
        alt.Tooltip('Year:O', title='Year'),
        alt.Tooltip('Opinion 1 F-K Score:Q', title='Average F-K Score')
    ],
    opacity=alt.condition(nearest, alt.value(1), alt.value(0.5))
).add_params(
    nearest
)

# Combine the line chart and points for F-K chart
fk_chart = alt.layer(fk_line, fk_points).properties(
    title='Average F-K Score per Year',
    width=700,
    height=400
)

# Calculate the y-axis range for FRE scores
fre_min = avg_scores['Opinion 1 FRE Score'].min()
fre_max = avg_scores['Opinion 1 FRE Score'].max()
fre_range = [fre_min - (fre_max - fre_min) * 0.1, fre_max + (fre_max - fre_min) * 0.1]

# Create line chart for FRE scores
fre_line = alt.Chart(avg_scores).mark_line(strokeWidth=3).encode(
    x=alt.X('Year:O', title='Year'),
    y=alt.Y('Opinion 1 FRE Score:Q', title='Average FRE Score', scale=alt.Scale(domain=fre_range))
)

# Create scatter plot for FRE scores with points
fre_points = alt.Chart(avg_scores).mark_point(size=100).encode(
    x=alt.X('Year:O', title='Year'),
    y=alt.Y('Opinion 1 FRE Score:Q', title='Average FRE Score', scale=alt.Scale(domain=fre_range)),
    tooltip=[
        alt.Tooltip('Year:O', title='Year'),
        alt.Tooltip('Opinion 1 FRE Score:Q', title='Average FRE Score')
    ],
    opacity=alt.condition(nearest, alt.value(1), alt.value(0.5))
).add_params(
    nearest
)

# Combine
fre_chart = alt.layer(fre_line, fre_points).properties(
    title='Average FRE Score per Year',
    width=700,
    height=400
)

# Display
fre_chart & fk_chart



In [27]:
### Combine all opinions and authors into respective columns
# Concatenate data for all opinions
opinion_columns = [col for col in df.columns if 'Opinion' in col and 'Score' in col]
author_columns = [col for col in df.columns if 'Opinion' in col and 'Author(s)' in col]
# Ensure the Year column remains throughout this transformation
long_df = pd.DataFrame()
for i in range(1, 10):  # Assuming there are up to 9 opinions
    fre_col = f'Opinion {i} FRE Score'
    fk_col = f'Opinion {i} F-K Score'
    author_col = f'Opinion {i} Author(s)'
    if fre_col in df.columns and fk_col in df.columns and author_col in df.columns:
        temp_df = df[['Year', fre_col, fk_col, author_col, 'Controversial']].copy()
        temp_df.columns = ['Year', 'FRE Score', 'F-K Score', 'Author(s)', 'Controversial']
        temp_df['Opinion'] = f'Opinion {i}'
        long_df = pd.concat([long_df, temp_df], ignore_index=True)


# If multiple authors are attributed to an opinion, only keep the first author
long_df['Author(s)'] = long_df['Author(s)'].str.split(',').str[0]
long_df['Author(s)'] = long_df['Author(s)'].apply(lambda x: x.split(' and ')[0])
#Drop all null authors
long_df = long_df.dropna()
long_df

Unnamed: 0,Year,FRE Score,F-K Score,Author(s),Controversial,Opinion
0,2001,46.713333,1.966667,Justice O'Connor,0,Opinion 1
1,2001,5.292377,1.653846,Justice Ginsberg,0,Opinion 1
2,2001,49.920000,1.366667,Chief Justice Rehnquist,0,Opinion 1
3,2001,5.888333,9.883333,Justice Breyer,0,Opinion 1
4,2001,54.238571,9.314286,Per Curiam,0,Opinion 1
...,...,...,...,...,...,...
10434,2002,47.828750,1.300000,Justice Ginsberg,1,Opinion 7
10791,2007,57.690000,9.145455,Justice Ginsberg,0,Opinion 7
10799,2007,73.370000,6.700000,Justice Alito,1,Opinion 7
10997,2009,46.942222,1.888889,Justice Thomas,0,Opinion 7


In [28]:
distinct_authors = long_df['Author(s)'].unique()

# Aggregate the data to calculate the year range and total opinions for each author
author_summary = (
    long_df.groupby('Author(s)')
    .agg(
        min_year=('Year', 'min'),
        max_year=('Year', 'max'),
        total_count=('Year', 'size'),
        min_score_fre=('FRE Score', 'min'),
        max_score_fre=('FRE Score', 'max'),
        avg_score_fre=('FRE Score', 'mean'),
        min_score_fk=('F-K Score', 'min'),
        max_score_fk=('F-K Score', 'max'),
        avg_score_fk=('F-K Score', 'mean')
    )
    .reset_index()
)

# Add a column for the year band
author_summary['Year_Band'] = author_summary['min_year'].astype(str) + ' - ' + author_summary['max_year'].astype(str)

# Enable full data transformation
alt.data_transformers.enable('default', max_rows=None)

# Create a violin plot for Opinion 1 FRE scores by authors
violin_plot_fre = alt.Chart(long_df).transform_filter(
    alt.FieldOneOfPredicate(field='Author(s)', oneOf=distinct_authors)
).transform_density(
    density='FRE Score',
    as_=['FRE Score', 'density'],
    extent=[long_df['FRE Score'].min(), long_df['FRE Score'].max()],
    groupby=['Author(s)']
).transform_lookup(
    lookup='Author(s)',
    from_=alt.LookupData(author_summary, 'Author(s)', ['Year_Band', 'total_count', 'min_score_fre', 'max_score_fre', 'avg_score_fre'])
).mark_area(orient='horizontal').encode(
    alt.X('density:Q')
        .stack('center')
        .impute(None)
        .title(None)
        .axis(labels=False, values=[0], grid=False, ticks=True),
    alt.Y('FRE Score:Q'),
    alt.Color('Author(s):N', scale=alt.Scale(scheme='category20')),  # Use a color scheme with enough distinct colors
    alt.Column('Author(s):N')
        .spacing(10)
        .header(titleOrient='bottom', labelOrient='bottom', labelPadding=0),
    tooltip=[
        alt.Tooltip('Author(s):N', title='Author'),
        alt.Tooltip('Year_Band:N', title='Years Active'),
        alt.Tooltip('total_count:Q', title='Total Opinions Authored'),
        alt.Tooltip('min_score_fre:Q', title='Min Score'),
        alt.Tooltip('max_score_fre:Q', title='Max Score'),
        alt.Tooltip('avg_score_fre:Q', title='Avg Score')
    ]
).properties(
    title='Violin Plot of FRE Scores by Author',
    width=100,
    height=400
)

# Create a violin plot for Opinion 1 F-K scores by authors
violin_plot_fk = alt.Chart(long_df).transform_filter(
    alt.FieldOneOfPredicate(field='Author(s)', oneOf=distinct_authors)
).transform_density(
    density='F-K Score',
    as_=['F-K Score', 'density'],
    extent=[long_df['F-K Score'].min(), long_df['F-K Score'].max()],
    groupby=['Author(s)']
).transform_lookup(
    lookup='Author(s)',
    from_=alt.LookupData(author_summary, 'Author(s)', ['Year_Band', 'total_count', 'min_score_fk', 'max_score_fk', 'avg_score_fk'])
).mark_area(orient='horizontal').encode(
    alt.X('density:Q')
        .stack('center')
        .impute(None)
        .title(None)
        .axis(labels=False, values=[0], grid=False, ticks=True),
    alt.Y('F-K Score:Q'),
    alt.Color('Author(s):N', scale=alt.Scale(scheme='category20')),  # Use a color scheme with enough distinct colors
    alt.Column('Author(s):N')
        .spacing(10)
        .header(titleOrient='bottom', labelOrient='bottom', labelPadding=0),
    tooltip=[
        alt.Tooltip('Author(s):N', title='Author'),
        alt.Tooltip('Year_Band:N', title='Years Active'),
        alt.Tooltip('total_count:Q', title='Total Opinions Authored'),
        alt.Tooltip('min_score_fk:Q', title='Min Score'),
        alt.Tooltip('max_score_fk:Q', title='Max Score'),
        alt.Tooltip('avg_score_fk:Q', title='Avg Score')
    ]
).properties(
    title='Violin Plot of F-K Scores by Author',
    width=100,
    height=400
)

# Concatenate the violin plots vertically and configure the view at the top level
violin_plots = alt.vconcat(violin_plot_fre, violin_plot_fk).configure_view(
    stroke=None
)

# Display the violin plots
violin_plots

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [29]:
# Calculate summary statistics for Controversial and Non-Controversial cases (FRE)
aggregated_fre = df.groupby(['Controversial'])['Opinion 1 FRE Score'].agg(
    total_count='size',
    min_score='min',
    max_score='max',
    avg_score='mean'
).reset_index()

# Calculate summary statistics for Controversial and Non-Controversial cases (F-K)
aggregated_fk = df.groupby(['Controversial'])['Opinion 1 F-K Score'].agg(
    total_count='size',
    min_score='min',
    max_score='max',
    avg_score='mean'
).reset_index()

# Create a violin plot for Opinion 1 FRE scores by Controversial flag
violin_plot_controversial_fre = alt.Chart(df).transform_density(
    density='Opinion 1 FRE Score',
    as_=['Opinion 1 FRE Score', 'density'],
    extent=[df['Opinion 1 FRE Score'].min(), df['Opinion 1 FRE Score'].max()],
    groupby=['Controversial']
).transform_lookup(
    lookup='Controversial',
    from_=alt.LookupData(aggregated_fre, 'Controversial', ['total_count', 'min_score', 'max_score', 'avg_score'])
).mark_area(orient='horizontal').encode(
    alt.X('density:Q')
        .stack('center')
        .impute(None)
        .title(None)
        .axis(labels=False, values=[0], grid=False, ticks=True),
    alt.Y('Opinion 1 FRE Score:Q'),
    alt.Color('Controversial:N', scale=alt.Scale(domain=[0, 1], range=['#1f77b4', '#d62728'])),  # Blue for 0, Red for 1
    alt.Column('Controversial:N')
        .spacing(10)
        .header(titleOrient='bottom', labelOrient='bottom', labelPadding=0),
    tooltip=[
        alt.Tooltip('Controversial:N', title='Controversial'),
        alt.Tooltip('total_count:Q', title='Total Count'),
        alt.Tooltip('min_score:Q', title='Min Score'),
        alt.Tooltip('max_score:Q', title='Max Score'),
        alt.Tooltip('avg_score:Q', title='Avg Score')
    ]
).properties(
    title='Violin Plot of Opinion 1 FRE Scores by Controversial Flag',
    width=100,
    height=400
)

# Create a violin plot for Opinion 1 F-K scores by Controversial flag
violin_plot_controversial_fk = alt.Chart(df).transform_density(
    density='Opinion 1 F-K Score',
    as_=['Opinion 1 F-K Score', 'density'],
    extent=[df['Opinion 1 F-K Score'].min(), df['Opinion 1 F-K Score'].max()],
    groupby=['Controversial']
).transform_lookup(
    lookup='Controversial',
    from_=alt.LookupData(aggregated_fk, 'Controversial', ['total_count', 'min_score', 'max_score', 'avg_score'])
).mark_area(orient='horizontal').encode(
    alt.X('density:Q')
        .stack('center')
        .impute(None)
        .title(None)
        .axis(labels=False, values=[0], grid=False, ticks=True),
    alt.Y('Opinion 1 F-K Score:Q'),
    alt.Color('Controversial:N', scale=alt.Scale(domain=[0, 1], range=['#1f77b4', '#d62728'])),  # Blue for 0, Red for 1
    alt.Column('Controversial:N')
        .spacing(10)
        .header(titleOrient='bottom', labelOrient='bottom', labelPadding=0),
    tooltip=[
        alt.Tooltip('Controversial:N', title='Controversial'),
        alt.Tooltip('total_count:Q', title='Total Count'),
        alt.Tooltip('min_score:Q', title='Min Score'),
        alt.Tooltip('max_score:Q', title='Max Score'),
        alt.Tooltip('avg_score:Q', title='Avg Score')
    ]
).properties(
    title='Violin Plot of Opinion 1 F-K Scores by Controversial Flag',
    width=100,
    height=400
)

# Concatenate the violin plots for controversial flag vertically and configure the view at the top level
violin_plots_controversial = alt.hconcat(
    violin_plot_controversial_fre, 
    violin_plot_controversial_fk
).configure_view(
    stroke=None
)

# Display both visualizations side by side
violin_plots_controversial.display()

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [30]:
aggregated_fre = long_df.groupby(['Controversial'])['FRE Score'].agg(
    total_count='size',
    min_score='min',
    max_score='max',
    avg_score='mean'
).reset_index()

# Calculate summary statistics for Controversial and Non-Controversial cases (F-K)
aggregated_fk = long_df.groupby(['Controversial'])['F-K Score'].agg(
    total_count='size',
    min_score='min',
    max_score='max',
    avg_score='mean'
).reset_index()

# Create a violin plot for FRE scores by Controversial flag
violin_plot_controversial_fre = alt.Chart(long_df).transform_filter(
    alt.FieldOneOfPredicate(field='Controversial', oneOf=[0, 1])
).transform_density(
    density='FRE Score',
    as_=['FRE Score', 'density'],
    extent=[long_df['FRE Score'].min(), long_df['FRE Score'].max()],
    groupby=['Controversial']
).transform_lookup(
    lookup='Controversial',
    from_=alt.LookupData(aggregated_fre, 'Controversial', ['total_count', 'min_score', 'max_score', 'avg_score'])
).mark_area(orient='horizontal').encode(
    alt.X('density:Q')
        .stack('center')
        .impute(None)
        .title(None)
        .axis(labels=False, values=[0], grid=False, ticks=True),
    alt.Y('FRE Score:Q'),
    alt.Color('Controversial:N',
        scale=alt.Scale(domain=[0, 1], range=['#1f77b4', '#d62728']),  # Blue for 0, Red for 1
        legend=alt.Legend(
            title="Controversy",
            values=[0, 1],
            labelExpr="datum.value == 0 ? 'Not Controversial' : 'Controversial'"
        )
    ),
    alt.Column('Controversial:N')
        .spacing(10)
        .header(titleOrient='bottom', labelOrient='bottom', labelPadding=0),
    tooltip=[
        alt.Tooltip('total_count:Q', title='Total Count'),
        alt.Tooltip('min_score:Q', title='Min Score'),
        alt.Tooltip('max_score:Q', title='Max Score'),
        alt.Tooltip('avg_score:Q', title='Avg Score')
    ]
).properties(
    title='Violin Plot of FRE Scores by Controversial Flag',
    width=100,
    height=400
)

# Create a violin plot for F-K scores by Controversial flag
violin_plot_controversial_fk = alt.Chart(long_df).transform_filter(
    alt.FieldOneOfPredicate(field='Controversial', oneOf=[0, 1])
).transform_density(
    density='F-K Score',
    as_=['F-K Score', 'density'],
    extent=[long_df['F-K Score'].min(), long_df['F-K Score'].max()],
    groupby=['Controversial']
).transform_lookup(
    lookup='Controversial',
    from_=alt.LookupData(aggregated_fk, 'Controversial', ['total_count', 'min_score', 'max_score', 'avg_score'])
).mark_area(orient='horizontal').encode(
    alt.X('density:Q')
        .stack('center')
        .impute(None)
        .title(None)
        .axis(labels=False, values=[0], grid=False, ticks=True),
    alt.Y('F-K Score:Q'),
    alt.Color('Controversial:N',
        scale=alt.Scale(domain=[0, 1], range=['#1f77b4', '#d62728']),  # Blue for 0, Red for 1
        legend=alt.Legend(
            title="Controversy",
            values=[0, 1],
            labelExpr="datum.value == 0 ? 'Not Controversial' : 'Controversial'"
        )
    ),
    alt.Column('Controversial:N')
        .spacing(10)
        .header(titleOrient='bottom', labelOrient='bottom', labelPadding=0),
    tooltip=[
        alt.Tooltip('total_count:Q', title='Total Count'),
        alt.Tooltip('min_score:Q', title='Min Score'),
        alt.Tooltip('max_score:Q', title='Max Score'),
        alt.Tooltip('avg_score:Q', title='Avg Score')
    ]
).properties(
    title='Violin Plot of F-K Scores by Controversial Flag',
    width=100,
    height=400
)

# Concatenate the violin plots for controversial flag vertically and configure the view at the top level
violin_plots_controversial = alt.hconcat(
    violin_plot_controversial_fre, 
    violin_plot_controversial_fk
).configure_view(
    stroke=None
)

# Display both visualizations side by side
violin_plots_controversial.display()

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [31]:
# Aggregate the data to get the total case count for each author, split by controversial and non-controversial cases
author_case_stats = long_df.groupby(['Author(s)', 'Controversial']).agg(
    case_count=('F-K Score', 'size'),
    mean_fk_score=('F-K Score', 'mean'),
    min_fk_score=('F-K Score', 'min'),
    max_fk_score=('F-K Score', 'max'),
    mean_fre_score=('FRE Score', 'mean'),
    min_fre_score=('FRE Score', 'min'),
    max_fre_score=('FRE Score', 'max')
).reset_index()

# Create the stacked bar chart
stacked_bar_chart = alt.Chart(author_case_stats).mark_bar().encode(
    x=alt.X('Author(s):N', title='Author', sort=alt.EncodingSortField(field='case_count', op='sum', order='descending')),
    y=alt.Y('case_count:Q', title='Total Case Count'),
    color=alt.Color('Controversial:N', title='Controversial', scale=alt.Scale(domain=[0, 1], range=['#1f77b4', '#d62728']),
        legend=alt.Legend(
            title="Controversy",
            values=[0, 1],
            labelExpr="datum.value == 0 ? 'Not Controversial' : 'Controversial'"
        )
    ),
    tooltip=[
        alt.Tooltip('Author(s):N', title='Author'),
        alt.Tooltip('case_count:Q', title='Case Count'),
        alt.Tooltip('Controversial:N', title='Controversial'),
        alt.Tooltip('mean_fk_score:Q', title='Mean F-K Score'),
        alt.Tooltip('min_fk_score:Q', title='Min F-K Score'),
        alt.Tooltip('max_fk_score:Q', title='Max F-K Score'),
        alt.Tooltip('mean_fre_score:Q', title='Mean FRE Score'),
        alt.Tooltip('min_fre_score:Q', title='Min FRE Score'),
        alt.Tooltip('max_fre_score:Q', title='Max FRE Score')
    ]
).properties(
    title='Total Case Count by Author (Stacked by Controversial and Non-Controversial Cases)',
    width=800,
    height=400
).configure_axis(
    labelAngle=-45
)

# Display the chart
stacked_bar_chart.display()

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [58]:
#Calculate average FRE and F-K scores per year per author
single_avg = (
    long_df.groupby(['Year', 'Author(s)'])
    .agg(
        FRE_Score=('FRE Score', 'mean'),
        FK_Score=('F-K Score', 'mean')
    )
    .reset_index()
)

#Tooltip selection for nearest
nearest = alt.selection(type='single', nearest=True, on='mouseover', fields=['Year'], empty='none')

#Line chart for FRE scores over time by author
line_chart_fre = alt.Chart(single_avg).mark_line().encode(
    x=alt.X('Year:O', title='Year'),
    y=alt.Y('FRE_Score:Q', title='FRE Score', scale=alt.Scale(domain=[35, 75])),
    color=alt.Color('Author(s):N', scale=alt.Scale(scheme='category20'))
).properties(
    title='Line Chart of FRE Scores by Author Over Time',
    width=600,
    height=400
)

#Points and tooltips for FRE chart
fre_points = line_chart_fre.mark_point().encode(
    tooltip=['Year', 'FRE_Score', 'Author(s)']
).add_selection(nearest)

fre_chart_with_tooltip = line_chart_fre + fre_points

#Line chart for F-K scores over time by author
line_chart_fk = alt.Chart(single_avg).mark_line().encode(
    x=alt.X('Year:O', title='Year'),
    y=alt.Y('FK_Score:Q', title='F-K Score', scale=alt.Scale(domain=[4, 12])),
    color=alt.Color('Author(s):N', scale=alt.Scale(scheme='category20'))
).properties(
    title='Line Chart of F-K Scores by Author Over Time',
    width=600,
    height=400
)

#Points and tooltips for F-K chart
fk_points = line_chart_fk.mark_point().encode(
    tooltip=['Year', 'FK_Score', 'Author(s)']
).add_selection(nearest)

fk_chart_with_tooltip = line_chart_fk + fk_points

#Combine
line_charts = alt.vconcat(fre_chart_with_tooltip, fk_chart_with_tooltip).configure_view(
    stroke=None
)

#Display
line_charts

   Use 'selection_point()' or 'selection_interval()' instead; these functions also include more helpful docstrings.
        combined and should be specified using "selection_point()".
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
