In [81]:
import pandas as pd

# Read the Excel file into a DataFrame
df = pd.read_excel('wrangled_reading_score_data.xlsx')

# Identify all columns that contain "Opinion" but not "Author" in their names
opinion_columns = [col for col in df.columns if 'Opinion' in col and 'Author' not in col and 'Type' not in col]
author_columns = [col for col in df.columns if 'Author' in col]

# Convert these columns to floats
for col in opinion_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')
# Convert author columns to strings
for col in author_columns:
    df[col] = df[col].astype(str)
    df[col] = df[col].str.replace('’', "'")
    df[col] = df[col].str.replace(r'\bGinsbur\b', 'Ginsburg', regex=True)

# Convert the remaining score columns to floats
df['Syllabus F-K Score'] = pd.to_numeric(df['Syllabus F-K Score'], errors='coerce')
df['Syllabus FRE Score'] = pd.to_numeric(df['Syllabus FRE Score'], errors='coerce')

#Strip whitespace from author columns
for col in author_columns:
    df[col] = df[col].str.replace('Justuce', 'Justice').str.strip()
for col in author_columns:
    df[col] = df[col].str.replace('Justicw', 'Justice').str.strip()
for col in author_columns:
    df[col] = df[col].str.replace('Ginsburg', 'Ginsberg').str.strip()
df.head()

Unnamed: 0,Case,Year,Syllabus FRE Score,Syllabus F-K Score,Opinion 1 FRE Score,Opinion 1 F-K Score,Opinion 1 Author(s),Opinion 2 FRE Score,Opinion 2 F-K Score,Opinion 2 Author(s),...,Opinion 7 Author(s),Opinion 7 Type,Opinion 8 FRE Score,Opinion 8 F-K Score,Opinion 8 Author(s),Opinion 8 Type,Opinion 9 FRE Score,Opinion 9 F-K Score,Opinion 9 Author(s),Opinion 9 Type
0,ARTHUR ANDERSEN LLP v. UNITED STATES,2004,46.47,1.85,49.614545,10.0,Chief Justice Rehnquist,,,,...,,,,,,,,,,
1,"NIJHAWAN v. HOLDER, ATTORNEY GENERAL",2008,5.713333,9.866667,56.671818,10.0,Justice Breyer,,,,...,,,,,,,,,,
2,FLORIDA v. HARRIS,2012,48.273333,1.833333,6.491818,9.999999,Justice Kagan,,,,...,,,,,,,,,,
3,"VIRGINIA URANIUM, INC., et al. v. WARREN et al.",2018,53.8125,1.5,54.4375,9.99375,Justice Gorsuch,52.328333,9.791667,Justice Ginsberg,...,,,,,,,,,,
4,"MERIT MANAGEMENT GROUP, LP v. FTI_x000D_\nCONS...",2017,57.533333,1.733333,58.263333,9.988889,Justice Sotomayor,,,,...,,,,,,,,,,


In [82]:
# Identify all columns that contain "Opinion" and "Type" in their names
opinion_type_columns = [col for col in df.columns if 'Opinion' in col and 'Type' in col]
for col in opinion_type_columns:
    df[col] = df[col].astype(str)
df['Opinion 2 Type'].unique()

# Create the "Controversial" flag
df['Controversial'] = df[opinion_type_columns].apply(lambda row: (row == 'dissenting').sum() >= 2, axis=1).astype(int)
df[df['Controversial'] == 1]

Unnamed: 0,Case,Year,Syllabus FRE Score,Syllabus F-K Score,Opinion 1 FRE Score,Opinion 1 F-K Score,Opinion 1 Author(s),Opinion 2 FRE Score,Opinion 2 F-K Score,Opinion 2 Author(s),...,Opinion 7 Type,Opinion 8 FRE Score,Opinion 8 F-K Score,Opinion 8 Author(s),Opinion 8 Type,Opinion 9 FRE Score,Opinion 9 F-K Score,Opinion 9 Author(s),Opinion 9 Type,Controversial
13,"MUTUAL PHARMACEUTICAL CO., INC. v. BARTLETT",2012,57.636667,8.600000,54.689474,9.978947,Justice Alito,58.416667,8.300000,Justice Breyer,...,,,,,,,,,,1
28,MONTGOMERY v. LOUISIANA,2015,54.600000,9.475000,54.394000,9.955000,Justice Kennedy,55.125000,9.878571,Justice Scalia,...,,,,,,,,,,1
36,CARPENTER v. UNITED STATES,2017,53.250000,9.775000,53.880000,9.947619,Chief Justice Roberts,58.421948,9.300000,Justice Kennedy,...,,,,,,,,,,1
37,"CARSON, as parent and next friend of O. C., et...",2021,59.375000,8.616667,54.137588,9.947588,Chief Justice Roberts,56.276111,9.355556,Justice Breyer,...,,,,,,,,,,1
40,PAROLINE v. UNITED STATES et al.,2013,55.864000,1.540000,58.213333,9.945833,Justice Kennedy,59.810000,9.611111,Chief Justice Roberts,...,,,,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,"MONCRIEFFE v. HOLDER, ATTORNEY GENERAL",2012,46.773333,1.733333,53.871500,1.160000,Justice Sotomayor,63.833333,8.300000,Justice Thomas,...,,,,,,,,,,1
1679,ALABAMA LEGISLATIVE BLACK CAUCUS et al. v._x00...,2014,48.130000,11.200000,53.380000,1.160000,Justice Breyer,49.528333,1.858333,Justice Scalia,...,,,,,,,,,,1
1688,MURR et al. v. WISCONSIN et al.,2016,51.175000,1.625000,53.895263,1.142153,Justice Kennedy,54.897692,9.976924,Chief Justice Roberts,...,,,,,,,,,,1
1693,"HAMDAN v. RUMSFELD, SECRETARY OF DEFENSE,_x000...",2005,67.423333,9.233333,64.320000,1.124286,Justice Stevens,72.160000,7.200000,Justice Breyer,...,,,,,,,,,,1


In [83]:
unique_justices_opinion1 = df['Opinion 1 Author(s)'].nunique()
unique_justices_opinion1


19

In [84]:
cases_per_year = df.groupby('Year').size()
cases_per_year

dobbs_case = df[df['Case'].str.contains('dobbs', case=False, na=False)]
dobbs_case

Unnamed: 0,Case,Year,Syllabus FRE Score,Syllabus F-K Score,Opinion 1 FRE Score,Opinion 1 F-K Score,Opinion 1 Author(s),Opinion 2 FRE Score,Opinion 2 F-K Score,Opinion 2 Author(s),...,Opinion 7 Type,Opinion 8 FRE Score,Opinion 8 F-K Score,Opinion 8 Author(s),Opinion 8 Type,Opinion 9 FRE Score,Opinion 9 F-K Score,Opinion 9 Author(s),Opinion 9 Type,Controversial
780,"DOBBS, STATE HEALTH OFFICER OF THE MISSISSIPPI...",2021,61.967692,8.538462,6.768533,8.729333,Justice Alito,61.896667,7.666667,Justice Thomas,...,,,,,,,,,,0


In [85]:
### Combine all opinions and authors into respective columns
# Concatenate data for all opinions
opinion_columns = [col for col in df.columns if 'Opinion' in col and 'Score' in col]
author_columns = [col for col in df.columns if 'Opinion' in col and 'Author(s)' in col]
# Ensure the Year column remains throughout this transformation
long_df = pd.DataFrame()
for i in range(1, 10):  # Assuming there are up to 9 opinions
    fre_col = f'Opinion {i} FRE Score'
    fk_col = f'Opinion {i} F-K Score'
    author_col = f'Opinion {i} Author(s)'
    if fre_col in df.columns and fk_col in df.columns and author_col in df.columns:
        temp_df = df[['Year', fre_col, fk_col, author_col, 'Controversial']].copy()
        temp_df.columns = ['Year', 'FRE Score', 'F-K Score', 'Author(s)', 'Controversial']
        temp_df['Opinion'] = f'Opinion {i}'
        long_df = pd.concat([long_df, temp_df], ignore_index=True)


# If multiple authors are attributed to an opinion, only keep the first author
long_df['Author(s)'] = long_df['Author(s)'].str.split(',').str[0]
long_df['Author(s)'] = long_df['Author(s)'].apply(lambda x: x.split(' and ')[0])
#Drop all null authors
long_df = long_df.dropna()
long_df

Unnamed: 0,Year,FRE Score,F-K Score,Author(s),Controversial,Opinion
0,2004,49.614545,10.000000,Chief Justice Rehnquist,0,Opinion 1
1,2008,56.671818,10.000000,Justice Breyer,0,Opinion 1
2,2012,6.491818,9.999999,Justice Kagan,0,Opinion 1
3,2018,54.437500,9.993750,Justice Gorsuch,0,Opinion 1
4,2017,58.263333,9.988889,Justice Sotomayor,0,Opinion 1
...,...,...,...,...,...,...
10789,2018,6.917648,8.823529,Justice Ginsberg,0,Opinion 7
11510,2002,47.828750,1.300000,Justice Ginsberg,1,Opinion 7
11644,2009,46.942222,1.888889,Justice Thomas,0,Opinion 7
11941,2007,73.370000,6.700000,Justice Alito,1,Opinion 7


In [86]:
# Create line chart for F-K scores
import pandas as pd
import altair as alt

avg_scores = long_df.groupby('Year').agg({'F-K Score': 'mean', 'FRE Score': 'mean'}).reset_index()

# Create the nearest selection
nearest = alt.selection_point(on='mouseover', nearest=True, empty='none', fields=['Year'])

# Calculate the y-axis range for F-K scores
fk_min = avg_scores['F-K Score'].min()
fk_max = avg_scores['F-K Score'].max()
fk_range = [fk_min - (fk_max - fk_min) * 0.1, fk_max + (fk_max - fk_min) * 0.1]

# Create line chart for F-K scores
fk_line = alt.Chart(avg_scores).mark_line(strokeWidth=3).encode(
    x=alt.X('Year:O', title='Year'),
    y=alt.Y('F-K Score:Q', title='Average F-K Score', scale=alt.Scale(domain=fk_range))
)

# Create scatter plot for F-K scores with points
fk_points = alt.Chart(avg_scores).mark_point(size=100).encode(
    x=alt.X('Year:O', title='Year'),
    y=alt.Y('F-K Score:Q', title='Average F-K Score', scale=alt.Scale(domain=fk_range)),
    tooltip=[
        alt.Tooltip('Year:O', title='Year'),
        alt.Tooltip('F-K Score:Q', title='Average F-K Score')
    ],
    opacity=alt.condition(nearest, alt.value(1), alt.value(0.5))
).add_params(
    nearest
)

# Combine the line chart and points for F-K chart
fk_chart = alt.layer(fk_line, fk_points).properties(
    title='Average F-K Score per Year',
    width=700,
    height=400
)

# Calculate the y-axis range for FRE scores
fre_min = avg_scores['FRE Score'].min()
fre_max = avg_scores['FRE Score'].max()
fre_range = [fre_min - (fre_max - fre_min) * 0.1, fre_max + (fre_max - fre_min) * 0.1]

# Create line chart for FRE scores
fre_line = alt.Chart(avg_scores).mark_line(strokeWidth=3).encode(
    x=alt.X('Year:O', title='Year'),
    y=alt.Y('FRE Score:Q', title='Average FRE Score', scale=alt.Scale(domain=fre_range))
)

# Create scatter plot for FRE scores with points
fre_points = alt.Chart(avg_scores).mark_point(size=100).encode(
    x=alt.X('Year:O', title='Year'),
    y=alt.Y('FRE Score:Q', title='Average FRE Score', scale=alt.Scale(domain=fre_range)),
    tooltip=[
        alt.Tooltip('Year:O', title='Year'),
        alt.Tooltip('FRE Score:Q', title='Average FRE Score')
    ],
    opacity=alt.condition(nearest, alt.value(1), alt.value(0.5))
).add_params(
    nearest
)

# Combine
fre_chart = alt.layer(fre_line, fre_points).properties(
    title='Average FRE Score per Year',
    width=700,
    height=400
)

# Create an annotation for the year 2005
annotation_fk = alt.Chart(pd.DataFrame({'Year': [2005], 'F-K Score': [avg_scores.loc[avg_scores['Year'] == 2005, 'F-K Score'].values[0]]})).mark_text(
    align='left', dx=-300, dy=0, fontSize=12, fontWeight='bold', text='Chief Justice change'
)

annotation_fre = alt.Chart(pd.DataFrame({'Year': [2005], 'FRE Score': [avg_scores.loc[avg_scores['Year'] == 2005, 'FRE Score'].values[0]]})).mark_text(
    align='left', dx=-300, dy=-185, fontSize=12, fontWeight='bold', text='Chief Justice change'
)

# Add an arrow pointing to 2005 in the F-K chart
arrow_fk = alt.Chart(pd.DataFrame({'Year': [2005], 'F-K Score': [avg_scores.loc[avg_scores['Year'] == 2005, 'F-K Score'].values[0]]})).mark_rule(color='red').encode(
    x='Year:O',
    y='F-K Score:Q'
)

# Add an arrow pointing to 2005 in the FRE chart
arrow_fre = alt.Chart(pd.DataFrame({'Year': [2005], 'FRE Score': [avg_scores.loc[avg_scores['Year'] == 2005, 'FRE Score'].values[0]]})).mark_rule(color='red').encode(
    x='Year:O',
    y='FRE Score:Q'
)

# Combine the annotation and arrows with the existing charts
fre_chart = alt.layer(fre_line, fre_points, arrow_fre, annotation_fre).properties(
    title='Average FRE Score per Year',
    width=700,
    height=400
)

fk_chart = alt.layer(fk_line, fk_points, arrow_fk, annotation_fk).properties(
    title='Average F-K Score per Year',
    width=700,
    height=400
)

# Display
fre_chart & fk_chart


In [87]:
distinct_authors = long_df['Author(s)'].unique()

# Aggregate the data to calculate the year range and total opinions for each author
author_summary = (
    long_df.groupby('Author(s)')
    .agg(
        min_year=('Year', 'min'),
        max_year=('Year', 'max'),
        total_count=('Year', 'size'),
        min_score_fre=('FRE Score', 'min'),
        max_score_fre=('FRE Score', 'max'),
        avg_score_fre=('FRE Score', 'mean'),
        min_score_fk=('F-K Score', 'min'),
        max_score_fk=('F-K Score', 'max'),
        avg_score_fk=('F-K Score', 'mean')
    )
    .reset_index()
)

# Add a column for the year band
author_summary['Year_Band'] = author_summary['min_year'].astype(str) + ' - ' + author_summary['max_year'].astype(str)

# Enable full data transformation
alt.data_transformers.enable('default', max_rows=None)

# Create a violin plot for Opinion 1 FRE scores by authors
violin_plot_fre = alt.Chart(long_df).transform_filter(
    alt.FieldOneOfPredicate(field='Author(s)', oneOf=distinct_authors)
).transform_density(
    density='FRE Score',
    as_=['FRE Score', 'density'],
    extent=[long_df['FRE Score'].min(), long_df['FRE Score'].max()],
    groupby=['Author(s)']
).transform_lookup(
    lookup='Author(s)',
    from_=alt.LookupData(author_summary, 'Author(s)', ['Year_Band', 'total_count', 'min_score_fre', 'max_score_fre', 'avg_score_fre'])
).mark_area(orient='horizontal').encode(
    alt.X('density:Q')
        .stack('center')
        .impute(None)
        .title(None)
        .axis(labels=False, values=[0], grid=False, ticks=True),
    alt.Y('FRE Score:Q'),
    alt.Color('Author(s):N', scale=alt.Scale(scheme='category20')),  # Use a color scheme with enough distinct colors
    alt.Column('Author(s):N')
        .spacing(10)
        .header(titleOrient='bottom', labelOrient='bottom', labelPadding=0),
    tooltip=[
        alt.Tooltip('Author(s):N', title='Author'),
        alt.Tooltip('Year_Band:N', title='Years Active'),
        alt.Tooltip('total_count:Q', title='Total Opinions Authored'),
        alt.Tooltip('min_score_fre:Q', title='Min Score'),
        alt.Tooltip('max_score_fre:Q', title='Max Score'),
        alt.Tooltip('avg_score_fre:Q', title='Avg Score')
    ]
).properties(
    title='Violin Plot of FRE Scores by Author',
    width=100,
    height=400
)

# Create a violin plot for Opinion 1 F-K scores by authors
violin_plot_fk = alt.Chart(long_df).transform_filter(
    alt.FieldOneOfPredicate(field='Author(s)', oneOf=distinct_authors)
).transform_density(
    density='F-K Score',
    as_=['F-K Score', 'density'],
    extent=[long_df['F-K Score'].min(), long_df['F-K Score'].max()],
    groupby=['Author(s)']
).transform_lookup(
    lookup='Author(s)',
    from_=alt.LookupData(author_summary, 'Author(s)', ['Year_Band', 'total_count', 'min_score_fk', 'max_score_fk', 'avg_score_fk'])
).mark_area(orient='horizontal').encode(
    alt.X('density:Q')
        .stack('center')
        .impute(None)
        .title(None)
        .axis(labels=False, values=[0], grid=False, ticks=True),
    alt.Y('F-K Score:Q'),
    alt.Color('Author(s):N', scale=alt.Scale(scheme='category20')),  # Use a color scheme with enough distinct colors
    alt.Column('Author(s):N')
        .spacing(10)
        .header(titleOrient='bottom', labelOrient='bottom', labelPadding=0),
    tooltip=[
        alt.Tooltip('Author(s):N', title='Author'),
        alt.Tooltip('Year_Band:N', title='Years Active'),
        alt.Tooltip('total_count:Q', title='Total Opinions Authored'),
        alt.Tooltip('min_score_fk:Q', title='Min Score'),
        alt.Tooltip('max_score_fk:Q', title='Max Score'),
        alt.Tooltip('avg_score_fk:Q', title='Avg Score')
    ]
).properties(
    title='Violin Plot of F-K Scores by Author',
    width=100,
    height=400
)

# Concatenate the violin plots vertically and configure the view at the top level
violin_plots = alt.vconcat(violin_plot_fre, violin_plot_fk).configure_view(
    stroke=None
)

# Display the violin plots
violin_plots

In [88]:
# Calculate summary statistics for Controversial and Non-Controversial cases (FRE)
aggregated_fre = df.groupby(['Controversial'])['Opinion 1 FRE Score'].agg(
    total_count='size',
    min_score='min',
    max_score='max',
    avg_score='mean'
).reset_index()

# Calculate summary statistics for Controversial and Non-Controversial cases (F-K)
aggregated_fk = df.groupby(['Controversial'])['Opinion 1 F-K Score'].agg(
    total_count='size',
    min_score='min',
    max_score='max',
    avg_score='mean'
).reset_index()

# Create a violin plot for Opinion 1 FRE scores by Controversial flag
violin_plot_controversial_fre = alt.Chart(df).transform_density(
    density='Opinion 1 FRE Score',
    as_=['Opinion 1 FRE Score', 'density'],
    extent=[df['Opinion 1 FRE Score'].min(), df['Opinion 1 FRE Score'].max()],
    groupby=['Controversial']
).transform_lookup(
    lookup='Controversial',
    from_=alt.LookupData(aggregated_fre, 'Controversial', ['total_count', 'min_score', 'max_score', 'avg_score'])
).mark_area(orient='horizontal').encode(
    alt.X('density:Q')
        .stack('center')
        .impute(None)
        .title(None)
        .axis(labels=False, values=[0], grid=False, ticks=True),
    alt.Y('Opinion 1 FRE Score:Q'),
    alt.Color('Controversial:N', scale=alt.Scale(domain=[0, 1], range=['#1f77b4', '#d62728'])),  # Blue for 0, Red for 1
    alt.Column('Controversial:N')
        .spacing(10)
        .header(titleOrient='bottom', labelOrient='bottom', labelPadding=0),
    tooltip=[
        alt.Tooltip('Controversial:N', title='Controversial'),
        alt.Tooltip('total_count:Q', title='Total Count'),
        alt.Tooltip('min_score:Q', title='Min Score'),
        alt.Tooltip('max_score:Q', title='Max Score'),
        alt.Tooltip('avg_score:Q', title='Avg Score')
    ]
).properties(
    title='Violin Plot of Opinion 1 FRE Scores by Controversial Flag',
    width=100,
    height=400
)

# Create a violin plot for Opinion 1 F-K scores by Controversial flag
violin_plot_controversial_fk = alt.Chart(df).transform_density(
    density='Opinion 1 F-K Score',
    as_=['Opinion 1 F-K Score', 'density'],
    extent=[df['Opinion 1 F-K Score'].min(), df['Opinion 1 F-K Score'].max()],
    groupby=['Controversial']
).transform_lookup(
    lookup='Controversial',
    from_=alt.LookupData(aggregated_fk, 'Controversial', ['total_count', 'min_score', 'max_score', 'avg_score'])
).mark_area(orient='horizontal').encode(
    alt.X('density:Q')
        .stack('center')
        .impute(None)
        .title(None)
        .axis(labels=False, values=[0], grid=False, ticks=True),
    alt.Y('Opinion 1 F-K Score:Q'),
    alt.Color('Controversial:N', scale=alt.Scale(domain=[0, 1], range=['#1f77b4', '#d62728'])),  # Blue for 0, Red for 1
    alt.Column('Controversial:N')
        .spacing(10)
        .header(titleOrient='bottom', labelOrient='bottom', labelPadding=0),
    tooltip=[
        alt.Tooltip('Controversial:N', title='Controversial'),
        alt.Tooltip('total_count:Q', title='Total Count'),
        alt.Tooltip('min_score:Q', title='Min Score'),
        alt.Tooltip('max_score:Q', title='Max Score'),
        alt.Tooltip('avg_score:Q', title='Avg Score')
    ]
).properties(
    title='Violin Plot of Opinion 1 F-K Scores by Controversial Flag',
    width=100,
    height=400
)

# Concatenate the violin plots for controversial flag vertically and configure the view at the top level
violin_plots_controversial = alt.hconcat(
    violin_plot_controversial_fre, 
    violin_plot_controversial_fk
).configure_view(
    stroke=None
)

# Display both visualizations side by side
violin_plots_controversial.display()

In [89]:
aggregated_fre = long_df.groupby(['Controversial'])['FRE Score'].agg(
    total_count='size',
    min_score='min',
    max_score='max',
    avg_score='mean'
).reset_index()

# Calculate summary statistics for Controversial and Non-Controversial cases (F-K)
aggregated_fk = long_df.groupby(['Controversial'])['F-K Score'].agg(
    total_count='size',
    min_score='min',
    max_score='max',
    avg_score='mean'
).reset_index()

# Create a violin plot for FRE scores by Controversial flag
violin_plot_controversial_fre = alt.Chart(long_df).transform_filter(
    alt.FieldOneOfPredicate(field='Controversial', oneOf=[0, 1])
).transform_density(
    density='FRE Score',
    as_=['FRE Score', 'density'],
    extent=[long_df['FRE Score'].min(), long_df['FRE Score'].max()],
    groupby=['Controversial']
).transform_lookup(
    lookup='Controversial',
    from_=alt.LookupData(aggregated_fre, 'Controversial', ['total_count', 'min_score', 'max_score', 'avg_score'])
).mark_area(orient='horizontal').encode(
    alt.X('density:Q')
        .stack('center')
        .impute(None)
        .title(None)
        .axis(labels=False, values=[0], grid=False, ticks=True),
    alt.Y('FRE Score:Q'),
    alt.Color('Controversial:N',
        scale=alt.Scale(domain=[0, 1], range=['#1f77b4', '#d62728']),  # Blue for 0, Red for 1
        legend=alt.Legend(
            title="Controversy",
            values=[0, 1],
            labelExpr="datum.value == 0 ? 'Not Controversial' : 'Controversial'"
        )
    ),
    alt.Column('Controversial:N')
        .spacing(10)
        .header(titleOrient='bottom', labelOrient='bottom', labelPadding=0),
    tooltip=[
        alt.Tooltip('total_count:Q', title='Total Count'),
        alt.Tooltip('min_score:Q', title='Min Score'),
        alt.Tooltip('max_score:Q', title='Max Score'),
        alt.Tooltip('avg_score:Q', title='Avg Score')
    ]
).properties(
    title='Violin Plot of FRE Scores by Controversial Flag',
    width=100,
    height=400
)

# Create a violin plot for F-K scores by Controversial flag
violin_plot_controversial_fk = alt.Chart(long_df).transform_filter(
    alt.FieldOneOfPredicate(field='Controversial', oneOf=[0, 1])
).transform_density(
    density='F-K Score',
    as_=['F-K Score', 'density'],
    extent=[long_df['F-K Score'].min(), long_df['F-K Score'].max()],
    groupby=['Controversial']
).transform_lookup(
    lookup='Controversial',
    from_=alt.LookupData(aggregated_fk, 'Controversial', ['total_count', 'min_score', 'max_score', 'avg_score'])
).mark_area(orient='horizontal').encode(
    alt.X('density:Q')
        .stack('center')
        .impute(None)
        .title(None)
        .axis(labels=False, values=[0], grid=False, ticks=True),
    alt.Y('F-K Score:Q'),
    alt.Color('Controversial:N',
        scale=alt.Scale(domain=[0, 1], range=['#1f77b4', '#d62728']),  # Blue for 0, Red for 1
        legend=alt.Legend(
            title="Controversy",
            values=[0, 1],
            labelExpr="datum.value == 0 ? 'Not Controversial' : 'Controversial'"
        )
    ),
    alt.Column('Controversial:N')
        .spacing(10)
        .header(titleOrient='bottom', labelOrient='bottom', labelPadding=0),
    tooltip=[
        alt.Tooltip('total_count:Q', title='Total Count'),
        alt.Tooltip('min_score:Q', title='Min Score'),
        alt.Tooltip('max_score:Q', title='Max Score'),
        alt.Tooltip('avg_score:Q', title='Avg Score')
    ]
).properties(
    title='Violin Plot of F-K Scores by Controversial Flag',
    width=100,
    height=400
)

# Concatenate the violin plots for controversial flag vertically and configure the view at the top level
violin_plots_controversial = alt.hconcat(
    violin_plot_controversial_fre, 
    violin_plot_controversial_fk
).configure_view(
    stroke=None
)

# Display both visualizations side by side
violin_plots_controversial.display()
# Check if opinions are really simple or not (The little bubble at the bottom)

In [90]:
# Aggregate the data to get the total case count for each author, split by controversial and non-controversial cases
author_case_stats = long_df.groupby(['Author(s)', 'Controversial']).agg(
    case_count=('F-K Score', 'size'),
    mean_fk_score=('F-K Score', 'mean'),
    min_fk_score=('F-K Score', 'min'),
    max_fk_score=('F-K Score', 'max'),
    mean_fre_score=('FRE Score', 'mean'),
    min_fre_score=('FRE Score', 'min'),
    max_fre_score=('FRE Score', 'max')
).reset_index()

# Create the stacked bar chart
stacked_bar_chart = alt.Chart(author_case_stats).mark_bar().encode(
    x=alt.X('Author(s):N', title='Author', sort=alt.EncodingSortField(field='case_count', op='sum', order='descending')),
    y=alt.Y('case_count:Q', title='Total Case Count'),
    color=alt.Color('Controversial:N', title='Controversial', scale=alt.Scale(domain=[0, 1], range=['#1f77b4', '#d62728']),
        legend=alt.Legend(
            title="Controversy",
            values=[0, 1],
            labelExpr="datum.value == 0 ? 'Not Controversial' : 'Controversial'"
        )
    ),
    tooltip=[
        alt.Tooltip('Author(s):N', title='Author'),
        alt.Tooltip('case_count:Q', title='Case Count'),
        alt.Tooltip('Controversial:N', title='Controversial'),
        alt.Tooltip('mean_fk_score:Q', title='Mean F-K Score'),
        alt.Tooltip('min_fk_score:Q', title='Min F-K Score'),
        alt.Tooltip('max_fk_score:Q', title='Max F-K Score'),
        alt.Tooltip('mean_fre_score:Q', title='Mean FRE Score'),
        alt.Tooltip('min_fre_score:Q', title='Min FRE Score'),
        alt.Tooltip('max_fre_score:Q', title='Max FRE Score')
    ]
).properties(
    title='Total Case Count by Author (Stacked by Controversial and Non-Controversial Cases)',
    width=800,
    height=400
).configure_axis(
    labelAngle=-45
)

# Display the chart
stacked_bar_chart.display()

In [91]:
#Average FRE and F-K scores per year per author
single_avg = (
    long_df.groupby(['Year', 'Author(s)'])
    .agg(
        FRE_Score=('FRE Score', 'mean'),
        FK_Score=('F-K Score', 'mean')
    )
    .reset_index()
)

justice_selection = alt.selection_point(fields=['Author(s)'])

#Nearest tooltip
nearest = alt.selection(
    type='single',
    nearest=True,
    on='mouseover',
    fields=['Year', 'Author(s)'],
    empty='none'
)

#Highlight
highlight_color = alt.condition(
    justice_selection,
    alt.Color('Author(s):N', legend=None, scale=alt.Scale(scheme='category10')),
    alt.value('lightgray')
)

#FRE Chart
line_chart_fre = alt.Chart(single_avg).mark_line().encode(
    x=alt.X('Year:O', title='Year'),
    y=alt.Y('FRE_Score:Q', title='FRE Score', scale=alt.Scale(domain=[35, 75])),
    color=highlight_color
).properties(
    title='Line Chart of FRE Scores by Author Over Time',
    width=600,
    height=400
).add_params(justice_selection)

#FRE tooltips
fre_tooltip = alt.Chart(single_avg).mark_point(size=50, opacity=0).encode(
    x='Year:O',
    y='FRE_Score:Q',
    tooltip=[
        'Year:O', 
        'FRE_Score:Q', 
        'Author(s):N'
    ]
).add_params(nearest)

fre_combined_chart = line_chart_fre + fre_tooltip

#FRE legend
fre_legend = alt.Chart(single_avg).mark_point().encode(
    y=alt.Y('Author(s):N', axis=alt.Axis(title='Author(s)', labelLimit=100)),
    color=highlight_color
).properties(
    width=50
).add_params(justice_selection)

#F-K Chart
line_chart_fk = alt.Chart(single_avg).mark_line().encode(
    x=alt.X('Year:O', title='Year'),
    y=alt.Y('FK_Score:Q', title='F-K Score', scale=alt.Scale(domain=[4, 12])),
    color=highlight_color
).properties(
    title='Line Chart of F-K Scores by Author Over Time',
    width=600,
    height=400
).add_params(justice_selection)

#F-K tooltip
fk_tooltip = alt.Chart(single_avg).mark_point(size=50, opacity=0).encode(
    x='Year:O',
    y='FK_Score:Q',
    tooltip=[
        'Year:O', 
        'FK_Score:Q', 
        'Author(s):N'
    ]
).add_params(nearest)

fk_combined_chart = line_chart_fk + fk_tooltip

#F-K legend
fk_legend = alt.Chart(single_avg).mark_point().encode(
    y=alt.Y('Author(s):N', axis=alt.Axis(title='Author(s)', labelLimit=100)),
    color=highlight_color
).properties(
    width=50
).add_params(justice_selection)

#Combine
fre_combined = alt.hconcat(fre_combined_chart, fre_legend).resolve_scale(color='independent')
fk_combined = alt.hconcat(fk_combined_chart, fk_legend).resolve_scale(color='independent')

line_charts = alt.vconcat(fre_combined, fk_combined).configure_view(
    stroke=None
)

#Display
line_charts

These functions also include more helpful docstrings.
  nearest = alt.selection(


In [92]:
### Combine all opinions and authors into respective columns
# Concatenate data for all opinions
opinion_columns = [col for col in df.columns if 'Opinion' in col and 'Score' in col]
author_columns = [col for col in df.columns if 'Opinion' in col and 'Author(s)' in col]
type_columns = [col for col in df.columns if 'Opinion' in col and 'Type' in col]
# Ensure the Year column remains throughout this transformation
long_df_2 = pd.DataFrame()
for i in range(1, 10):  # Assuming there are up to 9 opinions
    fre_col = f'Opinion {i} FRE Score'
    fk_col = f'Opinion {i} F-K Score'
    author_col = f'Opinion {i} Author(s)'
    type_col = f'Opinion {i} Type'
    if fre_col in df.columns and fk_col in df.columns and author_col in df.columns and type_col in df.columns:
        temp_df = df[['Year', fre_col, fk_col, author_col, type_col, 'Controversial']].copy()
        temp_df.columns = ['Year', 'FRE Score', 'F-K Score', 'Author(s)', 'Type', 'Controversial']
        temp_df['Opinion'] = f'Opinion {i}'
        long_df_2 = pd.concat([long_df_2, temp_df], ignore_index=True)

long_df_2['Author(s)'] = long_df_2['Author(s)'].str.split(',').str[0]
long_df_2['Author(s)'] = long_df_2['Author(s)'].apply(lambda x: x.split(' and ')[0])

#Drop all null authors
long_df_2 = long_df_2.dropna()
# Update messy Types to 'concurring' or 'dissenting'
long_df_2['Type'] = long_df_2['Type'].replace({
    '_x000D_\nconcurring': 'concurring',
    '_x000D_\ndissenting': 'dissenting'
})

# Print rows where Type isn't dissenting, concurring, or both
invalid_types = long_df_2[~long_df_2['Type'].isin(['dissenting', 'concurring', 'concurring and dissenting'])]
long_df_2 = long_df_2.drop(invalid_types.index)

# Drop rows where Type is 'concurring and dissenting'
long_df_2 = long_df_2[long_df_2['Type'] != 'concurring and dissenting']

# Verify the update
print(long_df_2['Type'].value_counts())


Type
dissenting    1078
concurring     971
Name: count, dtype: int64


In [93]:
import altair as alt

# Create the heatmap for FRE scores
heatmap_fre = alt.Chart(long_df_2).mark_rect().encode(
    x=alt.X('Author(s):N', title='Author', sort='ascending'),
    y=alt.Y('Type:N', title='Type'),
    color=alt.Color('mean(FRE Score):Q', title='Average FRE Score', scale=alt.Scale(range=['brown', 'lime'])),
    tooltip=[
        alt.Tooltip('Author(s):N', title='Author'),
        alt.Tooltip('Type:N', title='Type'),
        alt.Tooltip('mean(FRE Score):Q', title='Average FRE Score')
    ]
).properties(
    title='Heatmap of Average FRE Scores by Opinion Type and Author',
    width=800,
    height=400
).configure_axis(
    labelAngle=-45
)

# Create the heatmap for F-K scores
heatmap_fk = alt.Chart(long_df_2).mark_rect().encode(
    x=alt.X('Author(s):N', title='Author', sort='ascending'),
    y=alt.Y('Type:N', title='Type'),
    color=alt.Color('mean(F-K Score):Q', title='Average F-K Score', scale=alt.Scale(range=['lime', 'brown'])),
    tooltip=[
        alt.Tooltip('Author(s):N', title='Author'),
        alt.Tooltip('Type:N', title='Type'),
        alt.Tooltip('mean(F-K Score):Q', title='Average F-K Score')
    ]
).properties(
    title='Heatmap of Average F-K Scores by Opinion Type and Author',
    width=800,
    height=400
).configure_axis(
    labelAngle=-45
)

# Display the heatmaps
heatmap_fre.display()
heatmap_fk.display()

In [94]:
# Read the Excel file into a DataFrame
additional_scores = pd.read_excel('Other Scores.xlsx')
# Remove the first row in the dataset
additional_scores = additional_scores.iloc[1:]
# Print out the data
print(additional_scores)

             Source        FRE        F-K
1  King James Bible  73.975645  10.301877
2      Harry Potter  83.314779   4.786345


In [100]:
# Update the y-axis ranges to include additional scores
fk_min = min(avg_scores['F-K Score'].min(), additional_scores['F-K'].min())
fk_max = max(avg_scores['F-K Score'].max(), additional_scores['F-K'].max())
fk_range = [fk_min - (fk_max - fk_min) * 0.1, fk_max + (fk_max - fk_min) * 0.1]

fre_min = min(avg_scores['FRE Score'].min(), additional_scores['FRE'].min())
fre_max = max(avg_scores['FRE Score'].max(), additional_scores['FRE'].max())
fre_range = [fre_min - (fre_max - fre_min) * 0.1, fre_max + (fre_max - fre_min) * 0.1]

# Create horizontal lines for F-K scores with labels
fk_lines = alt.Chart(additional_scores).mark_rule(color='green', strokeWidth=3).encode(
    y=alt.Y('F-K:Q', title='Average F-K Score', scale=alt.Scale(domain=fk_range)),
    tooltip=[
        alt.Tooltip('Source:N', title='Source'),
        alt.Tooltip('F-K:Q', title='F-K Score')
    ]
)

fk_labels = alt.Chart(additional_scores).mark_text(
    align='left', dx=5, dy=-5, color='green'
).encode(
    y=alt.Y('F-K:Q', title='Average F-K Score', scale=alt.Scale(domain=fk_range)),
    text='Source:N'
)

# Create horizontal lines for FRE scores with labels
fre_lines = alt.Chart(additional_scores).mark_rule(color='green', strokeWidth=3).encode(
    y=alt.Y('FRE:Q', title='Average FRE Score', scale=alt.Scale(domain=fre_range)),
    tooltip=[
        alt.Tooltip('Source:N', title='Source'),
        alt.Tooltip('FRE:Q', title='FRE Score')
    ]
)

fre_labels = alt.Chart(additional_scores).mark_text(
    align='left', dx=5, dy=-5, color='green'
).encode(
    y=alt.Y('FRE:Q', title='Average FRE Score', scale=alt.Scale(domain=fre_range)),
    text='Source:N'
)

# Combine the horizontal lines and labels with the existing FRE chart
fre_chart = alt.layer(fre_line, fre_points, fre_lines, fre_labels).properties(
    title='Average FRE Score per Year',
    width=700,
    height=400
)

# Combine the horizontal lines and labels with the existing F-K chart
fk_chart = alt.layer(fk_line, fk_points, fk_lines, fk_labels).properties(
    title='Average F-K Score per Year',
    width=700,
    height=400
)

# Display the combined charts
fre_chart & fk_chart