<a href="https://colab.research.google.com/github/nethranatarajan3/nethranatarajan3.github.io/blob/main/portfolio_code/CC5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
#Importing Libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import datetime



In [34]:
# URL for Supreme Court Justices
url = "https://en.wikipedia.org/wiki/List_of_justices_of_the_Supreme_Court_of_the_United_States"

print("Fetching data from Wikipedia...")

# Adding headers to mimic a browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Getting the page with headers
response = requests.get(url, headers=headers)



Fetching data from Wikipedia...


In [35]:
# Finding the main table with justice information
# Using pandas read_html with headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

tables = pd.read_html(url, storage_options={'User-Agent': headers['User-Agent']})




In [36]:
# Selecting the table with justice data

df = tables[1].copy()


print(df.columns.tolist())

print(df.head())

[('Justice', 'No.'), ('Justice', 'Portrait'), ('Justice', 'Name (birth–death)'), ('State[c]', 'State[c]'), ('Position', 'Position'), ('Succeeded', 'Succeeded'), ('Date confirmed (Vote)', 'Date confirmed (Vote)'), ('Tenure', 'Tenure'), ('Tenure length[d]', 'Tenure length[d]'), ('Nominated by', 'Nominated by')]
  Justice                                       State[c]           Position  \
      No. Portrait           Name (birth–death) State[c]           Position   
0       1      NaN         John Jay (1745–1829)       NY      Chief Justice   
1       2      NaN    John Rutledge (1739–1800)       SC  Associate Justice   
2       3      NaN  William Cushing (1732–1810)       MA  Associate Justice   
3       4      NaN     James Wilson (1742–1798)       PA  Associate Justice   
4       5      NaN       John Blair (1732–1800)       VA  Associate Justice   

   Succeeded             Date confirmed (Vote)  \
   Succeeded             Date confirmed (Vote)   
0  Inaugural  September 26, 1789 (A

In [37]:
# Flattening MultiIndex columns if present
if isinstance(df.columns, pd.MultiIndex):

    df.columns = [' '.join(col).strip() if col[1] else col[0] for col in df.columns.values]

# Cleaning column names - remove extra spaces, special characters
df.columns = df.columns.str.strip()


print(df.columns.tolist())

# Renaming columns to simpler names
column_mapping = {
    'Justice': 'justice_name',
    'State[c]': 'state',
    'Position': 'position',
    'Succeeded': 'succeeded',
    'Date confirmed (Vote)': 'date_confirmed',
    'Date confirmed': 'date_confirmed',
    'Tenure': 'tenure',
    'Tenure length[d]': 'tenure_length',
    'Nominated by': 'nominated_by'
}

# Applying mapping for columns that exist
for old_name in df.columns:
    for pattern, new_name in column_mapping.items():
        if pattern in old_name:
            df = df.rename(columns={old_name: new_name})
            break


print(df.columns.tolist())

print(df.head())

['Justice No.', 'Justice Portrait', 'Justice Name (birth–death)', 'State[c] State[c]', 'Position Position', 'Succeeded Succeeded', 'Date confirmed (Vote) Date confirmed (Vote)', 'Tenure Tenure', 'Tenure length[d] Tenure length[d]', 'Nominated by Nominated by']
['justice_name', 'justice_name', 'justice_name', 'state', 'position', 'succeeded', 'date_confirmed', 'tenure', 'tenure', 'nominated_by']
  justice_name justice_name                 justice_name state  \
0            1          NaN         John Jay (1745–1829)    NY   
1            2          NaN    John Rutledge (1739–1800)    SC   
2            3          NaN  William Cushing (1732–1810)    MA   
3            4          NaN     James Wilson (1742–1798)    PA   
4            5          NaN       John Blair (1732–1800)    VA   

            position  succeeded                    date_confirmed  \
0      Chief Justice  Inaugural  September 26, 1789 (Acclamation)   
1  Associate Justice  Inaugural  September 26, 1789 (Acclamation)  

In [38]:
# Accessing the first tenure column with dates

print(df['tenure'].columns.tolist())

print(df['tenure'].head())

# Getting the first sub-column (which has the dates)
tenure_dates = df['tenure'].iloc[:, 0]  # First sub-column with dates

print("\nSample tenure dates:")
print(tenure_dates.head())

# Function to extract start and end years from tenure string
def extract_service_years(tenure_str):
    try:
        # Convert to string first
        tenure_str = str(tenure_str)

        if tenure_str == 'nan' or not tenure_str:
            return None, None

        # Looking for year patterns like
        # Extracting all 4-digit years
        years = re.findall(r'(\d{4})', tenure_str)

        if len(years) >= 1:
            start_year = int(years[0])
            end_year = int(years[1]) if len(years) >= 2 else 2025  # If only one year or "present"
            return start_year, end_year

        return None, None
    except Exception as e:
        print(f"Error processing: {tenure_str} - {e}")
        return None, None

# Applying to dataframe
df['start_year'] = None
df['end_year'] = None

for idx in df.index:
    start, end = extract_service_years(tenure_dates.loc[idx])
    df.at[idx, 'start_year'] = start
    df.at[idx, 'end_year'] = end



# Show results
if 'justice_name' in df.columns:
    print(df[['justice_name', 'start_year', 'end_year']].head(10))
else:
    print(df[['start_year', 'end_year']].head(10))

['tenure', 'tenure']
                                              tenure              tenure
0        October 19, 1789 – June 29, 1795 (Resigned)   5 years, 253 days
1    February 15, 1790[e] – March 5, 1791 (Resigned)     1 year, 18 days
2    February 2, 1790[e] – September 13, 1810 (Died)  20 years, 223 days
3        October 5, 1789[e] – August 21, 1798 (Died)   8 years, 320 days
4  February 2, 1790[e] – October 25, 1795 (Resigned)   5 years, 265 days

Sample tenure dates:
0          October 19, 1789 – June 29, 1795 (Resigned)
1      February 15, 1790[e] – March 5, 1791 (Resigned)
2      February 2, 1790[e] – September 13, 1810 (Died)
3          October 5, 1789[e] – August 21, 1798 (Died)
4    February 2, 1790[e] – October 25, 1795 (Resigned)
Name: tenure, dtype: object
  justice_name justice_name                  justice_name start_year end_year
0            1          NaN          John Jay (1745–1829)       1789     1795
1            2          NaN     John Rutledge (1739–1800)   

In [39]:
# Cleaning the nominated_by column
if 'nominated_by' in df.columns:
    # The nominated_by column should contain president names
    df['president'] = df['nominated_by'].astype(str).str.strip()

    # Removing any extra text in parentheses or brackets
    df['president'] = df['president'].str.replace(r'\[.*?\]', '', regex=True)
    df['president'] = df['president'].str.replace(r'\(.*?\)', '', regex=True)
    df['president'] = df['president'].str.strip()



In [40]:
# Removing duplicate columns and NaN columns
print("Current columns:", df.columns.tolist())

# Dropping columns that are all NaN or duplicate
df = df.loc[:, ~df.columns.duplicated()]

# Keeping only the columns with data
if df.columns.tolist().count('justice_name') > 1:
    # Getting all justice_name columns
    justice_cols = [col for col in df.columns if col == 'justice_name']

    # Finding which one has the actual data (not all NaN)
    for col_idx, col in enumerate(justice_cols):
        if df[col].notna().sum() > 0:

            if col_idx > 0:
                cols_to_keep = [c for c in df.columns if c != 'justice_name']
                df = df[cols_to_keep + [col]]
                df = df.rename(columns={col: 'justice_name'})
            break



Current columns: ['justice_name', 'justice_name', 'justice_name', 'state', 'position', 'succeeded', 'date_confirmed', 'tenure', 'tenure', 'nominated_by', 'start_year', 'end_year', 'president']


In [41]:
# Calculating years served from start and end years
if 'start_year' in df.columns and 'end_year' in df.columns:
    df['years_served'] = df['end_year'] - df['start_year']



In [42]:
# Removing rows with missing critical data
required_columns = ['justice_name', 'start_year']
existing_required = [col for col in required_columns if col in df.columns]

df_clean = df.dropna(subset=existing_required)

# Removing rows where start_year is None
df_clean = df_clean[df_clean['start_year'].notna()]

print(f"Original rows: {len(df)}")
print(f"After removing incomplete records: {len(df_clean)}")


Original rows: 121
After removing incomplete records: 121


In [43]:
# Creating tidy format: one row per justice
# Selecting key columns
tidy_columns = ['justice_name', 'start_year', 'end_year', 'years_served',
                'president', 'state', 'position']

# Only including columns that exist
tidy_columns = [col for col in tidy_columns if col in df_clean.columns]

tidy_df = df_clean[tidy_columns].copy()

# Converting years to integers
if 'start_year' in tidy_df.columns:
    tidy_df['start_year'] = tidy_df['start_year'].astype(int)
if 'end_year' in tidy_df.columns:
    tidy_df['end_year'] = tidy_df['end_year'].astype(int)
if 'years_served' in tidy_df.columns:
    tidy_df['years_served'] = tidy_df['years_served'].astype(int)

# Sorting by start year
if 'start_year' in tidy_df.columns:
    tidy_df = tidy_df.sort_values('start_year')



In [44]:
# Creating decade column
if 'start_year' in tidy_df.columns:
    tidy_df['decade'] = (tidy_df['start_year'] // 10) * 10

    # Counting justices appointed per decade
    justices_by_decade = tidy_df.groupby('decade').size().reset_index(name='num_justices')



In [45]:
# Counting justices appointed by each president
if 'president' in tidy_df.columns:
    justices_by_president = tidy_df.groupby('president').size().reset_index(name='num_justices')
    justices_by_president = justices_by_president.sort_values('num_justices', ascending=False)



In [46]:
# Exporting main tidy dataset
tidy_df.to_csv('supreme_court_justices_tidy.csv', index=False)


# Exporting aggregated datasets
if 'decade' in tidy_df.columns:
    justices_by_decade.to_csv('justices_by_decade.csv', index=False)

if 'president' in tidy_df.columns:
    justices_by_president.to_csv('justices_by_president.csv', index=False)




In [47]:
# Downloading files
try:
    from google.colab import files

    files.download('supreme_court_justices_tidy.csv')
    files.download('justices_by_decade.csv')
    files.download('justices_by_president.csv')


except Exception as e:
    print(f"Not in Colab environment or download error: {e}")
    print("Files saved to current directory.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>