In [1]:
import pandas as pd
import re
from tqdm.auto import tqdm

In [2]:
# Load the gzip-compressed transcript table, then show its column names.
df = pd.read_csv(
    "cnn-8.csv.gz",
    compression='gzip',
)
df.columns

Index(['url', 'channel.name', 'program.name', 'uid', 'duration', 'year',
       'month', 'date', 'time', 'timezone', 'path', 'wordcount', 'subhead',
       'text'],
      dtype='object')

In [22]:
import re
import pandas as pd

def create_federal_judge_patterns():
    """
    Create compiled regex patterns for detecting mentions of federal judge types.

    Each title is matched case-insensitively as whole words, with an optional
    plural "s" and any run of whitespace (spaces, tabs, line breaks) allowed
    between its words. Every pattern has exactly one capture group, spanning
    the whole matched title, so ``pattern.findall(text)`` yields plain strings
    and ``match.group(1)`` equals ``match.group(0)``.

    Returns:
    --------
    list: Compiled regex patterns, one per federal judge title
    """
    federal_judge_titles = [
        # Supreme Court Justices
        'Supreme Court Justice',
        'Supreme Court Judge',

        # Circuit Court Judges
        'Circuit Court Judge',
        'Appellate Court Judge',
        'Federal Appellate Judge',

        # District Court Judges
        'District Court Judge',
        'Federal District Judge',

        # Specialized Courts
        'Court of International Trade Judge',
        'Court of Federal Claims Judge',

        # Bankruptcy Judges
        'Bankruptcy Judge',

        # Magistrate Judges
        'Magistrate Judge',
    ]

    def title_to_regex(title):
        # Join the words with \s+ so double spaces or line breaks inside a
        # title do not prevent a match; "s?" is deliberately not a capture
        # group so findall() does not return (text, 's') tuples.
        words = r'\s+'.join(re.escape(word) for word in title.split())
        return rf'\b({words}s?)\b'

    return [
        re.compile(title_to_regex(title), re.IGNORECASE)
        for title in federal_judge_titles
    ]

def filter_federal_judge_articles(df, text_column='text'):
    """
    Filter articles mentioning specific types of federal judges.

    The input DataFrame is left untouched; the 'federal_judge_types' column
    is added only to the returned (copied) subset.

    Parameters:
    -----------
    df : pandas.DataFrame
        Input DataFrame containing text data
    text_column : str, optional
        Name of the column containing text (default is 'text')

    Returns:
    --------
    tuple: (filtered DataFrame, filtering statistics)
        The filtered DataFrame holds the rows with at least one match plus a
        'federal_judge_types' column: a sorted list of the distinct matched
        phrases, exactly as they appear in the text. The statistics dict has
        the keys 'total_articles', 'federal_judge_articles' and
        'federal_judge_type_breakdown' (matched phrase -> number of articles
        containing it).
    """
    # Create federal judge patterns
    federal_judge_patterns = create_federal_judge_patterns()

    def detect_federal_judge_type(text):
        """
        Return the sorted, de-duplicated judge-title phrases found in the text.
        """
        # Missing values (NaN/None) and other non-strings have no matches.
        if not isinstance(text, str):
            return []

        # group(0) is the whole matched phrase. findall() would instead return
        # one tuple per match whenever a pattern has several capture groups.
        matched_types = {
            match.group(0)
            for pattern in federal_judge_patterns
            for match in pattern.finditer(text)
        }

        # Sorting makes the result reproducible across runs (set order is not).
        return sorted(matched_types)

    # Detect on a standalone Series so the caller's DataFrame is not mutated
    detected_types = df[text_column].apply(detect_federal_judge_type)
    has_mention = detected_types.apply(len) > 0

    # Filter to articles mentioning federal judge types
    federal_judge_df = df[has_mention].copy()
    federal_judge_df['federal_judge_types'] = detected_types[has_mention]

    # Prepare filtering statistics
    filtering_stats = {
        'total_articles': len(df),
        'federal_judge_articles': len(federal_judge_df),
        'federal_judge_type_breakdown': (
            federal_judge_df['federal_judge_types']
            .explode()
            .value_counts()
            .to_dict()
        )
    }

    return federal_judge_df, filtering_stats


# Keep only the articles whose text mentions a federal judge title;
# `stats` receives the filtering statistics returned alongside the frame.
judicial_df, stats = filter_federal_judge_articles(df, text_column='text')

In [23]:
# (rows, columns) of the filtered judicial-articles frame.
judicial_df.shape

(2955, 15)

In [None]:
# Write the filtered articles to CSV, omitting the DataFrame index.
judicial_df.to_csv("judicial_articles.csv", index=False)

In [71]:
import re
import pandas as pd

def analyze_judicial_political_context(judicial_df, text_column='text', presidents=None):
    """
    Tag judicial articles with the political context found in their text.

    Five case-insensitive regex families are searched in every article:
    'appointment' (appointed/nominated/selected by a president), 'party',
    'ideology', 'administration' (a president's administration nominations)
    and 'selection' (Senate confirmation of nominees).

    Parameters:
    -----------
    judicial_df : pandas.DataFrame
        Articles to annotate. NOTE: this frame is modified in place -- the
        columns 'political_context', 'has_political_context' and
        'mentioned_presidents' are added (or overwritten) -- and the same
        object is returned.
    text_column : str, optional
        Name of the column containing text (default is 'text')
    presidents : iterable of str, optional
        Presidential surnames used both to build the regexes and to fill
        'mentioned_presidents'. Defaults to Biden, Trump, Obama, Bush,
        Clinton, Reagan and Carter.

    Returns:
    --------
    tuple: (annotated DataFrame, statistics dict)
        The statistics dict has the keys 'total_judicial_articles',
        'articles_with_political_context' and 'president_mentions'
        (surname -> number of articles mentioning that president).

    Raises:
    -------
    ValueError: If `presidents` is given but empty.
    """
    if presidents is None:
        presidents = ['Biden', 'Trump', 'Obama', 'Bush', 'Clinton', 'Reagan', 'Carter']
    elif isinstance(presidents, str):
        # A bare string would otherwise be split into single characters.
        presidents = [presidents]
    else:
        # Drop duplicates while keeping the caller's order.
        presidents = list(dict.fromkeys(presidents))
    if not presidents:
        raise ValueError("presidents must contain at least one name")

    # Single source of truth: the regex alternation is derived from the list,
    # so the patterns and the name scan below can never drift apart.
    president_alt = '|'.join(re.escape(name) for name in presidents)

    patterns = [
        (rf"\b(appointed by|nominated by|selected by)\s+((?:President\s+)?(?:{president_alt}))\b", 'appointment'),
        (r"\b(Democrat|Republican|GOP)\s+(nominated|supported)\s+(judge|justice)\b", 'party'),
        # "(judicial\s+)?" -- the whole word is optional, not just its last letter.
        (r"\b(liberal|conservative|progressive|originalist|textualist)\s+(judicial\s+)?(?:judge|justice|interpretation)\b", 'ideology'),
        (rf"\b((?:{president_alt})\s+administration('s)?)\s+(judicial\s+)?nomination(s)?\b", 'administration'),
        (r"\b(Senate\s+confirmation)\s+of\s+(judicial\s+)?nominee(s)?\b", 'selection'),
    ]

    compiled_patterns = [
        (re.compile(pattern, re.IGNORECASE), category)
        for pattern, category in patterns
    ]

    def extract_political_context(text):
        """Map each matching category to its raw findall() matches."""
        context = {}
        # Missing values (NaN/None) and other non-strings carry no context.
        if not isinstance(text, str):
            return context

        for pattern, category in compiled_patterns:
            matches = pattern.findall(text)
            if matches:
                context[category] = {
                    'matches': matches,
                    'category': f'{category}_context'
                }

        return context

    def extract_presidents(context_dict):
        """List the presidents named in the matched text, in `presidents` order."""
        if not context_dict:
            return []

        fragments = []
        for context_info in context_dict.values():
            for match in context_info.get('matches', []):
                # findall() yields tuples for multi-group patterns and plain
                # strings for single-group ones; handle both shapes.
                groups = (match,) if isinstance(match, str) else match
                fragments.extend(str(group).lower() for group in groups)

        # Iterating over `presidents` (not a set) keeps the output order stable.
        return [
            name for name in presidents
            if any(name.lower() in fragment for fragment in fragments)
        ]

    judicial_df['political_context'] = judicial_df[text_column].apply(extract_political_context)
    judicial_df['has_political_context'] = judicial_df['political_context'].apply(bool)
    judicial_df['mentioned_presidents'] = judicial_df['political_context'].apply(extract_presidents)

    political_context_stats = {
        'total_judicial_articles': len(judicial_df),
        'articles_with_political_context': int(judicial_df['has_political_context'].sum()),
        'president_mentions': judicial_df['mentioned_presidents'].explode().value_counts().to_dict()
    }

    return judicial_df, political_context_stats

In [72]:
# Annotate the judicial articles with the political-context columns.
judicial_df_with_context, political_stats = analyze_judicial_political_context(judicial_df)

# Display results: only the articles where at least one pattern matched.
print("Articles with Political Context:")
context_df = judicial_df_with_context[judicial_df_with_context['has_political_context']]
# Ending the cell with the frame uses the notebook's rich table rendering
# instead of print()'s wrapped, truncated plain text.
context_df[['text', 'mentioned_presidents', 'political_context']]

Articles with Political Context:
                                                    text mentioned_presidents  \
295    [21:00:00] JOHN BERMAN, CNN HOST: According to...                   []   
799    [12:30:00] JOHN KING, CNN HOST: And so to that...                   []   
913    (COMMERCIAL BREAK) [10:00:00] JIM SCIUTTO, CNN...                   []   
939    JAMES CLAPPER, FORMER DIRECTOR OF NATIONAL INT...                   []   
1865   [18:00:03] WOLF BLITZER, CNN HOST: Russia's as...                   []   
...                                                  ...                  ...   
42472  ERIN BURNETT, CNN HOST: Thank you. ADRIAN SIMA...              [Biden]   
42688  [18:00:00] WOLF BLITZER, CNN ANCHOR:  Happenin...              [Trump]   
42765  [18:00:00] JAKE TAPPER, CNN ANCHOR:  -- March ...                   []   
42981  KAREN FINNEY, CNN POLITICAL COMMENTATOR:  -- c...                   []   
43335  JAKE TAPPER, CNN HOST: Welcome to The Lead. I'...                   [

In [73]:
# Report every summary statistic on its own "name: value" line.
print("\nPolitical Context Statistics:")
for stat_name, stat_value in political_stats.items():
    print(f"{stat_name}: {stat_value}")


Political Context Statistics:
total_judicial_articles: 2955
articles_with_political_context: 170
president_mentions: {'Trump': 59, 'Biden': 6, 'Obama': 4, 'Reagan': 2, 'Bush': 1, 'Carter': 1}


In [74]:
# Column names of the annotated frame returned by the analysis call.
judicial_df_with_context.columns

Index(['url', 'channel.name', 'program.name', 'uid', 'duration', 'year',
       'month', 'date', 'time', 'timezone', 'path', 'wordcount', 'subhead',
       'text', 'federal_judge_types', 'political_context',
       'has_political_context', 'mentioned_presidents'],
      dtype='object')

In [75]:
# Keep only the articles that name at least one president. Read from the
# frame returned by analyze_judicial_political_context instead of relying on
# judicial_df having been modified in place by that call.
presidential_judicial_df = judicial_df_with_context[
    judicial_df_with_context['mentioned_presidents'].apply(len) > 0
].copy()

In [76]:
# Number of articles that mention at least one president.
presidential_judicial_df['mentioned_presidents'].shape

(71,)