In [1]:
import re
import spacy
import pandas as pd

# Load the spaCy pipeline named "en_core_web_sm" once at module level.
# tokenize_with_spacy() calls this shared `nlp` object for every text it processes.
nlp = spacy.load("en_core_web_sm")


In [3]:

# Step 1: Custom Adjustments
def custom_adjustments(text):
    """Apply the dataset-specific adjustments that run before generic cleaning.

    Three substitutions are applied, in this order:

    1. Stray hyphens are removed, i.e. hyphens that do not have a word
       character on both sides (leading, trailing, or free-standing dashes).
       Hyphens inside a word such as "well-known" are left untouched here.
    2. Possessive endings are dropped ("immigrant's" -> "immigrant"), for
       both the straight apostrophe and the typographic one (U+2019).
    3. A bracketed speaker tag at the very start of the text, followed by a
       colon (e.g. "[Speaker]:"), is removed.

    Args:
        text: Raw text to adjust.

    Returns:
        The adjusted string, or "" when ``text`` is not a string (for
        example None or NaN), matching what advanced_clean_text returns
        for such input.
    """
    # Guard first: re.sub raises TypeError on non-string input.
    if not isinstance(text, str):
        return ""

    # Remove only hyphens that are not flanked by word characters on both sides.
    text = re.sub(r'(?<!\w)-|-(?!\w)', '', text)

    # Normalize possessives written with either ' or the curly apostrophe.
    text = re.sub(r"['\u2019]s\b", "", text)

    # Remove a leading speaker tag, assumed to be bracketed, e.g. "[Speaker]:".
    text = re.sub(r"^\[.*?\]:", "", text)

    return text

# Step 2: Advanced Cleaning
def advanced_clean_text(text):
    """Reduce text to lowercase words separated by single spaces.

    Args:
        text: Text to clean. Anything that is not a ``str`` is treated as a
            missing value.

    Returns:
        The cleaned string, or "" for non-string input.
    """
    # Missing values: only real strings are cleaned.
    if not isinstance(text, str):
        return ""

    # Applied in order: drop every character that is neither a word character
    # nor whitespace, drop digit runs, then collapse whitespace runs.
    substitutions = (
        (r'[^\w\s]', ''),
        (r'\d+', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Trim the ends and lowercase the result.
    return text.strip().lower()

# Step 3: Tokenization Using spaCy
def tokenize_with_spacy(text):
    """Return the lemmas of the tokens in ``text``, skipping stop words and punctuation.

    The text is run through the module-level ``nlp`` pipeline; the result is
    a list of lemma strings in document order.
    """
    lemmas = []
    for tok in nlp(text):
        # Skip anything spaCy flags as a stop word or as punctuation.
        if tok.is_stop or tok.is_punct:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

# Complete Preprocessing Pipeline
def preprocess_pipeline(text):
    """Run one text through all three preprocessing steps.

    The text goes through custom_adjustments, then advanced_clean_text, and
    the cleaned string is finally tokenized with tokenize_with_spacy.

    Returns:
        A ``(cleaned_text, tokens)`` tuple: the cleaned string and the list
        of tokens derived from it.
    """
    # Steps 1 and 2: custom adjustments feed straight into advanced cleaning.
    cleaned_text = advanced_clean_text(custom_adjustments(text))

    # Step 3: tokenization runs on the cleaned text, not on the raw input.
    return cleaned_text, tokenize_with_spacy(cleaned_text)


In [5]:
# Load the dataset
file_path = 'newdata/cleaned_data_7.csv'
df = pd.read_csv(file_path)


# Apply preprocessing pipeline.
# preprocess_pipeline returns a (cleaned_text, tokens) pair per row. Collect
# the pairs once and build the two columns from them, rather than wrapping
# every row's result in its own pd.Series (which is slow on large frames).
pipeline_results = [preprocess_pipeline(text) for text in df['contents'].fillna("")]
df['cleaned_contents'] = pd.Series(
    [cleaned for cleaned, _ in pipeline_results], index=df.index, dtype=object
)
# Wrapping in a Series keeps each token list as a single cell value.
df['tokens'] = pd.Series(
    [tokens for _, tokens in pipeline_results], index=df.index, dtype=object
)

# Save results to a new file for further use.
# NOTE: CSV stores each token list as its string representation, so the
# `tokens` column will not come back as lists when this file is read again.
df.to_csv("newdata/preprocessed_data.csv", index=False)

# Check the result
df.head()


Unnamed: 0,speaker,date,title,contents,Party,cleaned_contents,tokens
0,Sir J. Anderson,2/6/1940,Eire Citizens (Immigration),I have had no representations from the police ...,Conservative Party,i have had no representations from the police ...,"[representation, police, contrary]"
1,Sir J. Anderson,2/6/1940,Eire Citizens (Immigration),"No, Sir. Even if the principle underlying the ...",Conservative Party,no sir even if the principle underlying the ri...,"[sir, principle, underlie, right, hon, gentlem..."
2,Mr. M. MacDonald,2/14/1940,Palestine (Jewish Immigration),The steady improvement in the internal situati...,Labour Party,the steady improvement in the internal situati...,"[steady, improvement, internal, situation, pal..."
3,Mr. MacDonald,2/14/1940,Palestine (Jewish Immigration),"The legal quota allowed something over 10,000 ...",Labour Party,the legal quota allowed something over jews to...,"[legal, quota, allow, jews, settle, palestine,..."
4,Mr. MacDonald,2/14/1940,Palestine (Jewish Immigration),That is another question.,Labour Party,that is another question,[question]


In [None]:
import pandas as pd
from collections import Counter
from frame_terms import frame_words  # Assuming you saved this correctly
from group_terms import countries  # Assuming you saved this correctly

# Step 1: Match Frames
def match_frames(tokens):
    """Count, per frame, how many of the given tokens appear in that frame's terms.

    Every frame in ``frame_words`` gets an entry in the returned Counter,
    including frames with a count of zero. A token that occurs several times
    in ``tokens`` is counted once per occurrence.
    """
    return Counter({
        frame: sum(token in terms for token in tokens)
        for frame, terms in frame_words.items()
    })

# Step 2: Match Countries
def match_countries(text):
    """Identify the countries or groups mentioned in the text.

    A country matches when at least one of its terms occurs in ``text``.
    The comparison is case-insensitive: both the term and the text are
    lowercased before comparing.

    NOTE(review): matching is a plain substring test, so a short term also
    matches inside longer words. Confirm this is intended for the term
    lists in ``countries``.

    Args:
        text: Text to search. Non-string values (e.g. NaN) are treated as
            text with no mentions.

    Returns:
        The matched country keys joined with ", " in the iteration order of
        ``countries``, or the string "None" when nothing matches.
    """
    # `term in text` raises TypeError when text is not a string.
    if not isinstance(text, str):
        return "None"

    # Lowercase the text once so it is compared on the same footing as the
    # lowercased terms.
    haystack = text.lower()
    matched_countries = [
        country
        for country, terms in countries.items()
        if any(term.lower() in haystack for term in terms)
    ]
    return ", ".join(matched_countries) if matched_countries else "None"

# Step 3: Apply Matching Functions
def analyze_frames_and_groups(df):
    """Add frame-count and matched-country columns to ``df``.

    ``frame_counts`` is computed from the ``tokens`` column with
    match_frames, and ``matched_countries`` from the ``cleaned_contents``
    column with match_countries. The columns are written onto the frame
    that was passed in, and that same frame is returned.
    """
    # (new column, source column, function applied to each source value)
    derived_columns = (
        ('frame_counts', 'tokens', match_frames),
        ('matched_countries', 'cleaned_contents', match_countries),
    )
    for target, source, matcher in derived_columns:
        df[target] = df[source].apply(matcher)
    return df

# Load your preprocessed dataset
# (Disabled: this cell currently reuses the `df` built by the preprocessing cell.
# NOTE: if `df` is reloaded from a CSV instead, the `tokens` column holds
# strings rather than lists, and match_frames would iterate over characters.)
#file_path = 'newdata/new_final_data.csv'
#df = pd.read_csv(file_path)

# Analyze Frames and Countries
df = analyze_frames_and_groups(df)

# Save Results
#output_path = 'analyzed_frames_groups.csv'
#df.to_csv(output_path, index=False)

#print(f"Analyzed data saved to {output_path}")

# Preview the result.
# The column is named 'Party' (capital P) in this dataset; selecting 'party'
# raises KeyError.
print(df[['speaker', 'date', 'title', 'Party', 'frame_counts', 'matched_countries']].head())
