In [3]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy

# Ensure you have the punkt tokenizer downloaded
nltk.download("punkt")
nltk.download("stopwords")

# Define categories and keywords
categories = {
    "Allegations": ["allegation", "irregularity", "accused", "suspected", "claimed"],
    "Criminal Violations": ["criminal", "violation", "illegal", "fraud", "offense"],
    "Sentencing": ["sentenced", "punishment", "penalty", "fined", "jailed"],
    "Background Information": ["background", "context", "history", "information"],
}

# Define function to categorize sentences
def categorize_sentence(sentence):
    for category, keywords in categories.items():
        if any(keyword.lower() in sentence.lower() for keyword in keywords):
            return category
    return "Other"

# Load Excel data
file_path = "wikileaks_parsed.xlsx"  # Adjust if the file has a different name or location
df = pd.read_excel(file_path)

# Ensure 'Text' column exists
if "Text" not in df.columns:
    raise ValueError("The Excel file must have a 'Text' column.")

# Split text into sentences and categorize
data = []
for index, row in df.iterrows():
    pdf_path = row["PDF Path"]
    text = row["Text"]
    sentences = sent_tokenize(text)
    
    for sentence in sentences:
        category = categorize_sentence(sentence)
        data.append({
            "PDF Path": pdf_path,
            "Sentence": sentence,
            "Category": category,
        })

# Create a new DataFrame with categorized sentences
categorized_df = pd.DataFrame(data)

# Save to a new Excel file
output_file = "wikileaks_categorized.xlsx"
categorized_df.to_excel(output_file, index=False)
print(f"Categorized data saved to {output_file}")


[nltk_data] Downloading package punkt to /Users/rishi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/rishi/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Categorized data saved to wikileaks_categorized.xlsx


In [4]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from collections import defaultdict

# Ensure you have the punkt tokenizer downloaded
nltk.download("punkt")

# Define categories and keywords
categories = {
    "Allegations": ["allegation", "irregularity", "accused", "suspected", "claimed"],
    "Criminal Violations": ["criminal", "violation", "illegal", "fraud", "offense"],
    "Sentencing": ["sentenced", "punishment", "penalty", "fined", "jailed"],
    "Background Information": ["background", "context", "history", "information"],
}

# Define function to categorize sentences
def categorize_sentence(sentence):
    scores = defaultdict(int)  # A dictionary to track scores for each category
    
    # Count the occurrences of keywords for each category
    for category, keywords in categories.items():
        for keyword in keywords:
            scores[category] += sentence.lower().count(keyword.lower())
    
    # Assign the sentence to the category with the highest score
    if scores:
        return max(scores, key=scores.get)  # Category with the highest score
    return None  # This shouldn't happen since there's always a category

# Load Excel data
file_path = "wikileaks_parsed.xlsx"  # Adjust if the file has a different name or location
df = pd.read_excel(file_path)

# Ensure 'Text' column exists
if "Text" not in df.columns:
    raise ValueError("The Excel file must have a 'Text' column.")

# Split text into sentences and categorize
data = []
for index, row in df.iterrows():
    pdf_path = row["PDF Path"] if "PDF Path" in row else None
    text = row["Text"]
    sentences = sent_tokenize(text)
    
    for sentence in sentences:
        category = categorize_sentence(sentence)
        data.append({
            "PDF Path": pdf_path,
            "Sentence": sentence,
            "Category": category,
        })

# Create a new DataFrame with categorized sentences
categorized_df = pd.DataFrame(data)

# Save to a new Excel file
output_file = "wikileaks_categorized.xlsx"
categorized_df.to_excel(output_file, index=False)
print(f"Categorized data saved to {output_file}")


Categorized data saved to wikileaks_categorized.xlsx


[nltk_data] Downloading package punkt to /Users/rishi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

# Step 1: Load labeled data
labeled_file = "wikileaks_categorized.xlsx"  # Replace with your labeled file
labeled_df = pd.read_excel(labeled_file)

# Ensure required columns
if "Sentence" not in labeled_df.columns or "Category" not in labeled_df.columns:
    raise ValueError("The labeled dataset must have 'Sentence' and 'Category' columns.")

# Step 2: Split data into training and test sets
X = labeled_df["Sentence"]
y = labeled_df["Category"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Vectorize text using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 4: Train a classifier (Logistic Regression in this example)
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Step 5: Evaluate the model
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

# Save the model and vectorizer for future use
joblib.dump(model, "text_classifier_model.pkl")
joblib.dump(vectorizer, "text_vectorizer.pkl")

# Step 6: Load your full dataset and categorize sentences
file_path = "wikileaks_parsed.xlsx"  # Replace with your dataset
df = pd.read_excel(file_path)

if "Text" not in df.columns:
    raise ValueError("The Excel file must have a 'Text' column.")

# Load saved model and vectorizer
model = joblib.load("text_classifier_model.pkl")
vectorizer = joblib.load("text_vectorizer.pkl")

# Process sentences
data = []
for index, row in df.iterrows():
    pdf_path = row["PDF Path"] if "PDF Path" in row else None
    text = row["Text"]
    sentences = nltk.sent_tokenize(text)
    
    for sentence in sentences:
        # Predict category for each sentence
        sentence_tfidf = vectorizer.transform([sentence])
        category = model.predict(sentence_tfidf)[0]
        data.append({
            "PDF Path": pdf_path,
            "Sentence": sentence,
            "Category": category,
        })

# Save results
categorized_df = pd.DataFrame(data)
output_file = "wikileaks_ml_categorized.xlsx"
categorized_df.to_excel(output_file, index=False)
print(f"Categorized data saved to {output_file}")


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                        precision    recall  f1-score   support

           Allegations       0.94      1.00      0.97       171
Background Information       1.00      0.14      0.25         7
   Criminal Violations       0.00      0.00      0.00         4

              accuracy                           0.95       182
             macro avg       0.65      0.38      0.41       182
          weighted avg       0.93      0.95      0.92       182

Categorized data saved to wikileaks_ml_categorized.xlsx


In [8]:
import pandas as pd
import spacy
from difflib import SequenceMatcher

# Load the processed Excel files
wikileaks_file = "processed_wikileaks_parsed.xlsx"
news_file = "processed_news_excerpts_parsed.xlsx"

wikileaks_df = pd.read_excel(wikileaks_file)
news_df = pd.read_excel(news_file)

# Load SpaCy model for NLP
nlp = spacy.load("en_core_web_sm")

def get_similarity(text1, text2):
    """Compute similarity between two texts using SequenceMatcher."""
    return SequenceMatcher(None, text1, text2).ratio()

def find_similar_entries(news_text, wikileaks_df):
    """Find similar entries from the Wikileaks dataset for a given news text."""
    results = []
    for _, row in wikileaks_df.iterrows():
        similarity = get_similarity(news_text, row['Text'])
        if similarity > 0.5:  # Threshold for similarity
            results.append({
                'PDF Path': row['PDF Path'],
                'Wikileaks Text': row['Text'],
                'Similarity': similarity,
                'Entities': row['entities'],
                'Relationships': row['relationships']
            })
    return results

# Process news articles and compare them with Wikileaks data
output = []
for _, news_row in news_df.iterrows():
    similar_entries = find_similar_entries(news_row['Text'], wikileaks_df)
    for entry in similar_entries:
        output.append({
            'News Link': news_row['Link'],
            'News Text': news_row['Text'],
            'Category': news_row['Category'],
            'Wikileaks PDF': entry['PDF Path'],
            'Wikileaks Text': entry['Wikileaks Text'],
            'Similarity': entry['Similarity'],
            'News Entities': news_row['entities'],
            'Wikileaks Entities': entry['Entities'],
            'Relationships': entry['Relationships']
        })

# Save the results to a new Excel file
output_df = pd.DataFrame(output)
output_df.to_excel("similarity_results.xlsx", index=False)

print("Comparison complete. Results saved to similarity_results.xlsx.")


Comparison complete. Results saved to similarity_results.xlsx.


In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def load_data(wikileaks_file, news_file):
    # Load the processed data from Excel files
    wikileaks_df = pd.read_excel(wikileaks_file)
    news_df = pd.read_excel(news_file)
    return wikileaks_df, news_df

def calculate_similarity(wikileaks_df, news_df):
    # Combine text data from Wikileaks and News
    wikileaks_texts = wikileaks_df['Text']
    news_texts = news_df['Text']

    # Use TF-IDF to vectorize the text data
    vectorizer = TfidfVectorizer()
    combined_texts = pd.concat([wikileaks_texts, news_texts])
    tfidf_matrix = vectorizer.fit_transform(combined_texts)

    # Split the TF-IDF matrix for Wikileaks and News
    wikileaks_tfidf = tfidf_matrix[:len(wikileaks_texts)]
    news_tfidf = tfidf_matrix[len(wikileaks_texts):]

    # Calculate cosine similarity between Wikileaks and News texts
    similarity_matrix = cosine_similarity(wikileaks_tfidf, news_tfidf)
    return similarity_matrix

def categorize_and_cite(similarity_matrix, wikileaks_df, news_df, threshold=0.5):
    results = []
    for i, wikileaks_row in wikileaks_df.iterrows():
        for j, news_row in news_df.iterrows():
            similarity_score = similarity_matrix[i, j]
            if similarity_score >= threshold:
                result = {
                    'Wikileaks PDF Path': wikileaks_row['PDF Path'],
                    'Wikileaks Text': wikileaks_row['Text'],
                    'News Link': news_row['Link'],
                    'News Text': news_row['Text'],
                    'Similarity Score': similarity_score,
                    'Entities Matched': set(wikileaks_row['entities']) & set(news_row['entities']),
                    'Relationships Matched': set(wikileaks_row['relationships']) & set(news_row['relationships']),
                }
                results.append(result)
    return pd.DataFrame(results)

def save_results(results_df, output_file):
    # Save the results to an Excel file
    results_df.to_excel(output_file, index=False)

if __name__ == "__main__":
    # File paths for input and output
    wikileaks_file = "wikileaks_parsed.xlsx"
    news_file = "news_excerpts_parsed.xlsx"
    output_file = "similarity_results.xlsx"

    # Load data
    wikileaks_df, news_df = load_data(wikileaks_file, news_file)

    # Calculate similarity
    similarity_matrix = calculate_similarity(wikileaks_df, news_df)

    # Categorize and cite similarities
    results_df = categorize_and_cite(similarity_matrix, wikileaks_df, news_df)

    # Save results
    save_results(results_df, output_file)

    print(f"Similarity results saved to {output_file}")


Similarity results saved to similarity_results.xlsx


In [10]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the data from Excel files
wikileaks_df = pd.read_excel("wikileaks_parsed.xlsx")
news_df = pd.read_excel("news_excerpts_parsed.xlsx")
processed_wikileaks_df = pd.read_excel("processed_wikileaks_parsed.xlsx")
processed_news_df = pd.read_excel("processed_news_excerpts_parsed.xlsx")

# Load spaCy model for named entity recognition (NER)
nlp = spacy.load("en_core_web_sm")

# Preprocess text (tokenize, remove stopwords, and lemmatize)
def preprocess_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

wikileaks_df['clean_text'] = wikileaks_df['Text'].apply(preprocess_text)
news_df['clean_text'] = news_df['Text'].apply(preprocess_text)

# Use TfidfVectorizer to convert the text into vectors for similarity computation
vectorizer = TfidfVectorizer()

# Function to compute cosine similarity
def compute_similarity(query, text_list):
    tfidf_matrix = vectorizer.fit_transform(text_list + [query])  # Append query to text list
    return cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()

# Example of how to categorize and extract similarity with a sample news excerpt
def get_similar_wikileaks_content(news_row):
    news_text = news_row['clean_text']
    similarities = compute_similarity(news_text, wikileaks_df['clean_text'])
    best_match_index = similarities.argmax()
    
    # Fetch best matching wikileaks case
    best_match_row = wikileaks_df.iloc[best_match_index]
    category = "Allegation"  # Default to 'Allegation' or use a rule to determine categories
    # Check if the category already exists in the processed data and assign
    if 'Category' in best_match_row and best_match_row['Category']:
        category = best_match_row['Category']
    
    # Assuming 'sentencing' or 'conclusion' info is available in the processed_wikileaks_df
    sentence_or_conclusion = processed_wikileaks_df[processed_wikileaks_df['PDF Path'] == best_match_row['PDF Path']]['entities'].iloc[0]
    
    return {
        "news_link": news_row['Link'],
        "wikileaks_pdf_path": best_match_row['PDF Path'],
        "similarity_score": similarities[best_match_index],
        "category": category,
        "sentencing_or_conclusion": sentence_or_conclusion
    }

# Iterate through news_df and find similarities
results = []
for index, news_row in news_df.iterrows():
    result = get_similar_wikileaks_content(news_row)
    results.append(result)

# Convert results into a DataFrame
results_df = pd.DataFrame(results)

# Display the results
print(results_df)


                                              news_link wikileaks_pdf_path  \
0     https://edition.cnn.com/2023/09/29/business/st...             89.pdf   
1     https://www.channelnewsasia.com/singapore/su-w...             89.pdf   
2     https://edition.cnn.com/2023/05/22/tech/meta-f...             89.pdf   
3     https://www.channelnewsasia.com/singapore/bill...             89.pdf   
4     https://edition.cnn.com/2024/03/05/politics/li...             89.pdf   
...                                                 ...                ...   
1504  https://www.channelnewsasia.com/commentary/mal...             89.pdf   
1505  https://www.channelnewsasia.com/singapore/tick...             89.pdf   
1506     https://www.bbc.com/news/world-europe-57965260             89.pdf   
1507  https://www.bbc.com/news/uk-wales-mid-wales-13...             89.pdf   
1508  https://www.straitstimes.com/singapore/parliam...             89.pdf   

      similarity_score    category  \
0             0.317713  A

In [11]:
# Save the results DataFrame to an Excel file
results_df.to_excel("wikileaks_news_similarity_results.xlsx", index=False)

print("Results saved to 'wikileaks_news_similarity_results.xlsx'")


Results saved to 'wikileaks_news_similarity_results.xlsx'


In [14]:
import pandas as pd
from transformers import pipeline
import torch

# Load the data
data = pd.read_excel('wikileaks_parsed.xlsx')

# Load a pre-trained text classification model
# In this case, we can use a zero-shot classification model from Hugging Face
classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')

# Define possible categories
candidate_labels = ['Allegation', 'Investigative Details', 'Background Information', 'Charges', 'Sentencing', 'Criminal Violations']

# Function to classify each row of text
def classify_text(text):
    result = classifier(text, candidate_labels)
    return result['labels'][0]  # Return the top predicted label

# Apply the function to the 'Text' column
data['Category'] = data['Text'].apply(classify_text)

# Save the result to a new Excel file
data.to_excel('wikileaks_parsed_with_categories.xlsx', index=False)

print("Categorization complete and saved to 'wikileaks_parsed_with_categories.xlsx'")


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use mps:0


Categorization complete and saved to 'wikileaks_parsed_with_categories.xlsx'


In [1]:
import pandas as pd
from transformers import pipeline
import os

# Load the data from the Excel file
data = pd.read_excel('news_excerpts_parsed.xlsx')

# Load the zero-shot classification model
classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')

# Define the candidate labels
candidate_labels = ['Allegation', 'Investigative Details', 'Background Information', 'Charges', 'Sentencing', 'Criminal Violations']

# Check if a previous output file exists
output_file = 'news_excerpts_parsed_with_categories.xlsx'
if os.path.exists(output_file):
    # Load the previously processed data
    existing_data = pd.read_excel(output_file)
    processed_indices = existing_data.index.tolist()
else:
    # If no previous progress, start fresh
    processed_indices = []

# Function to classify the text based on the context
def classify_text(text):
    result = classifier(text, candidate_labels)
    return result['labels'][0]  # Returning the top predicted category

# Apply classification only to unprocessed rows
def process_data(data):
    for index, row in data.iterrows():
        if index not in processed_indices:
            # Classify the text and add the category
            category = classify_text(row['Text'])
            data.at[index, 'Category'] = category
            
            # Save progress periodically to avoid losing data
            if (index + 1) % 100 == 0:  # Save progress every 100 rows
                data.to_excel(output_file, index=False)
                print(f"Saved progress at row {index + 1}")
    
    return data

# Process the data (classify and save progress)
data = process_data(data)

# Save the final result
data.to_excel(output_file, index=False)

print("Categorization complete and saved to 'news_excerpts_parsed_with_categories.xlsx'")


Device set to use mps:0


Saved progress at row 100
Saved progress at row 200
Saved progress at row 300
Saved progress at row 400
Saved progress at row 500
Saved progress at row 600
Saved progress at row 700
Saved progress at row 800
Saved progress at row 900
Saved progress at row 1000
Saved progress at row 1100
Saved progress at row 1200
Saved progress at row 1300
Saved progress at row 1400
Saved progress at row 1500
Categorization complete and saved to 'news_excerpts_parsed_with_categories.xlsx'


In [6]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the data
wikileaks_data = pd.read_excel('wikileaks_parsed_with_categories.xlsx')
news_data = pd.read_excel('news_excerpts_parsed_with_categories.xlsx')

# Initialize the Sentence-BERT model for text embedding
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to get the embedding for a list of texts
def get_embeddings(texts):
    return model.encode(texts, convert_to_tensor=True)

# Get embeddings for the Wikileaks data and the news article text
wikileaks_embeddings = get_embeddings(wikileaks_data['Text'].tolist()).cpu().numpy()
news_data['embedding'] = news_data['Text'].apply(lambda x: model.encode(x))  # Removed `.cpu()`

# Function to retrieve the most similar wikileaks excerpts based on cosine similarity
def find_most_similar_citations(news_article_text):
    # Get the embedding of the news article (already a NumPy array)
    news_article_embedding = model.encode(news_article_text)  # Removed `.cpu()`
    
    # Compute cosine similarities between the news article and all wikileaks excerpts
    similarities = cosine_similarity([news_article_embedding], wikileaks_embeddings)
    
    # Get the indices of the most similar wikileaks citations
    most_similar_indices = np.argsort(similarities[0])[::-1]
    
    similar_citations = []
    
    # Get top N most similar citations and their categories
    for index in most_similar_indices[:5]:  # Adjust the number 5 as needed
        citation_text = wikileaks_data.iloc[index]['Text']
        categories = wikileaks_data.iloc[index]['Category']
        similarity_score = similarities[0][index]
        
        # Append the citation text, its categories, and similarity score
        similar_citations.append({
            'Citation Text': citation_text,
            'Categories': categories,
            'Similarity Score': similarity_score
        })
    
    return similar_citations

# Example usage:
news_article = "Starbucks violated federal labor law when it increased wages and offered new perks and benefits only to non-union employees, a National Labor Relations Board judge found Thursday. The decision is the latest in a series of NLRB rulings finding that Starbucks has violated labor law in its efforts to stop unions from forming in its coffee shops. The issue at the heart of this case is whether, under current Board law, [Starbucks] was entitled to explicitly reward employees, for not participating in union activity, while falsely telling its workers that the federal labor law forced it to take this action, wrote administrative law judge Mara-Louise Anzalone. It was not."
similar_citations = find_most_similar_citations(news_article)

for citation in similar_citations:
    print(f"Most Similar Citation: {citation['Citation Text']}")
    print(f"Categories: {citation['Categories']}")
    print(f"Similarity Score: {citation['Similarity Score']}")
    print("\n")


Most Similar Citation: "Allegation

In July 2004, the Investigation Task Force (ITF) was provided with a copy of a letter dated 28 May 2004, from the Cargo Manager of an Airline to a Staff Member of Public Enterprise Airport Pristina. In the letter, the Cargo Manager referred to several problems relative to an Invoice for April 2004 issued by the Cargo Department of Pristina Airport for the handling of the outgoing mail of a Member State’s KFOR. At the time, Pristina Airport staff could not locate a record of this invoice in the airport’s financial records, and, therefore, the commission of fraud on the part of staff in the Cargo Department was suspected."
Categories: Allegation
Similarity Score: 0.34018653631210327


Most Similar Citation: Non Staff member 1’s statements provided to the ITF Investigators, disclosed the following  two distinct cases:  

Case-B  

Non Staff member 1 stated that he/she was aware of another case of kickbacks for Pristina Airport employment, which occurred

In [7]:
news_article = "The first suspect to plead guilty in Singapore's largest money laundering case was convicted and sentenced to 13 months' jail in a district court on Tuesday (Apr 2). Su Wenqiang, 32, admitted to 11 charges of money laundering, possessing proceeds from illegal remote gambling offences and lying to get work passes for himself and his wife. More than S$3 billion (US$2.2 billion) in assets have been seized or frozen in relation to the case. This likely makes it one of the largest money laundering operations in the world. Su was among 10 suspects arrested in simultaneous police raids last August. The Cambodian national, whose passport states that he is from Fujian, was nabbed in a Good Class Bungalow along Lewis Road in Bukit Timah."
similar_citations = find_most_similar_citations(news_article)

for citation in similar_citations:
    print(f"Most Similar Citation: {citation['Citation Text']}")
    print(f"Categories: {citation['Categories']}")
    print(f"Similarity Score: {citation['Similarity Score']}")
    print("\n")


Most Similar Citation: "IV. CRIMINAL VIOLATIONS

Based on the referral from the ITF to the Department of Justice on 26 July 2004, arrest warrants were issued against Official 1 and Official 2. Each charge is detailed below:

Count 1

That between the dates 14th March 2004 and 16th March 2004 inclusive, Official 1 and Official 2 acting in their capacity as officials of Airport Pristina, and acting in complicity with the intention to obtain an unlawful material benefit, presented a representative of the Company with a false statement of account claiming reduced storage fees, and in so doing, misled an authorized person to carry out an unlawful payment, thereby committing the offence of Fraud in Service in violation of Article 215, (1) and (2) of the Kosovo Criminal Code (KCC) punishable by imprisonment of one to ten years

{equivalent to Fraud in Office contrary to Article 341 (1) and (2) of Provisional Kosovo Criminal Code (PCCK)};

and Complicity, in violation of Article 22 of Criminal

In [8]:
import pandas as pd

# Load both Excel files
wikileaks_data = pd.read_excel('wikileaks_parsed_with_categories.xlsx')
processed_data = pd.read_excel('processed_wikileaks_parsed.xlsx')

# Ensure both DataFrames have the same case for matching columns
wikileaks_data['PDF Path'] = wikileaks_data['PDF Path'].str.strip()
processed_data['PDF Path'] = processed_data['PDF Path'].str.strip()
wikileaks_data['Text'] = wikileaks_data['Text'].str.strip()
processed_data['Text'] = processed_data['Text'].str.strip()

# Merge the dataframes on 'PDF Path' and 'Text'
merged_data = pd.merge(processed_data, wikileaks_data[['PDF Path', 'Text', 'Category']],
                       on=['PDF Path', 'Text'], how='left')

# Save the updated DataFrame to a new Excel file
merged_data.to_excel('processed_wikileaks_parsed_with_category.xlsx', index=False)

print("The 'Category' column has been successfully appended to the processed file.")


The 'Category' column has been successfully appended to the processed file.


In [9]:
import pandas as pd

# Load both Excel files
wikileaks_data = pd.read_excel('news_excerpts_parsed_with_categories.xlsx')
processed_data = pd.read_excel('processed_news_excerpts_parsed.xlsx')

# Ensure both DataFrames have the same case for matching columns
wikileaks_data['Link'] = wikileaks_data['Link'].str.strip()
processed_data['Link'] = processed_data['Link'].str.strip()
wikileaks_data['Text'] = wikileaks_data['Text'].str.strip()
processed_data['Text'] = processed_data['Text'].str.strip()

# Merge the dataframes on 'PDF Path' and 'Text'
merged_data = pd.merge(processed_data, wikileaks_data[['Link', 'Text', 'Category']],
                       on=['Link', 'Text'], how='left')

# Save the updated DataFrame to a new Excel file
merged_data.to_excel('processed_news_excerpts_parsed_with_category.xlsx', index=False)

print("The 'Category' column has been successfully appended to the processed file.")


The 'Category' column has been successfully appended to the processed file.


In [3]:
import pandas as pd

# Load the Excel file
input_file = "./data/sentencebert_results.xlsx"
df = pd.read_excel(input_file)

# Sort by 'Method', 'news_Link', and 'content_similarity' in descending order
df_sorted = df.sort_values(by=['Method', 'news_Link', 'content_similarity'], ascending=[True, True, False])

# Select the top 5 Wikileaks documents for each 'Method' and 'news_Link' group
df_top5 = df_sorted.groupby(['Method', 'news_Link']).head(5).reset_index(drop=True)

# Save the resulting DataFrame to a new Excel file
output_file = "sentencebert_top5_wikileaks_results.xlsx"
df_top5.to_excel(output_file, index=False)

print(f"Top 5 Wikileaks documents per group saved to {output_file}")


Top 5 Wikileaks documents per group saved to sentencebert_top5_wikileaks_results.xlsx


In [5]:
import pandas as pd

# Load the Excel file
df = pd.read_excel("./data/cited_judgments_with_news_articles.xlsx")

# Sort the dataframe by 'news_Link' and 'content_similarity' in descending order
df_sorted = df.sort_values(by=['news_Link', 'content_similarity'], ascending=[True, False])

# Group by 'news_Link' and select the top 5 rows for each group
df_top5 = df_sorted.groupby('news_Link').head(5).reset_index(drop=True)

# Save the result to a new Excel file
df_top5.to_excel("top5_cited_judgments_with_news_articles.xlsx", index=False)

print("Filtered Excel file with top 5 Wikileaks documents for each news excerpt has been saved.")

Filtered Excel file with top 5 Wikileaks documents for each news excerpt has been saved.


In [6]:
import pandas as pd

# Load the original Excel files
cited_judgments_file = "./data/top5_cited_judgments_with_news_articles.xlsx"
sentencebert_file = "./data/sentencebert_top5_wikileaks_results.xlsx"

# Read the cited judgments data
cited_df = pd.read_excel(cited_judgments_file)

# Sort by content similarity and keep the top 5 rows per news_Link
top5_cited_df = (
    cited_df.sort_values(by=["news_Link", "content_similarity"], ascending=[True, False])
    .groupby("news_Link")
    .head(5)
)

# Read the SentenceBERT results
top5_sentencebert_df = pd.read_excel(sentencebert_file)

# Merge the top 5 cited judgments with SentenceBERT results
# Assuming both datasets have the `news_Link` column for alignment
merged_df = pd.merge(top5_sentencebert_df, top5_cited_df, on="news_Link", how="outer")

# Save the merged results into a new Excel file
output_file = "combined_top5_results.xlsx"
merged_df.to_excel(output_file, index=False)

print(f"Combined file saved as {output_file}")


Combined file saved as combined_top5_results.xlsx
