In [1]:

# imports
import pandas as pd
import glob
import os
import nltk
from nltk.tokenize import word_tokenize
from transformers import CamembertTokenizer 
import numpy as np
import tqdm

nltk.download('punkt')
nltk.download('punkt_tab')

camembert_tokenizer = CamembertTokenizer.from_pretrained("camembert/camembert-base-ccnet") 

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /home/lilla.conte/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/lilla.conte/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [39]:
# calculate average number of tokens per article (per row) in annotated 'rixes' sample data

csv_file = '../data/annotated_dataset_deduped.csv'
df = pd.read_csv(csv_file)

if 'text' in df.columns:
    token_counts = df['text'].fillna('').apply(lambda x: len(camembert_tokenizer.tokenize(x)))
    
    average = token_counts.mean()
    median = np.median(token_counts)
    over_500_count = (token_counts > 500).sum()
    percent_over_500 = (over_500_count / len(token_counts)) * 100 if len(token_counts) > 0 else 0

    print(f"Average number of tokens per row: {average:.2f}")
    print(f"Median: {median:.2f}")
    print(f"Percentage of rows with > 500 tokens: {percent_over_500:.2f}%")
else:
    print("Column 'text' not found in the CSV file.")

Average number of tokens per row: 581.93
Median: 400.00
Percentage of rows with > 500 tokens: 44.75%


In [38]:
# calculate average number of tokens per article (per row) in 'italiens/belges' dataset

from tqdm import tqdm

folder_path = '/store/retronews/etrangers'
csv_files = [f for f in glob.glob(os.path.join(folder_path, '*.csv')) if not f.endswith("pattern.csv")]

total_tokens = 0
total_rows = 0
all_token_counts = []
over_500_percentages = []

for file in tqdm(csv_files, desc="Processing CSV files"):
    filename = os.path.basename(file)
    try:
        df = pd.read_csv(file, encoding='utf-8', low_memory=False)

        if 'text' in df.columns and not df.empty:
            df['text'] = df['text'].fillna('')
            token_counts = df['text'].apply(lambda x: len(camembert_tokenizer.tokenize(x)))

            total_tokens += token_counts.sum()
            total_rows += token_counts.count()
            all_token_counts.extend(token_counts.tolist())

            over_500 = (token_counts > 500).sum()
            percent_over_500 = (over_500 / len(token_counts)) * 100 if len(token_counts) > 0 else 0
            over_500_percentages.append(percent_over_500)

    except Exception as e:
        print(f"Error processing {filename}: {e}")

if over_500_percentages:
    avg_percent_over_500 = np.mean(over_500_percentages)
    print(f"\nAverage percentage of rows with > 500 tokens across files: {avg_percent_over_500:.2f}%")

if total_rows > 0:
    true_average = total_tokens / total_rows
    global_median = np.median(all_token_counts)
    print(f"\nTrue average tokens per row (across all files): {true_average:.2f}")
    print(f"Global median tokens per row: {global_median:.2f}")
else:
    print("\nNo valid rows found.")

Processing CSV files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [08:04<00:00, 10.77s/it]


Average percentage of rows with > 500 tokens across files: 11.23%

True average tokens per row (across all files): 206.66
Global median tokens per row: 84.00





In [33]:
# calculate the number and percentage of events annotated as 'relevant' in annotated 'rixes' sample data

csv_file = '../data/annotated_dataset_deduped.csv'
df = pd.read_csv(csv_file) 

if 'relevant' in df.columns:
    total_rows = len(df)
    relevant_count = df['relevant'].sum()  # sums all 1s, since values are 0 or 1
    relevant_percentage = (relevant_count / total_rows) * 100 if total_rows > 0 else 0

    print(f"Number of relevant events: {relevant_count}")
    print(f"Percentage of relevant events: {relevant_percentage:.2f}%")
else:
    print("Column 'relevant' not found in the CSV file.")



Number of relevant events: 176
Percentage of relevant events: 59.66%


In [10]:
# compare bert tokenizer with ntlk tokenizer

from transformers import CamembertTokenizer 
from nltk.tokenize import word_tokenize
import nltk

doc = "Rixe sanglante Une rixe sanglante a éclaté la nuit dernière a Marseille dans le quartier avoisinant le vieux port, entre des ma rins français et des journaliers italiens."

camembert_tokens = camembert_tokenizer.tokenize(doc)
nltk_tokens = word_tokenize(doc)

# results
print("\nOriginal sentence:")
print(doc)

print("\nCamemBERT tokens:")
print(camembert_tokens)
print(len(camembert_tokens))

print("\nNLTK tokens:")
print(nltk_tokens)
print(len(nltk_tokens))


[nltk_data] Downloading package punkt to
[nltk_data]     /home/lilla.conte/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Original sentence:
Rixe sanglante Une rixe sanglante a éclaté la nuit dernière a Marseille dans le quartier avoisinant le vieux port, entre des ma rins français et des journaliers italiens.

CamemBERT tokens:
['▁R', 'ix', 'e', '▁sanglante', '▁Une', '▁ri', 'x', 'e', '▁sanglante', '▁a', '▁éclaté', '▁la', '▁nuit', '▁dernière', '▁a', '▁Marseille', '▁dans', '▁le', '▁quartier', '▁avoisinant', '▁le', '▁vieux', '▁port', ',', '▁entre', '▁des', '▁ma', '▁', 'rin', 's', '▁français', '▁et', '▁des', '▁journalier', 's', '▁italiens', '.']
37

NLTK tokens:
['Rixe', 'sanglante', 'Une', 'rixe', 'sanglante', 'a', 'éclaté', 'la', 'nuit', 'dernière', 'a', 'Marseille', 'dans', 'le', 'quartier', 'avoisinant', 'le', 'vieux', 'port', ',', 'entre', 'des', 'ma', 'rins', 'français', 'et', 'des', 'journaliers', 'italiens', '.']
30


In [50]:
# calculate how many rows have fewer than x number of words in rixe sample

csv_file = '../data/annotated_dataset_deduped.csv'
df = pd.read_csv(csv_file)

word_limit = 15

if 'text' in df.columns:
    word_counts = df['text'].fillna('').apply(lambda x: len(x.split()))
    percent_under_x = (word_counts < word_limit).mean() * 100
    print(f"Percentage of rows with < {word_limit} words: {percent_under_x:.2f}%")
else:
    print("'text' not found in the CSV file.")


Percentage of rows with < 13 words: 0.34%


In [52]:
# calculate how many rows have fewer than x number of words in italiens/belges sample

folder_path = '/store/retronews/etrangers'
csv_files = [f for f in glob.glob(os.path.join(folder_path, '*.csv')) if not f.endswith("pattern.csv")]

total_rows = 0
percent_under_x = 0
word_limit = 5

for file in tqdm(csv_files, desc="processing files"):
    try:
        df = pd.read_csv(file, encoding='utf-8', low_memory=False)
        if 'text' in df.columns and not df.empty:
            df['text'] = df['text'].fillna('')
            word_counts = df['text'].apply(lambda x: len(x.split()))
            percent_under_x += (word_counts < word_limit).sum()
            total_rows += len(word_counts)
    except Exception as e:
        print(f"Error processing {file}: {e}")

if total_rows > 0:
    percent_under_10 = (percent_under_x / total_rows) * 100
    print(f"\nOverall percentage of rows with < {word_limit} words: {percent_under_10:.2f}%")
else:
    print("no valid rows found.")

processing files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:26<00:00,  1.67it/s]


Overall percentage of rows with < 5 words: 5.11%





In [3]:
# calculate the percentage of events which are relevant in the rixe sample but also present in the archives database

rixes_path = '../data/annotated_dataset_deduped.csv'
archives_path = '../data/exploded_archives_annotated.csv'

df1 = pd.read_csv(rixes_path)
df2 = pd.read_csv(archives_path)

if "uniqueid" in df1.columns and "relevant" in df1.columns and "retronews_uniqueid" in df2.columns:
    relevant_df1 = df1[df1['relevant'] == 1]
    matching_ids = set(df2['retronews_uniqueid'])
    matched_relevant = relevant_df1[relevant_df1['uniqueid'].isin(matching_ids)]

# calculate percentage of overlap
    total_relevant = len(relevant_df1)
    matched_count = len(matched_relevant)
    percentage_events_overlap = (matched_count / total_relevant) * 100 if total_relevant > 0 else 0

    print(f"Percentage of overlapping events {percentage_events_overlap}")


Percentage of overlapping events 0.5681818181818182
