# Imports

In [None]:
import pandas as pd
import lzma
from bs4 import BeautifulSoup
import unicodedata
import re

# Unzip

In [None]:
# Path to the .tsv.xz file
compressed_file_path = "data/00_downloads/dataset.tsv.xz"

# Extract and read the .tsv.xz file directly into a pandas DataFrame
with lzma.open(compressed_file_path) as file:
    df = pd.read_csv(file, delimiter='\t')

# Keep only the first row for each content_id.
# The explicit .copy() gives df_unique its own data: without it pandas may
# treat the result as derived from df and emit SettingWithCopyWarning when a
# column is later assigned on it (df_unique['text'] = ...).
df_unique = df.drop_duplicates(subset=['content_id'], keep='first').copy()


# Split into Paragraphs

In [None]:
# Step 1: Extract the relevant columns
# 'content' is the column that is split into paragraphs; the other five are
# carried along so each paragraph can keep its document-level metadata.
text_column = df_unique['content']
(
    id_column,
    medium_code_column,
    doctype_column,
    language_column,
    head_column,
) = (
    df_unique[column_name]
    for column_name in ('id', 'medium_code', 'doctype', 'language', 'head')
)

# Step 2: Function to extract paragraphs from the text
def extract_paragraphs(text):
    """Return the text of every <p> element found in an HTML string.

    Args:
        text: HTML markup to parse. Non-string values (for example the NaN
            pandas uses for an empty cell, or None) are treated as having
            no paragraphs.

    Returns:
        A list with one string per <p> element, in document order; an empty
        list when there is no <p> element or `text` is not a string.
    """
    # BeautifulSoup raises TypeError on float/None input, so bail out early
    # instead of letting one empty cell abort the whole column-wide apply().
    if not isinstance(text, str):
        return []
    soup = BeautifulSoup(text, 'html.parser')
    return [p.get_text() for p in soup.find_all('p')]

# Step 3: Apply the function to each row and create chunks
df_unique['text'] = text_column.apply(extract_paragraphs)

# Step 4: Flatten the list of paragraphs and add the original 'id' as a column
# One record per paragraph; the document-level values are repeated on every
# paragraph of that document. A document with no paragraphs yields no record.
output_columns = ('id', 'text', 'medium_code', 'doctype', 'language', 'head')
paragraph_records = [
    (original_id, paragraph, medium_code, doctype, language, head)
    for original_id, medium_code, doctype, language, head, paragraph_list in zip(
        id_column,
        medium_code_column,
        doctype_column,
        language_column,
        head_column,
        df_unique['text'],
    )
    for paragraph in paragraph_list
]

# Step 5: Create a DataFrame for the Paragraphs Dataset
# Built column by column (a dict of lists) so each column's dtype is inferred
# from its own values.
df_paragraphs = pd.DataFrame({
    column: [record[position] for record in paragraph_records]
    for position, column in enumerate(output_columns)
})

# Remove Paragraphs that don't include one of our keywords

In [6]:
# List of keywords
# Grouped by language, 15 terms each; the combined list keeps the order
# German, French, Italian.
keywords_de = [
    "Lebensmittelsystem",
    "Nachhaltigkeit",
    "Landwirtschaft",
    "Klimawandel",
    "Agrarpolitik",
    "Biodiversität",
    "Lebensmittelsicherheit",
    "Regionalität",
    "Agroökologie",
    "Pestizidfreiheit",
    "Ernährung",
    "Treibhausgas",
    "Biolandbau",
    "Importabhängigkeit",
    "Ernährungspolitik",
]
keywords_fr = [
    "Système alimentaire",
    "Durabilité",
    "Agriculture",
    "Climat",
    "Politique agricole",
    "Biodiversité",
    "Sécurité alimentaire",
    "Localité",
    "Agroécologie",
    "pesticides",
    "Alimentation",
    "Gaz à effet de serre",
    "Agriculture biologique",
    "Importation",
    "Politique alimentaire",
]
keywords_it = [
    "Sistema alimentare",
    "Sostenibilità",
    "Agricoltura",
    "Clima",
    "Politica agricola",
    "Biodiversità",
    "Sicurezza alimentare",
    "Località",
    "Agroecologia",
    "pesticidi",
    "Alimentazione",
    "Gas serra",
    "Agricoltura biologica",
    "Importazioni",
    "Politica alimentare",
]
keywords = keywords_de + keywords_fr + keywords_it

# Convert the list of keywords to a Pandas Series
keywords_series = pd.Series(keywords)

In [7]:
# Alias, not a copy: df and df_paragraphs refer to the same DataFrame object,
# so in-place changes made through df are also visible through df_paragraphs.
# This also rebinds the name df, which earlier held the raw dataset.
df = df_paragraphs

# Function to normalize and remove accents
def normalize(text):
    """Return `text` lower-cased with accents and other non-ASCII characters removed.

    The string is decomposed with Unicode NFKD so accented letters split into
    a base letter plus combining marks; encoding to ASCII with errors ignored
    then drops the marks along with any character that has no ASCII form.

    Args:
        text: Value to normalize.

    Returns:
        The normalized string, or '' when `text` is not a string.
    """
    if not isinstance(text, str):
        return ''  # Non-string values (None, NaN, numbers) normalize to empty
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ASCII', 'ignore').decode('utf-8')
    return ascii_only.lower()


# Accent-free, lower-cased version of every paragraph, used only for matching
df['text_normalized'] = df['text'].apply(normalize)

# Keywords go through the same normalization so both sides compare alike
normalized_keywords = list(map(normalize, keywords))

# One alternation regex; each keyword is escaped so it matches literally
pattern = '|'.join(map(re.escape, normalized_keywords))

# True for rows whose normalized text contains at least one keyword
# (substring match); missing values count as no match
mask = df['text_normalized'].str.contains(pattern, case=False, na=False)

# Keep the matching rows and discard the helper column again
filtered_df = df.loc[mask].drop(columns=['text_normalized'])

# Number the surviving paragraphs 1..N in their current order
filtered_df = filtered_df.assign(unique_id=range(1, len(filtered_df) + 1))


## Saving

In [None]:
# Save the paragraphs to a new CSV
from pathlib import Path

output_path = Path('data/01_text_data/paragraphs.csv')
# to_csv does not create missing directories; make sure the target folder
# exists so a fresh checkout does not fail at the very last step.
output_path.parent.mkdir(parents=True, exist_ok=True)
filtered_df.to_csv(output_path, index=False)