In [263]:
from tqdm import tqdm
import pandas as pd
import re
from langdetect import detect
from translate import Translator
from tqdm import tqdm; tqdm.pandas()
import os
import locale
from datetime import datetime
from dateutil import parser
from textblob import TextBlob

## merging data

In [264]:
countries = ["NL", "ES", "UK", "FR", "DE", "IT"]

df = []

length = 0

for c in tqdm(countries):
    globals()[c] = pd.read_csv(f"data\\{c}.csv")
    globals()[c]["country"] = c
    
    length += len(globals()[c])
    
    df.append(globals()[c])

df = pd.concat(df, ignore_index = True)
df = df.loc[:, ~df.columns.str.contains("^Unnamed")]

print(f"Length of df equals sum of seperate dataframes = {len(df) == length}.")

100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:01<00:00,  4.57it/s]

Length of df equals sum of seperate dataframes = True.





## filtering wrong columns

In [265]:
round(1 - df["datePublished"].str.contains(r"\d", na=False).mean(), 2)

0.04

In [266]:
df.to_csv("data\\merged.csv")

In [267]:
df = pd.read_csv("data\\merged.csv", index_col=0)

In [268]:
def fix_columns(df):
    for idx in df.index:
        if pd.isnull(df.loc[idx, 'news_source']):
            df.loc[idx, 'news_source'] = df.loc[idx, 'datePublished']
            df.loc[idx, 'datePublished'] = df.loc[idx, 'region']
            df.loc[idx, 'region'] = None
    return df

df_fixed = fix_columns(df)

In [269]:
round(1 - df["datePublished"].str.contains(r"\d", na=False).mean(), 2)

0.01

In [270]:
df_filtered = df_fixed[df_fixed["datePublished"].str.contains(r"\d", na=False)]

In [271]:
df_filtered.news_source.unique()

array(['De Telegraaf', 'NRC Handelsblad',
       'De Grote Prijs van Nederland wordt vanaf 2020 weer verreden op Circuit Zandvoort sport ; Formule 1 in Zandvoort',
       'El Pais', 'El Mundo', 'La bicis vuelven a su origen.',
       'Una ciudad a pedales.', 'The Daily Telegraph (London)',
       'The Daily Telegraph (LONDON)', 'The Guardian',
       'THE DAILY TELEGRAPH(LONDON)', 'Libération', 'Le Figaro',
       'Allgemeine Zeitung (Germany)', 'Der Spiegel', 'La Stampa',
       'ItaliaOggi'], dtype=object)

In [272]:
df_filtered = df_filtered[~df_filtered["news_source"].str.contains(r"[.;]", na=False)]

## streamlining newspaper names

In [273]:
def rename_newspaper(x):
    x = str(x).lower()
    if "telegraph" in x:
        x = "the daily telegraph"
    elif "allgemeine zeitung" in x:
        x = "allgemeine zeitung"
    return x

In [274]:
df_filtered["news_source"] = df.news_source.apply(lambda x: rename_newspaper(x))

In [275]:
df_filtered.groupby("news_source").size()

news_source
allgemeine zeitung     962
de telegraaf           536
der spiegel             20
el mundo               306
el pais                589
italiaoggi             400
la stampa              499
le figaro              655
libération             226
nrc handelsblad        188
the daily telegraph    973
the guardian            11
dtype: int64

In [276]:
df_filtered.to_csv("data\\master_data.csv")

In [277]:
df_master = pd.read_csv("data\\master_data_translated.csv", index_col=0)

## parsing dates

In [279]:
def extract_and_format_date(date_str):
    try:
        date_obj = parser.parse(date_str, fuzzy=True)
        return date_obj.strftime("%d/%m/%Y")
    except Exception as e:
        print(f"Error parsing date '{date_str}': {e}")
        return None
    
df_master["datePublished_clean"] = df_master.datePublished.apply(extract_and_format_date)

In [283]:
df_master.to_csv("data\\supreme_data.csv")

In [284]:
df_supreme = pd.read_csv("data\\supreme_data.csv", index_col=0)

## sentiment scores

In [235]:
def get_sentiment_spacey(text):
    blob = TextBlob(text).sentiment.polarity
    return blob.sentiment.polarity

In [243]:
df_supreme["sentiment_spacey"] = df_supreme.translated.progress_apply(lambda x: TextBlob(str(x)).sentiment.polarity)

100%|█████████████████████████████████████████████████████████████████████████████| 5365/5365 [00:31<00:00, 170.01it/s]


## topic modelling

In [170]:
docs = df_supreme.translated.to_list()
representation_model = KeyBERTInspired()
topic_model = BERTopic(representation_model=representation_model, language="multilingual")
topics, probs = topic_model.fit_transform(docs)
topic_model.get_document_info(docs)["Name"]
df_supreme["topic_bert"] = topic_model.get_document_info(docs)["Name"].values

NameError: name 'KeyBERTInspired' is not defined

## bin