In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import json

In [None]:
def get_csv(file_path):
    """Read one semicolon-delimited newspaper CSV into a DataFrame.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file's contents without the ``'Unnamed: 0'`` column (the name
        pandas gives to a header-less first column, typically a saved index).
    """
    frame = pd.read_csv(file_path, sep=';')
    # errors="ignore": a file that was saved without the index column is
    # returned unchanged instead of aborting the whole load with a KeyError.
    return frame.drop(columns=['Unnamed: 0'], errors='ignore')

def load_dataframes(base_path, file_prefix, start_year, end_year):
    """Load and stack the yearly CSV files of one newspaper.

    Files are expected at ``{base_path}/{file_prefix}_{year}.csv`` for every
    year from ``start_year`` through ``end_year`` (both inclusive). Each file
    is read with ``get_csv``; a tqdm progress bar tracks the years.

    Returns a single DataFrame with all years concatenated in chronological
    order and a freshly numbered index.
    """
    years = range(start_year, end_year + 1)
    yearly_frames = [
        get_csv(f"{base_path}/{file_prefix}_{year}.csv") for year in tqdm(years)
    ]
    return pd.concat(yearly_frames, ignore_index=True)

# Folder holding one "<prefix>_<year>.csv" file per newspaper and year
base_path = "D:/UU/Kranten"

# (file prefix, first year, last year) per newspaper; year ranges are
# inclusive and differ between titles
newspaper_sources = [
    ("Parool", 1945, 1995),
    ("Trouw", 1946, 1995),
    ("LimburgsDagblad", 1945, 1994),
    ("AmigoeCuracao", 1946, 1995),
]

# Load the newspapers in the listed order, binding each combined frame to its own name
parool_df, trouw_df, ld_df, ac_df = (
    load_dataframes(base_path, prefix, first_year, last_year)
    for prefix, first_year, last_year in newspaper_sources
)

In [None]:
# Stack the four per-newspaper frames into one frame with a fresh running index
all_news_df = pd.concat(
    objs=(parool_df, trouw_df, ld_df, ac_df),
    ignore_index=True,
)

In [None]:
# The per-newspaper frames are no longer needed once merged; release each
# name so the memory they hold can be reclaimed
del parool_df
del trouw_df
del ld_df
del ac_df

In [None]:
# Sanity check: which newspaper labels are present in the merged frame
all_news_df['newspaper'].unique()

In [None]:
# Inspect the distinct genre labels before filtering on them
all_news_df['genre'].unique()

In [None]:
# Keep only the rows whose genre is 'artikel'.
# .copy() makes the result an independent frame: a column is added to it
# further down, and assigning into a filtered selection otherwise raises
# pandas' SettingWithCopyWarning (at the cost of one extra temporary copy).
all_news_df = all_news_df[all_news_df['genre'] == 'artikel'].copy()

In [None]:
def check_for_book_review(text):
    """Heuristically decide whether ``text`` contains a book review.

    The value is converted with ``str()`` and lower-cased, so non-string
    input (``None``, NaN, numbers) is accepted. Returns ``True`` only when
    all three markers occur somewhere in the text: "blz", the guilder
    sign "ƒ" and "isbn"; otherwise ``False``.
    """
    lowered = str(text).lower()
    required_markers = ("blz", 'ƒ', 'isbn')
    return all(marker in lowered for marker in required_markers)

# Register progress_apply on pandas objects so the scan below shows a progress bar
tqdm.pandas()

# Run the book-review heuristic over every 'Content' value and store the
# resulting flags in a new 'contains_book_review' column
book_review_flags = all_news_df['Content'].progress_apply(check_for_book_review)
all_news_df['contains_book_review'] = book_review_flags

In [None]:
# Number of articles flagged as containing a book review
int((all_news_df['contains_book_review'] == 1).sum())

In [None]:
# Flagged articles per newspaper
flagged_mask = all_news_df['contains_book_review'] == 1
all_news_df.loc[flagged_mask].groupby('newspaper').size()

In [None]:
# Show example: full text of the first flagged article
all_news_df.loc[all_news_df['contains_book_review'] == 1, 'Content'].iloc[0]

In [None]:
# Filter the dataframe: flagged articles only, split into Parool and Trouw
is_book_review = all_news_df['contains_book_review'] == 1
filtered_parool_df = all_news_df[is_book_review & (all_news_df['newspaper'] == 'Parool')]
filtered_trouw_df = all_news_df[is_book_review & (all_news_df['newspaper'] == 'Trouw')]

In [None]:
# Report how many flagged articles were kept for Parool and for Trouw
print(f"Number of samples from Parool: {len(filtered_parool_df)}, from Trouw: {len(filtered_trouw_df)}")

In [None]:
# Prepare data for Label Studio: one {"text": ...} record per flagged article
reviews_parool = [{"text": content} for content in filtered_parool_df['Content'].to_list()]
reviews_trouw = [{"text": content} for content in filtered_trouw_df['Content'].to_list()]

# Save data: one indented JSON file per newspaper
for out_path, records in (
    ('D:/UU/parool_sampled_reviews.json', reviews_parool),
    ('D:/UU/trouw_sampled_reviews.json', reviews_trouw),
):
    with open(out_path, 'w') as f:
        json.dump(records, f, indent=2)