# Notebook Purpose

The purpose of this notebook is to combine all the extracted features into one CSV file so that we can manually validate the data that was extracted programmatically via regex or LLMs.

## Download Packages Required

- Install `pandas` package.

In [None]:
!pip install pandas

## Imports

In [None]:
import pandas as pd
from IPython.display import display

## Attributes

In [None]:
# Base data folder (relative path) and the sub-folder holding processed outputs.
data_folder_path = "../../../data"
processed_data_folder_path = data_folder_path + "/processed"

# Original CSV files; their columns are tagged with the `og_` prefix.
local_news_articles_csv = data_folder_path + "/local_news_articles.csv"
police_press_releases_csv = data_folder_path + "/police_press_releases.csv"
og_prefix = "og_"

# Datetime extraction via regex; columns are tagged with the `regxdt_` prefix.
regex_dtime_news_articles_csv = processed_data_folder_path + "/road_accidents_with_datetime.csv"
regex_dtime_press_releases_csv = processed_data_folder_path + "/police_releases_with_datetime.csv"
regex_dtime_prefix = "regxdt_"

# Feature extraction via LLM; columns are tagged with the `llm_` prefix.
llm_news_articles_csv = processed_data_folder_path + "/llm_local_news_articles.csv"
llm_press_releases_csv = processed_data_folder_path + "/llm_press_releases.csv"
llm_prefix = "llm_"

# TODO: wait for Paul to extract town/street of both CSVs to include in feature extraction

# Destination CSV files for the combined DataFrames.
combined_news_articles_csv = processed_data_folder_path + "/combined_news_articles.csv"
combined_press_releases_csv = processed_data_folder_path + "/combined_press_releases.csv"

## Combine all DataFrames into one

Various different attempts have been made to extract features.

- Datetime feature extraction using Regex.
- Town/Street feature extraction using Regex.
- General feature extraction using LLM.

### Local News Articles

Combination of all dataframes for the local news articles.

#### Original CSV File

We select only the columns of importance and add `og_` prefix to the column names.

This way, when we join the DataFrames together, we will know which DataFrame each column comes from.

In [None]:
# Load the original news articles, keep only the columns of interest and tag
# every column except the join key (`article_id`) with the `og_` prefix.
articles_df = (
    pd.read_csv(local_news_articles_csv)
    # Left out on purpose: `author_name` (not interested in the name of the
    # author) and `categories` (always an empty set).
    .loc[:, [
        "article_id",
        "url",
        "source_name",
        "source_url",
        "title",
        "subtitle",
        "publish_date",
        "content",
        "top_image_url",
        "top_image_caption",
        "created_at",
        "tags",
    ]]
    # `article_id` stays unprefixed so it can be used as the merge key later.
    .rename(columns=lambda name: name if name == "article_id" else f"{og_prefix}{name}")
)

display(articles_df)

#### Datetime extraction with regex

We select only the columns of importance and add `regxdt_` prefix to the column names.

In [None]:
# Load the regex datetime extraction for the news articles and keep only the
# join key plus the extracted datetime, tagged with the `regxdt_` prefix.
regex_dtime_articles_df = (
    pd.read_csv(regex_dtime_news_articles_csv)
    .loc[:, ["article_id", "accident_datetime"]]
    # `article_id` is left as-is so it can serve as the merge key.
    .rename(columns={"accident_datetime": regex_dtime_prefix + "accident_datetime"})
)

display(regex_dtime_articles_df)

#### Feature Extraction with LLM

We select only the columns of importance and add `llm_` prefix to the column names.

In [None]:
# Load the LLM feature extraction for the news articles.
llm_articles_df = pd.read_csv(llm_news_articles_csv)

# Keep only the columns of interest, rename the id column to `article_id` so it
# matches the merge key of the other frames, and tag the extracted features
# with the `llm_` prefix.
llm_articles_df = (
    llm_articles_df[[
        "id_column",  # article_id
        "is_accident",
        "street",
        "city",
        "number_injured",
        "accident_severity",
        "drivers",
    ]]
    .rename(columns={
        "id_column": "article_id",
        "is_accident": f"{llm_prefix}is_accident",
        "street": f"{llm_prefix}street",
        "city": f"{llm_prefix}city",
        "number_injured": f"{llm_prefix}number_injured",
        "accident_severity": f"{llm_prefix}accident_severity",
        # Listed once: a repeated key in a dict literal is silently overwritten.
        "drivers": f"{llm_prefix}drivers",
    })
)

display(llm_articles_df)

#### Combined News Articles

Combine news articles DataFrames together.

In [None]:
# Left-join the regex and LLM extractions onto the original articles by
# `article_id`, so every original article is kept and extraction columns are
# empty (NaN) where no matching row exists.
combined_articles_df = pd.merge(
    pd.merge(articles_df, regex_dtime_articles_df, how="left", on="article_id"),
    llm_articles_df,
    how="left",
    on="article_id",
)

display(combined_articles_df)

# NOTE(review): with the default `index=True`, the row index is also written
# as an unnamed first column — confirm whether that is wanted in the output.
combined_articles_df.to_csv(combined_news_articles_csv)

### Police Press Releases

Combination of all dataframes for the police press releases.

#### Original CSV File

In [None]:
# Load the original police press releases.
police_releases_df = pd.read_csv(police_press_releases_csv)

# Add a 1-based surrogate key as the first column, using similar
# pre-processing to the one used by Isaac to generate the surrogate key.
police_releases_df.insert(0, "release_id", range(1, len(police_releases_df) + 1))

# Keep only the columns of interest and tag every column except the join key
# (`release_id`) with the `og_` prefix.
police_releases_df = (
    police_releases_df
    .loc[:, [
        "release_id",
        "title",
        "content",
        "date_published",
        "date_modified",
    ]]
    .rename(columns=lambda name: name if name == "release_id" else f"{og_prefix}{name}")
)

display(police_releases_df)

#### Datetime extraction with regex

In [None]:
# Load the regex datetime extraction for the police press releases and keep
# only the join key plus the extracted datetime, tagged with `regxdt_`.
regex_dtime_police_releases_df = (
    pd.read_csv(regex_dtime_press_releases_csv)
    .loc[:, ["release_id", "accident_datetime"]]
    # `release_id` is left as-is so it can serve as the merge key.
    .rename(columns={"accident_datetime": regex_dtime_prefix + "accident_datetime"})
)

display(regex_dtime_police_releases_df)

#### Feature Extraction with LLM

In [None]:
# Load the LLM feature extraction for the police press releases.
llm_police_releases_df = pd.read_csv(llm_press_releases_csv)

# Keep only the columns of interest, rename the id column to `release_id` so it
# matches the merge key of the other frames, and tag the extracted features
# with the `llm_` prefix.
llm_police_releases_df = (
    llm_police_releases_df[[
        "id_column",  # release_id
        "is_accident",
        "street",
        "city",
        "number_injured",
        "accident_severity",
        "drivers",
    ]]
    .rename(columns={
        "id_column": "release_id",
        "is_accident": f"{llm_prefix}is_accident",
        "street": f"{llm_prefix}street",
        "city": f"{llm_prefix}city",
        "number_injured": f"{llm_prefix}number_injured",
        "accident_severity": f"{llm_prefix}accident_severity",
        # Listed once: a repeated key in a dict literal is silently overwritten.
        "drivers": f"{llm_prefix}drivers",
    })
)

display(llm_police_releases_df)

#### Combined Police Press Releases

In [None]:
# Left-join the regex and LLM extractions onto the original press releases by
# `release_id`, so every original release is kept and extraction columns are
# empty (NaN) where no matching row exists.
combined_police_releases_df = pd.merge(
    pd.merge(police_releases_df, regex_dtime_police_releases_df, how="left", on="release_id"),
    llm_police_releases_df,
    how="left",
    on="release_id",
)

display(combined_police_releases_df)

# NOTE(review): with the default `index=True`, the row index is also written
# as an unnamed first column — confirm whether that is wanted in the output.
combined_police_releases_df.to_csv(combined_press_releases_csv)