In [1]:
from pathlib import Path
import sys
# Put the parent of the current working directory at the front of sys.path
# (only once), so that the `src` package imported below can be resolved.
BASE_DIR = str(Path.cwd().resolve().parents[0])
if BASE_DIR not in sys.path:
    sys.path.insert(0, BASE_DIR)
import src.news_nlp.config.paths as paths

from zipfile import ZipFile
import pandas as pd
import re

# Extract data

In [3]:
# Make sure the destination folder is present (no error if it already exists),
# then unpack every member of the compressed dataset into it.
paths.DIR_DATA_RAW.mkdir(exist_ok=True, parents=True)
with ZipFile(paths.DATA_COMPRESSED, mode="r") as archive:
    archive.extractall(path=paths.DIR_DATA_RAW)

# Load data

In [2]:
# Both splits are tab-separated files; read them with identical settings.
df_train, df_test = (
    pd.read_csv(tsv_path, sep="\t") for tsv_path in (paths.DF_TRAIN, paths.DF_TEST)
)

In [3]:
# Preview the training frame as loaded (pandas truncates the middle rows when rendering).
df_train

Unnamed: 0,title,content
0,"Singer, activist Joan Baez becomes Kennedy Cen...",Kennedy Center Honoree Joan Baez reflects on 6...
1,'Nothing but problems': Shipwreck tear-down en...,Demolition of a large cargo ship along the coa...
2,Report: At least 13 dead in Istanbul bombings,Report : At least 13 dead in Istanbul bombings...
3,Senate Republicans Pressure Joe Biden to Withd...,Several congressional Republicans are expressi...
4,The Unwelcome Return of the Real Purveyors of ...,With the mainstream media still obsessing abou...
...,...,...
89923,An Interview with Mark Blaxill on the Autism T...,NOTE : We 'll have an audio file to accompany ...
89924,Vietnam reconsiders methane-emitting rice amid...,Country says it can no longer be ‘ rice first ...
89925,Shaker furniture: Clean by design,The Hancock Shaker Village in western Massachu...
89926,"53 pot shop lottery winners announced, includi...","Following a year of acrimony and delays , stat..."


In [4]:
# Preview the test frame as loaded (pandas truncates the middle rows when rendering).
df_test

Unnamed: 0,title,content
0,Eye Opener: COVID on the rise again in parts o...,Eye Opener : COVID on the rise again in parts ...
1,Mall of America No Longer Delinquent on $1.4 B...,Mall of America has modified the terms of its ...
2,Judge asked to OK evidence of Ahmaud Arbery's ...,Attorneys for two Georgia men charged with cha...
3,Indian variant could threaten easing of restri...,"You thought it was all over , but it isn ’ t y..."
4,Carjackings surge in Chicago,There have been hundreds of carjackings in Chi...
...,...,...
10067,Sen. Cory Booker wants federal government to p...,Members of the U.S. Senate proposed a “ colleg...
10068,International outcry over Egypt’s conviction o...,"After a trial that dragged on for six months ,..."
10069,1/25: CBSN AM,U.S vaccine rollout uneven as cases soar in th...
10070,Young 12-Year-Olds Being Preyed Upon by Dodgy ...,Young 12-Year-Olds Being Preyed Upon by Dodgy ...


# EDA

In [5]:
# Summarize the training frame: row count, per-column non-null counts and dtypes.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89928 entries, 0 to 89927
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    89923 non-null  object
 1   content  89800 non-null  object
dtypes: object(2)
memory usage: 1.4+ MB


In [6]:
# Training rows whose article body ('content') is missing.
df_train.loc[df_train["content"].isna()]

Unnamed: 0,title,content
992,LIVE: High-Speed Police Chase In California; D...,
1949,Why nothing will stop Yemi Alade,
2109,REVEALED: Prosecution Asked Witness in Rittenh...,
2142,Arkansas governor signs law banning transgende...,
2759,Notable Deaths in 2021,
...,...,...
86379,WATCH: White House Briefing With Press Secreta...,
86696,"Bonus Edition: Richard Curtis, gerrymandering,...",
86937,WATCH: White House press secretary Jen Psaki h...,
88403,Notable Deaths in 2021,


In [7]:
# Training rows whose headline ('title') is missing.
df_train.loc[df_train["title"].isna()]

Unnamed: 0,title,content
10805,,"With the loss of Ed Arranga this week , I 've ..."
18384,,`` Texas Gov .\nGreg Abbott vows to defund sta...
33208,,"In today ’ s social media-dominated world , on..."
59599,,Donald Trump 's coup attempt — and especially ...
83475,,I published this on our Age of Autism Facebook...


In [8]:
# Summarize the test frame: row count, per-column non-null counts and dtypes.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10072 entries, 0 to 10071
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    10071 non-null  object
 1   content  10062 non-null  object
dtypes: object(2)
memory usage: 157.5+ KB


In [9]:
# Test rows whose article body ('content') is missing.
df_test.loc[df_test["content"].isna()]

Unnamed: 0,title,content
1803,Lives to remember: Those we've lost to coronav...,
1912,Watch live: Gov. Pritzker to announce a new pr...,
2900,Bleak futures fuel widespread protests by youn...,
3100,"Listen: Morano talks Green New Deal, climate l...",
4724,Public Reading of Scripture,
7580,Hillary Clinton Worried Cryptocurrencies Can “...,
8159,Victory for climate truth! Twitter has now pub...,
8574,"Celebrity attorney F. Lee Bailey, who defended...",
8785,Listen: Morano on Joe Piscopo Show on Biden ta...,
9062,Family of man killed during Kenosha protests f...,


In [10]:
# Test rows whose headline ('title') is missing.
df_test.loc[df_test["title"].isna()]

Unnamed: 0,title,content
1749,,The U.S. House Select Committee on the January...


In [11]:
# Keep only articles that have both a title and a body, then renumber rows from 0.
df_train = (
    df_train.loc[df_train[["title", "content"]].notna().all(axis=1)]
    .reset_index(drop=True)
)
df_test = (
    df_test.loc[df_test[["title", "content"]].notna().all(axis=1)]
    .reset_index(drop=True)
)

In [12]:
# Re-check the training frame after dropping rows with a missing title or content.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89795 entries, 0 to 89794
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    89795 non-null  object
 1   content  89795 non-null  object
dtypes: object(2)
memory usage: 1.4+ MB


In [13]:
# Re-check the test frame after dropping rows with a missing title or content.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10061 entries, 0 to 10060
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    10061 non-null  object
 1   content  10061 non-null  object
dtypes: object(2)
memory usage: 157.3+ KB


# Basic preprocessing

In [14]:
# Ensure a sequential 'news_id' key (0..n-1) exists as the first column.
# The index is named explicitly before reset_index() instead of renaming the
# auto-generated "index" column: pandas falls back to "level_0" when an "index"
# column is already present, in which case the rename would hit the wrong column.
# NOTE(review): ids are assigned after rows with missing values were dropped, so
# they do not match row positions in the raw files — confirm this is intended.
if "news_id" not in df_train.columns:
    df_train = df_train.reset_index(drop=True).rename_axis("news_id").reset_index()
if "news_id" not in df_test.columns:
    df_test = df_test.reset_index(drop=True).rename_axis("news_id").reset_index()

In [15]:
# Build the model input: headline and body joined as "<title>. <content>".
# Missing values are treated as empty strings so the concatenation never yields NaN.
for frame in (df_train, df_test):
    frame["text"] = frame["title"].fillna("") + ". " + frame["content"].fillna("")

In [16]:
def clean_text(text: str) -> str:
    """
    Normalize a raw article string.

    The text is lowercased, URL-like tokens (anything starting with ``http`` up
    to the next whitespace character) are removed, every run of whitespace
    (spaces, tabs, new lines) is collapsed into a single space, and leading and
    trailing whitespace is stripped.

    Args:
        text (str): The input text to be cleaned.
    Returns:
        str: The cleaned text.
    """
    text = text.lower()  # Convert to lowercase
    # Remove URLs BEFORE collapsing whitespace; doing it afterwards would leave
    # two consecutive spaces wherever a URL sat in the middle of a sentence.
    text = re.sub(r'http\S+', '', text)
    # \s also matches new lines and tabs, so one pass normalizes all whitespace.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [17]:
# Normalize the combined text of every article in both splits.
for frame in (df_train, df_test):
    frame["text"] = frame["text"].apply(clean_text)

In [18]:
# Preview the training frame with the added 'news_id' and cleaned 'text' columns.
df_train

Unnamed: 0,news_id,title,content,text
0,0,"Singer, activist Joan Baez becomes Kennedy Cen...",Kennedy Center Honoree Joan Baez reflects on 6...,"singer, activist joan baez becomes kennedy cen..."
1,1,'Nothing but problems': Shipwreck tear-down en...,Demolition of a large cargo ship along the coa...,'nothing but problems': shipwreck tear-down en...
2,2,Report: At least 13 dead in Istanbul bombings,Report : At least 13 dead in Istanbul bombings...,report: at least 13 dead in istanbul bombings....
3,3,Senate Republicans Pressure Joe Biden to Withd...,Several congressional Republicans are expressi...,senate republicans pressure joe biden to withd...
4,4,The Unwelcome Return of the Real Purveyors of ...,With the mainstream media still obsessing abou...,the unwelcome return of the real purveyors of ...
...,...,...,...,...
89790,89790,An Interview with Mark Blaxill on the Autism T...,NOTE : We 'll have an audio file to accompany ...,an interview with mark blaxill on the autism t...
89791,89791,Vietnam reconsiders methane-emitting rice amid...,Country says it can no longer be ‘ rice first ...,vietnam reconsiders methane-emitting rice amid...
89792,89792,Shaker furniture: Clean by design,The Hancock Shaker Village in western Massachu...,shaker furniture: clean by design. the hancock...
89793,89793,"53 pot shop lottery winners announced, includi...","Following a year of acrimony and delays , stat...","53 pot shop lottery winners announced, includi..."


In [19]:
# Preview the test frame with the added 'news_id' and cleaned 'text' columns.
df_test

Unnamed: 0,news_id,title,content,text
0,0,Eye Opener: COVID on the rise again in parts o...,Eye Opener : COVID on the rise again in parts ...,eye opener: covid on the rise again in parts o...
1,1,Mall of America No Longer Delinquent on $1.4 B...,Mall of America has modified the terms of its ...,mall of america no longer delinquent on $1.4 b...
2,2,Judge asked to OK evidence of Ahmaud Arbery's ...,Attorneys for two Georgia men charged with cha...,judge asked to ok evidence of ahmaud arbery's ...
3,3,Indian variant could threaten easing of restri...,"You thought it was all over , but it isn ’ t y...",indian variant could threaten easing of restri...
4,4,Carjackings surge in Chicago,There have been hundreds of carjackings in Chi...,carjackings surge in chicago. there have been ...
...,...,...,...,...
10056,10056,Sen. Cory Booker wants federal government to p...,Members of the U.S. Senate proposed a “ colleg...,sen. cory booker wants federal government to p...
10057,10057,International outcry over Egypt’s conviction o...,"After a trial that dragged on for six months ,...",international outcry over egypt’s conviction o...
10058,10058,1/25: CBSN AM,U.S vaccine rollout uneven as cases soar in th...,1/25: cbsn am. u.s vaccine rollout uneven as c...
10059,10059,Young 12-Year-Olds Being Preyed Upon by Dodgy ...,Young 12-Year-Olds Being Preyed Upon by Dodgy ...,young 12-year-olds being preyed upon by dodgy ...


# Export

In [20]:
# Persist both preprocessed splits as Parquet, without writing the row index.
for frame, destination in (
    (df_train, paths.DF_TRAIN_CLEAN),
    (df_test, paths.DF_TEST_CLEAN),
):
    frame.to_parquet(destination, index=False)