## NLP Final - Notebook #1 (Article Cleanup)
Author: Natalie Kim

Quarter: Winter 2025

### Loading initial data

In [1]:
# load data
import pandas as pd

# define url
url = 'https://storage.googleapis.com/msca-bdp-data-open/news_final_project/news_final_project.parquet'

df = pd.read_parquet(url, engine='pyarrow')

In [2]:
df.head(10)

Unnamed: 0,url,date,language,title,text
0,http://www.dataweek.co.za/21690r,2024-04-05,en,Flash for AI - 28 March 2024 - EBV Electrolink...,\nFlash for AI - 28 March 2024 - EBV Electroli...
1,http://www.mysmartrend.com/news-briefs/technic...,2020-04-17,en,Cr Bard Inc Has Returned 48.9% Since SmarTrend...,\n\nCr Bard Inc Has Returned 48.9% Since SmarT...
2,https://abcnews.go.com/Entertainment/video/ai-...,2024-04-06,en,Video AI used for hiring and firing focus of b...,\n\nVideo AI used for hiring and firing focus ...
3,https://boingboing.net/2020/08/14/cory-doctoro...,2020-08-14,en,Cory Doctorow experiments with AI writing part...,\nCory Doctorow experiments with AI writing pa...
4,https://cio.economictimes.indiatimes.com/news/...,2023-05-31,en,"Artificial Intelligence: Top AI CEOs, experts ...","\n\n\nArtificial Intelligence: Top AI CEOs, ex..."
5,https://citylife.capetown/haw/uncategorized/is...,2023-12-02,en,"Ua lawa anei ka $60,000 e noho ai ma Vancouver?","Ua lawa anei ka $60,000 e noho ai ma Vancouve..."
6,https://citylife.capetown/id/uncategorized/wil...,2023-12-11,en,Akankah AI mengambil alih dunia?,Akankah AI mengambil alih dunia? \n \n\nL...
7,https://citylife.capetown/iw/uncategorized/cou...,2023-12-10,en,האם AI ובן אדם יכולים להתאהב?,האם AI ובן אדם יכולים להתאהב? \n \n\nעבור...
8,https://citylife.capetown/sl/uncategorized/can...,2023-12-13,en,Can AI control humans in future?,Can AI control humans in future? \n \n\nP...
9,https://citylife.capetown/technology/microclou...,2023-09-09,en,MicroCloud Hologram Develops Holographic Virtu...,MicroCloud Hologram Develops Holographic Virt...


In [3]:
df.shape

(199707, 5)

In [4]:
# filter for only english articles
df = df[df['language'] == 'en']

In [5]:
# get subset of data (10%) for regex testing
df_sub = df.sample(frac=0.1, random_state=1)
df_sub.shape

(19971, 5)

In [6]:
# export subset of data
df_sub.to_csv('final_data_subset.csv', index=False)

#### load the subset of data

In [None]:
df_sub = pd.read_csv('final_data_subset.csv')

### Clean up of Titles

In [7]:
import re

In [8]:
pd.set_option('display.max_colwidth', None)
df['title'].head(10)

0                                      Flash for AI - 28 March 2024 - EBV Electrolink - Dataweek
1        Cr Bard Inc Has Returned 48.9% Since SmarTrend Recommendation (BCR)  | Comtex SmarTrend
2    Video AI used for hiring and firing focus of book by journalist Hilke Schellmann - ABC News
3                      Cory Doctorow experiments with AI writing partner Sudowrite / Boing Boing
4       Artificial Intelligence: Top AI CEOs, experts raise 'risk of extinction' from AI, ET CIO
5                                                Ua lawa anei ka $60,000 e noho ai ma Vancouver?
6                                                               Akankah AI mengambil alih dunia?
7                                                                  האם AI ובן אדם יכולים להתאהב?
8                                                               Can AI control humans in future?
9        MicroCloud Hologram Develops Holographic Virtual Digital Human Using ChatGPT Technology
Name: title, dtype: object

In [22]:
def title_cleaning(title):
    '''
    Cleans article titles by:
    - Removing newline & tab characters
    - Removing special characters and punctuation
    - Keeping only English letters, numbers, and spaces
    '''
    # remove tab and newline characters
    title = title.replace('\n', ' ').replace('\t', ' ')

    # remove non-english characters and punctuation
    title = re.sub(r'[^A-Za-z0-9.% ]+', '', title)

    # removing standalone . characters
    title = re.sub(r'(?<!\d)\.(?!\d)', ' ', title)

    # remove excess white spaces
    title = re.sub(r'\s+', ' ', title).strip()

    return title

In [23]:
pd.reset_option('display.max_colwidth')
# apply to title column of full data
df['title_clean'] = df['title'].apply(title_cleaning)
df.head(5)

# apply to title column of subset
#df_sub['title_clean'] = df_sub['title'].apply(title_cleaning)
#df_sub['title_clean'].head(10)

Unnamed: 0,url,date,language,title,text,title_clean,title_clean_lower,text_clean_ner,text_clean_topic
0,http://www.dataweek.co.za/21690r,2024-04-05,en,Flash for AI - 28 March 2024 - EBV Electrolink...,\nFlash for AI - 28 March 2024 - EBV Electroli...,Flash for AI 28 March 2024 EBV Electrolink Dat...,flash for ai 28 march 2024 ebv electrolink dat...,Flash for AI 28 March 2024 EBV Electrolink Dat...,flash for ai march 2024 ebv electrolink datawe...
1,http://www.mysmartrend.com/news-briefs/technic...,2020-04-17,en,Cr Bard Inc Has Returned 48.9% Since SmarTrend...,\n\nCr Bard Inc Has Returned 48.9% Since SmarT...,Cr Bard Inc Has Returned 48.9% Since SmarTrend...,cr bard inc has returned 489 since smartrend r...,Cr Bard Inc Has Returned 48.9% Since Recommend...,cr bard inc has returned . since recommendatio...
2,https://abcnews.go.com/Entertainment/video/ai-...,2024-04-06,en,Video AI used for hiring and firing focus of b...,\n\nVideo AI used for hiring and firing focus ...,Video AI used for hiring and firing focus of b...,video ai used for hiring and firing focus of b...,Video AI used for hiring and firing focus of b...,video ai used for hiring and firing focus of b...
3,https://boingboing.net/2020/08/14/cory-doctoro...,2020-08-14,en,Cory Doctorow experiments with AI writing part...,\nCory Doctorow experiments with AI writing pa...,Cory Doctorow experiments with AI writing part...,cory doctorow experiments with ai writing part...,Cory Doctorow experiments with AI writing part...,cory doctorow experiments with ai writing part...
4,https://cio.economictimes.indiatimes.com/news/...,2023-05-31,en,"Artificial Intelligence: Top AI CEOs, experts ...","\n\n\nArtificial Intelligence: Top AI CEOs, ex...",Artificial Intelligence Top AI CEOs experts ra...,artificial intelligence top ai ceos experts ra...,"Artificial Intelligence: Top AI CEOs, experts ...",artificial intelligence top ai ce os experts r...


In [24]:
# title_clean_lower for topic modeling
df['title_clean_lower'] = df['title_clean'].str.lower()
df.head(5)

# title_clean_lower for topic modeling (subset)
#df_sub['title_clean_lower'] = df_sub['title_clean'].str.lower()
#df_sub['title_clean_lower'].head(10)

Unnamed: 0,url,date,language,title,text,title_clean,title_clean_lower,text_clean_ner,text_clean_topic
0,http://www.dataweek.co.za/21690r,2024-04-05,en,Flash for AI - 28 March 2024 - EBV Electrolink...,\nFlash for AI - 28 March 2024 - EBV Electroli...,Flash for AI 28 March 2024 EBV Electrolink Dat...,flash for ai 28 march 2024 ebv electrolink dat...,Flash for AI 28 March 2024 EBV Electrolink Dat...,flash for ai march 2024 ebv electrolink datawe...
1,http://www.mysmartrend.com/news-briefs/technic...,2020-04-17,en,Cr Bard Inc Has Returned 48.9% Since SmarTrend...,\n\nCr Bard Inc Has Returned 48.9% Since SmarT...,Cr Bard Inc Has Returned 48.9% Since SmarTrend...,cr bard inc has returned 48.9% since smartrend...,Cr Bard Inc Has Returned 48.9% Since Recommend...,cr bard inc has returned . since recommendatio...
2,https://abcnews.go.com/Entertainment/video/ai-...,2024-04-06,en,Video AI used for hiring and firing focus of b...,\n\nVideo AI used for hiring and firing focus ...,Video AI used for hiring and firing focus of b...,video ai used for hiring and firing focus of b...,Video AI used for hiring and firing focus of b...,video ai used for hiring and firing focus of b...
3,https://boingboing.net/2020/08/14/cory-doctoro...,2020-08-14,en,Cory Doctorow experiments with AI writing part...,\nCory Doctorow experiments with AI writing pa...,Cory Doctorow experiments with AI writing part...,cory doctorow experiments with ai writing part...,Cory Doctorow experiments with AI writing part...,cory doctorow experiments with ai writing part...
4,https://cio.economictimes.indiatimes.com/news/...,2023-05-31,en,"Artificial Intelligence: Top AI CEOs, experts ...","\n\n\nArtificial Intelligence: Top AI CEOs, ex...",Artificial Intelligence Top AI CEOs experts ra...,artificial intelligence top ai ceos experts ra...,"Artificial Intelligence: Top AI CEOs, experts ...",artificial intelligence top ai ce os experts r...


### Article text cleaning

In [12]:
pd.set_option('display.max_colwidth', 500)

df['text'].head(5)

0    \nFlash for AI - 28 March 2024 - EBV Electrolink - Dataweek\nHome\nAbout us\nBack issues / E-book / PDF\nSubscribe\nAdvertise\n\n \n\nEMP Handbook\n\nCategories\n\n▸ Editor's Choice\n▸ Multimedia, Videos\n▸ AI & ML\n▸ Analogue, Mixed Signal, LSI\n▸ Circuit & System Protection\n▸ Computer/Embedded Technology\n▸ Design Automation\n▸ DSP, Micros & Memory\n▸ Edge Computing & IIoT\n▸ Electronics Technology\n▸ Enclosures, Racks, Cabinets & Panel Products\n▸ Events\n▸ Interconnection\n▸ Manufacturi...
1    \n\nCr Bard Inc Has Returned 48.9% Since SmarTrend Recommendation (BCR)  | Comtex SmarTrend\n \nOWL LOGIN  /  \n               ADVISOR LOGIN\n\nHome\nAdvantages\nProducts\n\nNews &amp Analysis\n\n\nResources\nTestimonials\n\n *Supporting investors at home - 60% off SmarTrend Products - Coupon Code: COVID19 \nReturn to Headlines\nCr Bard Inc Has Returned 48.9% Since SmarTrend Recommendation (BCR) \nWritten on Fri, 04/17/2020 - 1:08am\n By Shiri Gupta\nSmarTrend identified an Uptrend for

In [13]:
import multiprocessing as mp

#### Article Cleaing for NER/sentiment analysis

In [14]:
# function to clean text specifically for NER/sentiment analysis
def article_text_cleaning_NER_SA(text):
    '''
    Cleans article text, but keeps:
    - contractions & possessives
    - preserves casing
    
    '''
     # remove URLs
    text = re.sub(r'http\S+', ' ', text)

    # remove html tags (if there are any)
    text = re.sub(r'<.*?>', ' ', text)

    # remove non-english characters but keep important punctuation
    text = re.sub(r"[^A-Za-z0-9.,!?;:'\"()\-\n\t%]+", ' ', text)  # Keeps: . , ! ? ; : ' " ( ) - %

    # remove the new lines at the beginning of the text
    text = re.sub(r'^[\s\n\t]+', '', text)

    # adding \n to get all web crawl remnants by converting single \n to double
    text = re.sub(r'(?<!\n)\n(?!\n)', '\n\n', text)

    # removing sections SURROUNDED by newlines
    text = re.sub(r'[\n\t](?:[A-Z0-9][A-Za-z0-9\s&.,:!?_-]{3,}?)[\n\t]', ' ', text, flags=re.MULTILINE | re.DOTALL)

    # remove /n and /t
    text = re.sub(r'\n|\t', ' ', text)

    # remove phone numbers
    text = re.sub(r'\b\d{3}-\d{3}-\d{4}\b', '', text)

    # remove dates
    text = re.sub(
    r'\b(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)?\s*'
    r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec|'
    r'January|February|March|April|May|June|July|August|September|October|November|December)\s*'
    r'\d{1,2}(?:st|nd|rd|th)?,?\s*\d{4}\b|'
    r'\b\d{1,2}/\d{1,2}/\d{2,4}\b|'
    r'\b\d{4}-\d{2}-\d{2}\b',  # Handles YYYY-MM-DD format
    '', text
    )

    # remove times
    text = re.sub(r'\b\d{1,2}:\d{2}\s?(?:AM|PM)?(?:\s[A-Z]{2,4})?\b|\b\d{1,2}\s?(?:AM|PM)(?:\s-\s\d{1,2}\s?(?:AM|PM))?(?:\s[A-Z]{2,4})?\b', '', text)

    # separating words with all caps from other words
    text = re.sub(r'([a-z])([A-Z]{3,})', r'\1 \2', text)
    text = re.sub(r'([A-Z]{3,})([A-Z][a-z])', r'\1 \2', text)
    
    # separating words from numbers
    text = re.sub(r'(\d+)([a-zA-Z]+)', r'\1 \2', text)
    text = re.sub(r'([a-zA-Z]+)(\d+)', r'\1 \2', text)

    # removing words with mixed lowercase and uppercase when more than 2 words are uppercase in a row
    text = re.sub(r'\b\w*[A-Z][a-z]{3,}[A-Z]\w*\b', '', text)

    # removing paranetheses and hyphens that are surrounded by space on either side
    text = re.sub(r'[()]', '', text)
    text = re.sub(r'(?<=\s)-(?=\s)', '', text)

    # remove excess white spaces
    text = re.sub(r'\s+', ' ', text).strip()
   
    return text

#### Applying Cleaning for NER and Sentiment Analysis

In [15]:
def ner_sent_parallel_cleaning(text_series):
    with mp.Pool(mp.cpu_count()) as pool:
        results = pool.map(article_text_cleaning_NER_SA, text_series)
    return results

In [16]:
# applying NER/Sentiment Analysis cleaning to the full text
if __name__ == "__main__":
    mp.set_start_method('fork', force=True)

    df['text_clean_ner'] = ner_sent_parallel_cleaning(df['text'])

# applying NER/Sentiment Analysis cleaning to the subset of text
# if __name__ == "__main__":
#     mp.set_start_method('fork', force=True)

#     df_sub['text_clean_ner'] = ner_sent_parallel_cleaning(df_sub['text'])

In [17]:
pd.reset_option('display.max_colwidth')
df.head(5)

# df_sub['text_clean_ner'].head(5)

Unnamed: 0,url,date,language,title,text,title_clean,title_clean_lower,text_clean_ner
0,http://www.dataweek.co.za/21690r,2024-04-05,en,Flash for AI - 28 March 2024 - EBV Electrolink...,\nFlash for AI - 28 March 2024 - EBV Electroli...,Flash for AI 28 March 2024 EBV Electrolink Dat...,flash for ai 28 march 2024 ebv electrolink dat...,Flash for AI 28 March 2024 EBV Electrolink Dat...
1,http://www.mysmartrend.com/news-briefs/technic...,2020-04-17,en,Cr Bard Inc Has Returned 48.9% Since SmarTrend...,\n\nCr Bard Inc Has Returned 48.9% Since SmarT...,Cr Bard Inc Has Returned 489 Since SmarTrend R...,cr bard inc has returned 489 since smartrend r...,Cr Bard Inc Has Returned 48.9% Since Recommend...
2,https://abcnews.go.com/Entertainment/video/ai-...,2024-04-06,en,Video AI used for hiring and firing focus of b...,\n\nVideo AI used for hiring and firing focus ...,Video AI used for hiring and firing focus of b...,video ai used for hiring and firing focus of b...,Video AI used for hiring and firing focus of b...
3,https://boingboing.net/2020/08/14/cory-doctoro...,2020-08-14,en,Cory Doctorow experiments with AI writing part...,\nCory Doctorow experiments with AI writing pa...,Cory Doctorow experiments with AI writing part...,cory doctorow experiments with ai writing part...,Cory Doctorow experiments with AI writing part...
4,https://cio.economictimes.indiatimes.com/news/...,2023-05-31,en,"Artificial Intelligence: Top AI CEOs, experts ...","\n\n\nArtificial Intelligence: Top AI CEOs, ex...",Artificial Intelligence Top AI CEOs experts ra...,artificial intelligence top ai ceos experts ra...,"Artificial Intelligence: Top AI CEOs, experts ..."


#### Cleaning for Topic Modeling

In [25]:
# function to clean text for topic modeling
def article_text_cleaning_TM(text):
    '''
    Cleans article text for topic modeling:
    - removing posessives
    - lower casing everything
    
    '''
    # remove URLs
    text = re.sub(r'http\S+', ' ', text)

    # remove html tags (if there are any)
    text = re.sub(r'<.*?>', ' ', text)

    # remove non-english characters and common punctuation not needed for removing web crawl remnants
    text = re.sub(r"[^A-Za-z0-9.%\\'/<>:_\n\t-]+", ' ', text)

    # removing standalone dots but preserving decimals
    text = re.sub(r'(?<!\d)\.(?!\d)', ' ', text)

    # remove the new lines at the beginning of the text
    text = re.sub(r'^[\s\n\t]+', '', text)

    # adding \n to get all web crawl remnants by converting single \n to double
    text = re.sub(r'(?<!\n)\n(?!\n)', '\n\n', text)

    # removing sections SURROUNDED by newlines
    text = re.sub(r'[\n\t](?:[A-Z0-9][A-Za-z0-9\s&.,:!?_-]{3,}?)[\n\t]', ' ', text, flags=re.MULTILINE | re.DOTALL)

    # remove /n and /t
    text = re.sub(r'\n|\t', ' ', text)

    # remove phone numbers
    text = re.sub(r'\b\d{3}-\d{3}-\d{4}\b', '', text)

    # remove dates
    text = re.sub(
    r'\b(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)?\s*'
    r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec|'
    r'January|February|March|April|May|June|July|August|September|October|November|December)\s*'
    r'\d{1,2}(?:st|nd|rd|th)?,?\s*\d{4}\b|'
    r'\b\d{1,2}/\d{1,2}/\d{2,4}\b|'
    r'\b\d{4}-\d{2}-\d{2}\b',  # Handles YYYY-MM-DD format
    '', text
    )

    # remove times
    text = re.sub(r'\b\d{1,2}:\d{2}\s?(?:AM|PM)?(?:\s[A-Z]{2,4})?\b|\b\d{1,2}\s?(?:AM|PM)(?:\s-\s\d{1,2}\s?(?:AM|PM))?(?:\s[A-Z]{2,4})?\b', '', text)

    # remove remaining punctuation: / < >:_ '
    text = re.sub(r'[\/<>:_\'-]', ' ', text)

    # separating words from numbers
    text = re.sub(r'(\d+)([a-zA-Z]+)', r'\1 \2', text)
    text = re.sub(r'([a-zA-Z]+)(\d+)', r'\1 \2', text)

    # separating words with all caps from other words
    text = re.sub(r'([a-z])([A-Z]{2,})', r'\1 \2', text)
    text = re.sub(r'([A-Z]{2,})([A-Z][a-z])', r'\1 \2', text)

    # remove single 's' from the possessives
    text = re.sub(r"\b's\b|\bs\b", '', text)

    # removing words with mixed lowercase and uppercase when more than 2 words are uppercase in a row
    text = re.sub(r'\b(?:[A-Z][a-z]+){2,}\b', '', text)
    text = re.sub(r'\b(?!AI|ML|NLP|CV|LLM|GAN|LSTM|BERT|GPT|CNN|RNN)[A-Z][a-z]+[A-Z]\w*\b', '', text)  # Keeps all-uppercase words
    
    # lower case everything
    text = text.lower()

    # remove excess white spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

#### Applying Cleaning for Topic Modeling

In [19]:
def topic_modeling_parallel_cleaning(text_series):
    with mp.Pool(mp.cpu_count()) as pool:
        results = pool.map(article_text_cleaning_TM, text_series)
    return results

In [26]:
# cleaning full text
if __name__ == "__main__":
    mp.set_start_method('fork', force=True)

    df['text_clean_topic'] = topic_modeling_parallel_cleaning(df['text'])

# cleaning on subset of text
# if __name__ == "__main__":
#     mp.set_start_method('fork', force=True)

#     df_sub['text_clean_lower'] = topic_modeling_parallel_cleaning(df_sub['text'])

In [27]:
df.head(5)

Unnamed: 0,url,date,language,title,text,title_clean,title_clean_lower,text_clean_ner,text_clean_topic
0,http://www.dataweek.co.za/21690r,2024-04-05,en,Flash for AI - 28 March 2024 - EBV Electrolink...,\nFlash for AI - 28 March 2024 - EBV Electroli...,Flash for AI 28 March 2024 EBV Electrolink Dat...,flash for ai 28 march 2024 ebv electrolink dat...,Flash for AI 28 March 2024 EBV Electrolink Dat...,flash for ai 28 march 2024 ebv electrolink dat...
1,http://www.mysmartrend.com/news-briefs/technic...,2020-04-17,en,Cr Bard Inc Has Returned 48.9% Since SmarTrend...,\n\nCr Bard Inc Has Returned 48.9% Since SmarT...,Cr Bard Inc Has Returned 48.9% Since SmarTrend...,cr bard inc has returned 48.9% since smartrend...,Cr Bard Inc Has Returned 48.9% Since Recommend...,cr bard inc has returned 48.9% since recommend...
2,https://abcnews.go.com/Entertainment/video/ai-...,2024-04-06,en,Video AI used for hiring and firing focus of b...,\n\nVideo AI used for hiring and firing focus ...,Video AI used for hiring and firing focus of b...,video ai used for hiring and firing focus of b...,Video AI used for hiring and firing focus of b...,video ai used for hiring and firing focus of b...
3,https://boingboing.net/2020/08/14/cory-doctoro...,2020-08-14,en,Cory Doctorow experiments with AI writing part...,\nCory Doctorow experiments with AI writing pa...,Cory Doctorow experiments with AI writing part...,cory doctorow experiments with ai writing part...,Cory Doctorow experiments with AI writing part...,cory doctorow experiments with ai writing part...
4,https://cio.economictimes.indiatimes.com/news/...,2023-05-31,en,"Artificial Intelligence: Top AI CEOs, experts ...","\n\n\nArtificial Intelligence: Top AI CEOs, ex...",Artificial Intelligence Top AI CEOs experts ra...,artificial intelligence top ai ceos experts ra...,"Artificial Intelligence: Top AI CEOs, experts ...",artificial intelligence top ai ce os experts r...


In [28]:
# checking number of empty rows from df
print(df['title_clean'].isna().sum())
print(df['text_clean_ner'].isna().sum())
print(df['text_clean_topic'].isna().sum())


0
0
0


In [29]:
# checking for empty strings or whitespace
print((df['title_clean'].str.strip() == '').sum())
print((df['title_clean'].str.strip() == '').sum())
print((df['title_clean'].str.strip() == '').sum())

0
0
0


In [30]:
# cleaned data frame
df_filtered = df.drop(columns=['language','title','text'])
df_filtered.head(5)

Unnamed: 0,url,date,title_clean,title_clean_lower,text_clean_ner,text_clean_topic
0,http://www.dataweek.co.za/21690r,2024-04-05,Flash for AI 28 March 2024 EBV Electrolink Dat...,flash for ai 28 march 2024 ebv electrolink dat...,Flash for AI 28 March 2024 EBV Electrolink Dat...,flash for ai 28 march 2024 ebv electrolink dat...
1,http://www.mysmartrend.com/news-briefs/technic...,2020-04-17,Cr Bard Inc Has Returned 48.9% Since SmarTrend...,cr bard inc has returned 48.9% since smartrend...,Cr Bard Inc Has Returned 48.9% Since Recommend...,cr bard inc has returned 48.9% since recommend...
2,https://abcnews.go.com/Entertainment/video/ai-...,2024-04-06,Video AI used for hiring and firing focus of b...,video ai used for hiring and firing focus of b...,Video AI used for hiring and firing focus of b...,video ai used for hiring and firing focus of b...
3,https://boingboing.net/2020/08/14/cory-doctoro...,2020-08-14,Cory Doctorow experiments with AI writing part...,cory doctorow experiments with ai writing part...,Cory Doctorow experiments with AI writing part...,cory doctorow experiments with ai writing part...
4,https://cio.economictimes.indiatimes.com/news/...,2023-05-31,Artificial Intelligence Top AI CEOs experts ra...,artificial intelligence top ai ceos experts ra...,"Artificial Intelligence: Top AI CEOs, experts ...",artificial intelligence top ai ce os experts r...


### Filtering for non-related articles

In [31]:
# defining AI keywords
ai_keywords = [
    "artificial intelligence", "ai", "machine learning", "deep learning", "ml",
    "neural network", "automation", "generative ai", "chatbot",
    "large language model", "nlp", "natural language processing",
    "data science", "autonomous", "robotics", "computer vision",
    "reinforcement learning", "supervised learning", "unsupervised learning",
    "ai ethics", "llm", "transformer model", "openai", "gpt", "bert",
    "tensorflow", "pytorch"
]

In [32]:
# use mask to check for articles without ai keywords
mask = ~df_filtered['text_clean_topic'].str.contains('|'.join(ai_keywords), case=False, na=False)
df_non_ai = df_filtered[mask]

df_non_ai.shape[0]

816

In [33]:
pd.reset_option('display.max_colwidth')
df_non_ai.head(5)

Unnamed: 0,url,date,title_clean,title_clean_lower,text_clean_ner,text_clean_topic
30,https://musescore.com/user/46317698/scores/783...,2022-05-08,alm 18 Claude Goudimel Genevan Psalter Sheet m...,alm 18 claude goudimel genevan psalter sheet m...,alm 18 Claude Goudimel Genevan Psalter Sheet m...,alm 18 claude goudimel genevan psalter sheet m...
379,https://www.zeffy.com/fr-CA/ticketing/0704575d...,2024-05-31,2024 Claude Hall 5K and Fun Day,2024 claude hall 5k and fun day,2024 Claude Hall 5 K and Fun Day,2024 claude hall 5 k and fun day
1165,https://musescore.com/user/39593079/scores/156...,2024-04-21,Don Quichotte Et Sancho Sheet music for Piano ...,don quichotte et sancho sheet music for piano ...,"Don Quichotte Et Sancho Sheet music for Piano,...",don quichotte et sancho sheet music for piano ...
1166,https://musescore.com/user/66884440/scores/116...,2023-08-14,Rverie Claude Debussy Sheet music for Flute Gu...,rverie claude debussy sheet music for flute gu...,"R verie Claude Debussy Sheet music for Flute, ...",r verie claude debussy sheet music for flute g...
1355,https://it-online.co.za/2021/07/05/data-scient...,2021-07-06,Data Scientist Cape Town or Johannesburg at Ca...,data scientist cape town or johannesburg at ca...,Data Scientist Cape Town or Johannesburg at Ca...,data scientist cape town or johannesburg at ca...


In [34]:
# removing non_ai articles from df_filtered
df_final = df_filtered[~mask]

df_final.shape[0]

198891

In [35]:
# exporting clean data
df_final.to_csv('final_data_clean.csv', index=False)

In [36]:
# and parquet just in case
import os
file_path = 'final_data_clean.parquet'
if os.path.exists(file_path):
    os.remove(file_path)

df_final.to_parquet('final_data_clean.parquet', index=False, engine='pyarrow')