In [23]:
import pandas as pd

In [24]:
%%time

# df_news_final_project_whole = pd.read_parquet('https://storage.googleapis.com/msca-bdp-data-open/news_final_project/news_final_project.parquet', engine='pyarrow')
# df_news_final_project_whole.shape

df_news_final_project_whole = pd.read_parquet('./data/news_final_project.parquet')

CPU times: user 7.69 s, sys: 10.4 s, total: 18.1 s
Wall time: 17.8 s


In [25]:
# df_news_final_project_whole.to_parquet("./data/news_final_project.parquet")

In [26]:
df_news_final_project_whole.head()

Unnamed: 0,url,date,language,title,text
0,http://galusaustralis.com/2020/02/486473/legal...,2020-02-26,en,LegalTech Artificial Intelligence Market 2019 ...,LegalTech Artificial Intelligence Market 2019 ...
1,http://newsparliament.com/2020/02/27/children-...,2020-02-27,en,Children With Autism Saw Their Learning and So...,\nChildren With Autism Saw Their Learning and ...
2,http://www.agoravox.it/Covid-19-un-messaggio-d...,2020-03-13,en,Covid-19: un messaggio dai ricercatori italian...,\n\n\nCovid-19: un messaggio dai ricercatori i...
3,http://www.desototimes.com/news/child-actor-cl...,2022-06-09,en,Child actor Claude Jarman Jr. looks back on hi...,\nChild actor Claude Jarman Jr. looks back on ...
4,http://www.millenniumpost.in/big-stories/ai-ex...,2020-08-07,en,Two dead as AI Express flight skids off Kozhik...,Two dead as AI Express flight skids off Kozhik...


In [27]:
# Sample for testing
# df_news_final_project = df_news_final_project.sample(frac=0.001)
# df_news_final_project = df_news_final_project_whole.head(20)

In [30]:
%%time

import pandas as pd
import numpy as np
import re
import spacy
from langdetect import detect, LangDetectException

url_re = re.compile(r'http\S+')
html_tag_re = re.compile(r'<[^>]+>')
end_punctuation_re = re.compile(r'[\.\?\!]["\']?$')
ai_re = re.compile(r'\b(artificial intelligence)\b', re.IGNORECASE)
ml_re = re.compile(r'\b(machine learning)\b', re.IGNORECASE)

nlp = spacy.load("en_core_web_sm", disable=["ner", "tagger", "parser", "attribute_ruler", "lemmatizer"])
nlp.enable_pipe("senter")

def preprocess_chunk(chunk):
    chunk = url_re.sub('', chunk)
    chunk = html_tag_re.sub(' ', chunk)
    chunk = chunk.replace('“', '"').replace('”', '"').replace("‘", "'").replace("’", "'")
    chunk = ai_re.sub('AI', chunk)
    chunk = ml_re.sub('ML', chunk)
    chunk = re.sub(r'\b\w{21,}\b', '', chunk)
    chunk = re.sub(r'\s+', ' ', chunk).strip()
    return chunk

def is_complete_sentence(sentence):
    forbidden_words_re = re.compile(r'\b(copyrights?|cookies?|powered)\b', re.IGNORECASE)
    try:
        return (len(sentence.split()) >= 4 and
                end_punctuation_re.search(sentence) and
                not any(char in sentence for char in [":", "-", "–", "|"]) and
                not forbidden_words_re.search(sentence) and
                detect(sentence) == 'en')
    except LangDetectException:
        return False

def clean_article_text(article_text, article_title):
    processed_title = preprocess_chunk(article_title)
    valid_sentences = [processed_title]
    chunks = re.split(r'\n|/|\\', article_text)
    for chunk in chunks:
        chunk = preprocess_chunk(chunk)
        if len(chunk.split()) >= 10:
            doc = nlp(chunk)
            for sent in doc.sents:
                if is_complete_sentence(sent.text):
                    valid_sentences.append(sent.text)
    return ' '.join(valid_sentences)

keywords = ['AI', 'ML', 'Artificial Intelligence', 'Machine Learning', 'data', 'Deep Learning', 'Neural Network', 'NLP', 'Natural Language Processing', 'Computer Vision', 'Robotics', 'Analytics', 'Business Intelligence']
keywords_re = re.compile('|'.join(keywords), re.IGNORECASE)

def contains_keywords(text):
    return bool(keywords_re.search(text))

df_filtered = df_news_final_project_whole[df_news_final_project_whole['text'].apply(contains_keywords)].copy()
df_filtered['token_count'] = df_filtered['text'].apply(lambda x: len(x.split()))

Q1 = df_filtered['token_count'].quantile(0.25)
Q3 = df_filtered['token_count'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_filtered_final = df_filtered[(df_filtered['token_count'] >= lower_bound) & (df_filtered['token_count'] <= upper_bound)]
df_filtered_final['cleaned_text'] = df_filtered_final.apply(lambda row: clean_article_text(row['text'], row['title']), axis=1)


# import pandas as pd
# import numpy as np
# import re
# import spacy
# from langdetect import detect_langs

# # Compile regular expressions for improved performance
# url_re = re.compile(r'http\S+')
# html_tag_re = re.compile(r'<[^>]+>')
# end_punctuation_re = re.compile(r'[\.\?\!]["\']?$')

# # Patterns for case-insensitive replacement of specific bigrams
# ai_re = re.compile(r'\b(artificial intelligence)\b', re.IGNORECASE)
# ml_re = re.compile(r'\b(machine learning)\b', re.IGNORECASE)

# # !python -m spacy download en_core_web_sm

# # Load spaCy model with reduced pipeline components for efficiency
# nlp = spacy.load("en_core_web_sm", disable=["ner", "tagger", "parser", "attribute_ruler", "lemmatizer"])
# nlp.enable_pipe("senter")

# def preprocess_chunk(chunk):
#     chunk = url_re.sub('', chunk)  # Remove URLs
#     chunk = html_tag_re.sub(' ', chunk)  # Remove HTML tags
#     chunk = chunk.replace('“', '"').replace('”', '"').replace("‘", "'").replace("’", "'")
#     chunk = ai_re.sub('AI', chunk)  # Replace "artificial intelligence" with "AI"
#     chunk = ml_re.sub('ML', chunk)  # Replace "machine learning" with "ML"
#     chunk = re.sub(r'\b\w{21,}\b', '', chunk)  # Remove words longer than 20 characters
#     chunk = re.sub(r'\s+', ' ', chunk).strip()  # Normalize spaces
#     return chunk

# def is_complete_sentence(sentence):
#     forbidden_words_re = re.compile(r'\b(copyrights?|cookies?|powered)\b', re.IGNORECASE)
#     return (len(sentence.split()) >= 4 and
#             end_punctuation_re.search(sentence) and
#             not any(char in sentence for char in [":", "-", "–", "|"]) and
#             not forbidden_words_re.search(sentence) and
#             detect_langs(sentence)[0].lang == 'en')

# def clean_article_text(article_text, article_title):
#     processed_title = preprocess_chunk(article_title)
#     valid_sentences = [processed_title]
#     chunks = re.split(r'\n|/|\\', article_text)
#     for chunk in chunks:
#         chunk = preprocess_chunk(chunk)
#         if len(chunk.split()) >= 10:
#             doc = nlp(chunk)
#             for sent in doc.sents:
#                 if is_complete_sentence(sent.text):
#                     valid_sentences.append(sent.text)
#     return ' '.join(valid_sentences)

# # Keywords for filtering
# keywords = ['AI', 'ML', 'Artificial Intelligence', 'Machine Learning', 'Data', 'Deep Learning', 'Neural Network', 'NLP', 'Natural Language Processing', 'Computer Vision', 'Analytics']
# keywords_re = re.compile('|'.join(keywords), re.IGNORECASE)

# def contains_keywords(text):
#     return bool(keywords_re.search(text))

# df_filtered = df_news_final_project_whole[df_news_final_project_whole['text'].apply(contains_keywords)]
# df_filtered['token_count'] = df_filtered['text'].apply(lambda x: len(x.split()))

# Q1 = df_filtered['token_count'].quantile(0.25)
# Q3 = df_filtered['token_count'].quantile(0.75)
# IQR = Q3 - Q1
# lower_bound = Q1 - 1.5 * IQR
# upper_bound = Q3 + 1.5 * IQR

# df_filtered_final = df_filtered[(df_filtered['token_count'] >= lower_bound) & (df_filtered['token_count'] <= upper_bound)]
# df_filtered_final['cleaned_text'] = df_filtered_final.apply(lambda row: clean_article_text(row['text'], row['title']), axis=1)

CPU times: user 10h 16min 29s, sys: 2min 17s, total: 10h 18min 47s
Wall time: 10h 18min 57s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [33]:
df_filtered_final.shape

(192383, 7)

In [35]:
%%time

df_filtered_final.to_parquet("./data/news_final_project_clean.parquet")

CPU times: user 10.5 s, sys: 5.41 s, total: 15.9 s
Wall time: 36.6 s


In [6]:
df_news_final_project["text"].iloc[9]

'\n\nArtificial Intelligence As A Service Market : Opportunities, Market Analysis & Outlook To 2027 – 3w Market News Reports\n\n \n\n \nContact Us\nAbout Us\n \n3w Market News Reports\n3rd Market Reports and Analytics\nNews\nMarket Reports\nIndustry Analytics\nIndustry Reports\nMarket Research\nBusiness Opportunity\nEmerging Trends\nGrowth Prospects\n \n \n \nHomeIndustryArtificial Intelligence As A Service Market : Opportunities, Market Analysis & Outlook To 2027 \n\n                Artificial Intelligence As A Service Market : Opportunities, Market Analysis & Outlook To 2027            \n\n\t\t\t                    \tPosted On: April 29, 2020 \n\n\t\t\t                    \tPosted By: [email\xa0protected] \n\n\t\t\t                \t\tComments: 0 \n\xa0\nThe latest research report on Artificial Intelligence As A Service Market aims to analyze the Market Size, Share, Emerging Trends, Opportunities in global Artificial Intelligence As A Service industry. Simultaneously report presents 

In [7]:
df_news_final_project["cleaned_text"].iloc[9]

KeyError: 'cleaned_text'