# NLP Final Project Part I: Data Preprocessing




In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import re
import warnings
warnings.filterwarnings('ignore')


In [None]:
import os
import time
import math
from pprint import pprint
from textblob import TextBlob
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt


import nltk as nltk
from nltk.corpus import stopwords
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer


### Load Data

In [None]:
df_news_final_project = pd.read_parquet('https://storage.googleapis.com/msca-bdp-data-open/news_final_project/news_final_project.parquet', engine='pyarrow')
df_news_final_project.shape

(199538, 5)

In [None]:
df_news_final_project.head()

Unnamed: 0,url,date,language,title,text
0,http://blog.executivebiz.com/2017/02/sap-ns2s-...,2017-02-07,en,SAP NS2’s Mark Testoni: AI Could Help Prevent ...,\nSAP NS2’s Mark Testoni: AI Could Help Preve...
1,http://bq-magazine.com/a-decade-of-advancement...,2019-12-31,en,A Decade Of Advancements As We Enter A New Age...,\n\nA Decade Of Advancements As We Enter A New...
2,http://businessnewsthisweek.com/business/gende...,2022-03-09,en,Gender Bias in Artificial Intelligence | Busin...,\nGender Bias in Artificial Intelligence | Bus...
3,http://cn.reuters.com/article/pharmaceuticals-...,2017-07-04,en,Big pharma turns to AI to speed drug discovery...,\n\nBig pharma turns to AI to speed drug disco...
4,http://fortune.com/2018/03/13/ai-computer-amaz...,2018-03-13,en,"Amazon HQ2 Winner May Be Boston, Says Wells Fa...","\n\nAmazon HQ2 Winner May Be Boston, Says Well..."


In [None]:
df = df_news_final_project

In [None]:
pd.set_option('display.max_colwidth', 100)

# Data Preprocessing
### Clean-up the noise, by eliminating newlines, tabs, remnants of web crawls, and other irrelevant text

In [None]:
# clean article text
df["clean_text"] = df["text"].apply(lambda x: re.sub(r'\n', '',x)) # remove new line
df["clean_text"] = df["clean_text"].apply(lambda x: re.sub(r'https://.*', '',x)) # remove url
df["clean_text"] = df["clean_text"].apply(lambda x: re.sub(r'@\w*', '',x)) # remove mention
df["clean_text"] = df["clean_text"].apply(lambda x: re.sub(r'#\w*', '',x)) # remove tag
# keep only letters, periods, and white space
#df["clean_text"] = df["clean_text"].apply(lambda x: re.sub(r'[^a-zA-Z.\s]', '',x))
# change consecutive white space into 1 whitespace
df["clean_text"] = df["clean_text"].apply(lambda x: re.sub(' +', ' ',x))

In [None]:
 # remove remnants of web crawls
df["clean_text"] = df["clean_text"].apply(lambda x: re.sub(r'\xa0SAP', '',x))
df["clean_text"] = df["clean_text"].apply(lambda x: re.sub(r'\xa0National', '',x))
df["clean_text"] = df["clean_text"].apply(lambda x: re.sub(r'\xa0', '',x))
df["clean_text"] = df["clean_text"].apply(lambda x: re.sub(r'\t', '',x))
df["clean_text"] = df["clean_text"].apply(lambda x: re.sub(r'\r', '',x))
df["clean_text"] = df["clean_text"].apply(lambda x: re.sub(r'\|', '',x))

In [None]:
# clean article title
df["clean_title"] = df["title"].apply(lambda x: re.sub(r'\n', '',x)) # remove new line
df["clean_title"] = df["clean_title"].apply(lambda x: re.sub(r'https://.*', '',x)) # remove url
df["clean_title"] = df["clean_title"].apply(lambda x: re.sub(r'@\w*', '',x)) # remove mention
df["clean_title"] = df["clean_title"].apply(lambda x: re.sub(r'#\w*', '',x)) # remove tag
# change consecutive white space into 1 whitespace
df["clean_title"] = df["clean_title"].apply(lambda x: re.sub(' +', ' ',x))
# keep only letters, periods, and white space
#df["clean_title"] = df["clean_title"].apply(lambda x: re.sub(r'[^a-zA-Z.\s]', '',x))

### Discard irrelevant articles

In [None]:
keywords = ['AI', 'ai', 'artificial intelligence','Artificial Intelligence','ARTIFICIAL INTELLIGENCE', 'Data Science', 'data science', 'DATA SCIENCE']
query = '|'.join(keywords)

In [None]:
df_clean = df[df['clean_text'].str.contains(query)]

In [None]:
df_clean = df[df['clean_title'].str.contains(query)]

In [None]:
df_clean.shape

(171652, 7)

### Drop Duplicates

In [None]:
df_clean = df_clean[["date","clean_title","clean_text"]].drop_duplicates()

In [None]:
df_clean.shape

(168797, 3)

In [None]:
df_clean.head(3)

Unnamed: 0,date,clean_title,clean_text
0,2017-02-07,SAP NS2’s Mark Testoni: AI Could Help Prevent Cyber Attacks Through Threat Classification | Exec...,SAP NS2’s Mark Testoni: AI Could Help Prevent Cyber Attacks Through Threat Classification Exec...
1,2019-12-31,A Decade Of Advancements As We Enter A New Age Of AI - Business Quick Magazine,A Decade Of Advancements As We Enter A New Age Of AI - Business Quick Magazine HomeBusinessTechn...
2,2022-03-09,Gender Bias in Artificial Intelligence | Business News This Week,"Gender Bias in Artificial Intelligence Business News This WeekBusiness News [ March 9, 2022 ] Z..."


### Export cleaned text to csv

In [None]:
df_clean.to_csv("clean_text.csv", index = False)

In [None]:
df = df_clean

### Word Counts in clean text

In [None]:
def num_of_words(df):
    df['length'] = df['clean_text'].apply(lambda x : len(str(x).split()))
    df = df.reset_index(drop = True)
    #df = df.sort_values(by = ['length'], ascending = False).reset_index(drop = True)
    return df

In [None]:
## Count number of words in the text
df_count = num_of_words(df)

In [None]:
df_count.describe()

Unnamed: 0,length
count,168797.0
mean,925.124439
std,904.795189
min,5.0
25%,475.0
50%,749.0
75%,1146.0
max,21119.0


In [None]:
df_count.head()

Unnamed: 0,date,clean_title,clean_text,length
0,2017-02-07,SAP NS2’s Mark Testoni: AI Could Help Prevent Cyber Attacks Through Threat Classification | Exec...,SAP NS2’s Mark Testoni: AI Could Help Prevent Cyber Attacks Through Threat Classification Exec...,660
1,2019-12-31,A Decade Of Advancements As We Enter A New Age Of AI - Business Quick Magazine,A Decade Of Advancements As We Enter A New Age Of AI - Business Quick Magazine HomeBusinessTechn...,1455
2,2022-03-09,Gender Bias in Artificial Intelligence | Business News This Week,"Gender Bias in Artificial Intelligence Business News This WeekBusiness News [ March 9, 2022 ] Z...",1253
3,2017-07-04,"Big pharma turns to AI to speed drug discovery, GSK signs deal | 路透中文网","Big pharma turns to AI to speed drug discovery, GSK signs deal 路透中文网Discover Thomson ReutersFin...",712
4,2018-03-13,"Amazon HQ2 Winner May Be Boston, Says Wells Fargo AI Program | Fortune","Amazon HQ2 Winner May Be Boston, Says Wells Fargo AI Program Fortune Wells Fargo's AI Computer ...",728
