# 03.Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
import os

import seaborn as sns
import matplotlib.pyplot as plt

import re
import nltk

In [2]:
news = pd.read_csv('../data/clean/news_dataset.csv')
news.head()

Unnamed: 0,Title,Date,URL,Article
0,Country first innovation center for the new p...,2024-07-15,http://en.people.cn/n3/2024/0715/c90000-201939...,Photo shows the innovation center for the n...
1,26th China Beijing International High tech Exp...,2024-07-15,http://en.people.cn/n3/2024/0715/c90000-201938...,A vertical take-off and landing drone is on...
2,China tech innovations benefit electricity de...,2024-07-15,http://en.people.cn/n3/2024/0715/c90000-201938...,BEIJING July 15 Xinhua China is at the ...
3,Diverse applications of new technologies on di...,2024-07-15,http://en.people.cn/n3/2024/0715/c90000-201938...,The 2024 World AI ference WAIC and High-L...
4,AI used to help preserve China oldest wooden ...,2024-07-15,http://en.people.cn/n3/2024/0715/c90000-201936...,The Sakyamuni Pagoda also known as the Yin...


## 1. Lowercasing text

In [6]:
news.columns

Index(['Title', 'Date', 'URL', 'Article'], dtype='object')

In [8]:
news.columns = news.columns.str.lower()

In [10]:
news.head()

Unnamed: 0,title,date,url,article
0,Country first innovation center for the new p...,2024-07-15,http://en.people.cn/n3/2024/0715/c90000-201939...,Photo shows the innovation center for the n...
1,26th China Beijing International High tech Exp...,2024-07-15,http://en.people.cn/n3/2024/0715/c90000-201938...,A vertical take-off and landing drone is on...
2,China tech innovations benefit electricity de...,2024-07-15,http://en.people.cn/n3/2024/0715/c90000-201938...,BEIJING July 15 Xinhua China is at the ...
3,Diverse applications of new technologies on di...,2024-07-15,http://en.people.cn/n3/2024/0715/c90000-201938...,The 2024 World AI ference WAIC and High-L...
4,AI used to help preserve China oldest wooden ...,2024-07-15,http://en.people.cn/n3/2024/0715/c90000-201936...,The Sakyamuni Pagoda also known as the Yin...


In [12]:
news['title'] = news['title'].str.lower()
news['title']

0       country  first innovation center for the new p...
1       26th china beijing international high tech exp...
2       china  tech innovations benefit electricity de...
3       diverse applications of new technologies on di...
4       ai used to help preserve china  oldest wooden ...
                              ...                        
1484    china launches new remote sensing satellite group
1485    china, africa continue deepening cooperation i...
1486    china  tech firm huawei introduces ai initiati...
1487    china launches key meteorological lab for ener...
1488    china  ag600 amphibious aircraft completes fir...
Name: title, Length: 1489, dtype: object

In [14]:
news['article'] = news['article'].str.lower()
news['article']

0          photo shows the innovation center for the n...
1          a vertical take-off and landing drone is on...
2         beijing  july 15  xinhua    china is at the ...
3         the 2024 world ai  ference  waic  and high-l...
4          the sakyamuni pagoda  also known as the yin...
                              ...                        
1484       a long march-4b carrier rocket carrying a n...
1485      in a banana seedling cultivation room of cro...
1486       alan qi  president of huawei cloud middle e...
1487      beijing  sept  2  xinhua    china has launch...
1488       a china  ag600 large amphibious aircraft is...
Name: article, Length: 1489, dtype: object

In [25]:
news['article'].iloc[0]

'   photo shows the innovation center for the new power system in central china  hunan province  the first of this kind in the country that comprehensively showcases the achievements and technological equipment of the new power system    an innovation center for the new power system in central china  hunan province was officially launched on july 11  it is the first of its kind in china that comprehensively showcases the achievements and technological equipment of the new power system  the center was jointly constructed by state grid hunan electric power      and over 60 enterprises and universities  it took 170 days to complete   ering all aspects of the power system  including power generation  transmission  transformation  distribution  storage  and utilization  the center vividly demonstrates the latest technological innovation  model transformation  and green development of the new power system  the exhibition area of the center features company booths  with 60 enterprises current

## 2. Remove stop word

In [27]:
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nagiur/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
def remove_stop_words(text):
    return ' '.join([word for word in text.split() if word.lower() not in stop_words])

In [31]:
news['title'] = news['title'].apply(remove_stop_words)

In [33]:
news['article'] = news['article'].apply(remove_stop_words)

In [35]:
news['title'].iloc[0]

'country first innovation center new power system goes operation c china hunan'

In [37]:
news['article'].iloc[0]

'photo shows innovation center new power system central china hunan province first kind country comprehensively showcases achievements technological equipment new power system innovation center new power system central china hunan province officially launched july 11 first kind china comprehensively showcases achievements technological equipment new power system center jointly constructed state grid hunan electric power 60 enterprises universities took 170 days complete ering aspects power system including power generation transmission transformation distribution storage utilization center vividly demonstrates latest technological innovation model transformation green development new power system exhibition area center features company booths 60 enterprises currently hosted initial group companies showcase booths includes basf shanshan battery materials beijing disaster prevention science technology crrc zhuzhou institute tbea sany group among others innovation center serves bridge con

## 3. Remove `numbers`

In [40]:
news['title'] = news['title'].apply(lambda x: re.sub(r'\d+', '', x))

In [42]:
news['article'] = news['article'].apply(lambda x: re.sub(r'\d+', '', x))

In [44]:
news['title'].iloc[0]

'country first innovation center new power system goes operation c china hunan'

In [46]:
news['article'].iloc[0]

'photo shows innovation center new power system central china hunan province first kind country comprehensively showcases achievements technological equipment new power system innovation center new power system central china hunan province officially launched july  first kind china comprehensively showcases achievements technological equipment new power system center jointly constructed state grid hunan electric power  enterprises universities took  days complete ering aspects power system including power generation transmission transformation distribution storage utilization center vividly demonstrates latest technological innovation model transformation green development new power system exhibition area center features company booths  enterprises currently hosted initial group companies showcase booths includes basf shanshan battery materials beijing disaster prevention science technology crrc zhuzhou institute tbea sany group among others innovation center serves bridge connecting u

## 4. Lemmatization or stemming

In [49]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/nagiur/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [55]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    words = nltk.word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

In [59]:
news['title'] = news['title'].apply(lemmatize_text)

In [None]:
news['article'] = news['article'].apply(lematize_text)

In [None]:
news['title'].iloc[0]

In [None]:
news['article'].iloc[0]

## 5. Handling contractions

In [None]:
import contractions

def expand_contractions(text):
    return contractions.fix(text)

In [None]:
news['title'] = news['title'].apply(expand_contractions)

In [None]:
news['article'] = news['article'].apply(expand_contractions)

In [None]:
news['title'].iloc[0]

In [None]:
news['article'].iloc[0]

## 6. Removing extra whitespaces

In [159]:
news['title'] = news['title'].apply(lambda x: x.strip())

In [161]:
news['article'] = news['article'].apply(lambda x: x.strip())

In [163]:
news['title'].iloc[0]

'country first innovation center new power system go operation c china hunan'

In [34]:
news['article'].iloc[0]

'photo shows innovation center new power system central china hunan province first kind country comprehensively showcases achievements technological equipment new power system innovation center new power system central china hunan province officially launched july  first kind china comprehensively showcases achievements technological equipment new power system center jointly constructed state grid hunan electric power  enterprises universities took  days complete ering aspects power system including power generation transmission transformation distribution storage utilization center vividly demonstrates latest technological innovation model transformation green development new power system exhibition area center features company booths  enterprises currently hosted initial group companies showcase booths includes basf shanshan battery materials beijing disaster prevention science technology crrc zhuzhou institute tbea sany group among others innovation center serves bridge connecting u

## Save as `preprocessed/news_preprocessed.csv`

In [47]:
news.to_csv('../data/preprocessed/news_preprocessed.csv', index=False)