# Data Preprocessing for Medium articles dataset

## Imports

In [1]:
import pandas as pd
from data_preprocessing_util import clean_text

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data Loading

In [2]:
%%time

all_data = pd.read_csv("../Datasets/Medium Articles/Medium_AggregatedData.csv", usecols = ['text'])
all_data.head()

CPU times: user 13.3 s, sys: 1.07 s, total: 14.4 s
Wall time: 24.7 s


Unnamed: 0,text
0,"Private Business, Government and Blockchain\n\..."
1,"Private Business, Government and Blockchain\n\..."
2,"Private Business, Government and Blockchain\n\..."
3,EPQ draft 1 (4844 words)\nhttps://upload.wikim...
4,EPQ draft 1 (4844 words)\nhttps://upload.wikim...


In [3]:
# Row count of the frame as loaded, before any cleaning step has run.
print(f"Total records: {all_data.shape[0]}")

Total records: 279577


## Data Cleaning

### Remove duplicates

In [4]:
# Drop rows that are exact duplicates (the first occurrence is kept),
# then renumber the index from 0 so it is contiguous again.
all_data = (
    all_data
    .drop_duplicates(keep='first')
    .reset_index(drop=True)
)
all_data.head()

Unnamed: 0,text
0,"Private Business, Government and Blockchain\n\..."
1,EPQ draft 1 (4844 words)\nhttps://upload.wikim...
2,"Ascent of data Science, SAS and Big data Analy..."
3,Can a robot love us better than another human ...
4,"2017 Big Data, AI and IOT Use Cases\nAn Active..."


In [5]:
# Row count remaining after de-duplication.
print(f"Total unique records: {all_data.shape[0]}")

Total unique records: 72024


### Remove very short text

In [6]:
# Attach each row's character count as a helper column, then summarise its distribution.
all_data = all_data.assign(text_len=all_data['text'].str.len())
all_data.describe()

Unnamed: 0,text_len
count,72024.0
mean,5486.830959
std,5109.26508
min,1.0
25%,2517.0
50%,4307.0
75%,6880.0
max,144509.0


In [7]:
# Keep only rows whose text is strictly longer than 16 characters.
# NOTE(review): the 16-character cut-off is hard-coded; the reason for this value is not recorded here.
all_data = all_data.loc[all_data['text_len'].gt(16)]
all_data.describe()

Unnamed: 0,text_len
count,72004.0
mean,5488.352744
std,5109.158515
min,17.0
25%,2518.0
50%,4308.0
75%,6882.0
max,144509.0


### Remove unwanted characters and stopwords

In [8]:
%%time

all_data = all_data.astype('str')
all_data['text'] = all_data['text'].apply(clean_text)
all_data = all_data[['text']]
all_data.head()

CPU times: user 48 s, sys: 683 ms, total: 48.7 s
Wall time: 48.7 s


Unnamed: 0,text
0,private business government blockchaina major ...
1,epq draft wordshttpsuploadwikimediaorgwikipedi...
2,ascent data science sas big data analyst train...
3,robot love us better another human cani discus...
4,big data ai iot use casesan active list intere...


## Save cleaned data as CSV

In [9]:
# Write the cleaned frame to disk; index=False keeps the row index out of the file.
all_data.to_csv(
    path_or_buf="../Processed Data/medium_articles_cleaned.csv",
    index=False,
)