In [1]:
import pandas as pd

In [2]:
# Load the raw training set from the notebook's working directory.
data = pd.read_csv('train.csv')

In [3]:
# Discard the 'Unnamed: 0' column that came in with the CSV
# (presumably a row index written out when the file was saved — confirm).
data = data.drop(columns=['Unnamed: 0'])

In [4]:
data.head(5)

Unnamed: 0,NewID,Category,Title,Abstract
0,N88753,lifestyle,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the..."
1,N45436,news,Walmart Slashes Prices on Last-Generation iPads,Apple's new iPad releases bring big deals on l...
2,N23144,health,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...
3,N86255,health,Dispose of unwanted prescription drugs during ...,
4,N93187,news,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...


In [5]:
data.shape

(81222, 4)

# Data Preprocessing

Removing rows that contain NaN values

In [6]:
# Keep only the rows in which every column is populated.
data = data.dropna(axis=0, how='any')

In [7]:
data.shape

(77100, 4)

In [8]:
data.head(5)

Unnamed: 0,NewID,Category,Title,Abstract
0,N88753,lifestyle,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the..."
1,N45436,news,Walmart Slashes Prices on Last-Generation iPads,Apple's new iPad releases bring big deals on l...
2,N23144,health,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...
4,N93187,news,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...
5,N75236,health,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi..."


Converting all to lower case

In [9]:
# Lower-case both free-text columns in one step.
data = data.assign(
    Title=data['Title'].str.lower(),
    Abstract=data['Abstract'].str.lower(),
)

In [10]:
data.head(5)

Unnamed: 0,NewID,Category,Title,Abstract
0,N88753,lifestyle,"the brands queen elizabeth, prince charles, an...","shop the notebooks, jackets, and more that the..."
1,N45436,news,walmart slashes prices on last-generation ipads,apple's new ipad releases bring big deals on l...
2,N23144,health,50 worst habits for belly fat,these seemingly harmless habits are holding yo...
4,N93187,news,the cost of trump's aid freeze in the trenches...,lt. ivan molchanets peeked over a parapet of s...
5,N75236,health,i was an nba wife. here's how it affected my m...,"i felt like i was a fraud, and being an nba wi..."


Removing punctuation

In [11]:
import string

punctuation_chars = set(string.punctuation)

# Translation table that deletes every ASCII punctuation character.
# NOTE(review): characters are deleted, not replaced by a space, so hyphenated
# words fuse ("last-generation" -> "lastgeneration") and contractions lose
# their apostrophe ("here's" -> "heres") before the stopword step runs.
_punctuation_table = str.maketrans('', '', ''.join(punctuation_chars))

for text_column in ('Title', 'Abstract'):
    data[text_column] = data[text_column].apply(
        lambda text: text.translate(_punctuation_table)
    )

In [12]:
data.head(5)

Unnamed: 0,NewID,Category,Title,Abstract
0,N88753,lifestyle,the brands queen elizabeth prince charles and ...,shop the notebooks jackets and more that the r...
1,N45436,news,walmart slashes prices on lastgeneration ipads,apples new ipad releases bring big deals on la...
2,N23144,health,50 worst habits for belly fat,these seemingly harmless habits are holding yo...
4,N93187,news,the cost of trumps aid freeze in the trenches ...,lt ivan molchanets peeked over a parapet of sa...
5,N75236,health,i was an nba wife heres how it affected my men...,i felt like i was a fraud and being an nba wif...


Removing stopwords

In [13]:
import nltk
from nltk.corpus import stopwords

In [14]:
# Make sure the NLTK stopword corpus is available locally.
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

# Re-join each text from its whitespace-separated words, leaving out any word
# whose lower-cased form is an English stopword.
for text_column in ('Title', 'Abstract'):
    data[text_column] = data[text_column].apply(
        lambda text: ' '.join(
            token for token in text.split() if token.lower() not in stop_words
        )
    )

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bvidhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
data.head(5)

Unnamed: 0,NewID,Category,Title,Abstract
0,N88753,lifestyle,brands queen elizabeth prince charles prince p...,shop notebooks jackets royals cant live without
1,N45436,news,walmart slashes prices lastgeneration ipads,apples new ipad releases bring big deals last ...
2,N23144,health,50 worst habits belly fat,seemingly harmless habits holding back keeping...
4,N93187,news,cost trumps aid freeze trenches ukraines war,lt ivan molchanets peeked parapet sand bags fr...
5,N75236,health,nba wife heres affected mental health,felt like fraud nba wife didnt help fact nearl...


Lemmatization

In [16]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [17]:
def lemmatize_text_spacy(text):
    """Return `text` with every spaCy token replaced by its lemma.

    The text is run through the module-level `nlp` pipeline and the lemmas of
    the resulting tokens are joined with single spaces.
    """
    return ' '.join(token.lemma_ for token in nlp(text))

In [18]:
# Lemmatize both free-text columns, one value at a time.
for text_column in ('Title', 'Abstract'):
    data[text_column] = data[text_column].apply(lemmatize_text_spacy)

In [19]:
data.head(5)

Unnamed: 0,NewID,Category,Title,Abstract
0,N88753,lifestyle,brands queen elizabeth prince charles prince p...,shop notebooks jacket royal ca nt live without
1,N45436,news,walmart slash price lastgeneration ipad,apple new ipad release bring big deal last yea...
2,N23144,health,50 bad habit belly fat,seemingly harmless habit hold back keep shed u...
4,N93187,news,cost trumps aid freeze trench ukraines war,lt ivan molchanet peek parapet sand bag front ...
5,N75236,health,nba wife here affect mental health,feel like fraud nba wife did nt help fact near...


Replacing runs of non-word characters (anything except letters, digits, and underscores) with a single space

In [20]:
import re

In [21]:
def clean_text(text):
    """Replace each run of non-word characters in `text` with one space.

    "Non-word" is the regex class `\\W`: anything other than letters, digits
    and the underscore. Leading or trailing runs also become a single space.
    """
    # Splitting on the runs and re-joining with a space is the same as
    # substituting every run with a space.
    pieces = re.split(r'\W+', text)
    return ' '.join(pieces)

# Run clean_text over every value of the two free-text columns.
for text_column in ('Title', 'Abstract'):
    data[text_column] = data[text_column].apply(clean_text)

In [22]:
data.head(5)

Unnamed: 0,NewID,Category,Title,Abstract
0,N88753,lifestyle,brands queen elizabeth prince charles prince p...,shop notebooks jacket royal ca nt live without
1,N45436,news,walmart slash price lastgeneration ipad,apple new ipad release bring big deal last yea...
2,N23144,health,50 bad habit belly fat,seemingly harmless habit hold back keep shed u...
4,N93187,news,cost trumps aid freeze trench ukraines war,lt ivan molchanet peek parapet sand bag front ...
5,N75236,health,nba wife here affect mental health,feel like fraud nba wife did nt help fact near...


Removing numbers

In [23]:
# Delete every digit from both text columns.
# The pattern is a raw string so that `\d` reaches the regex engine unchanged;
# in a plain literal `\d` is an invalid string escape that Python warns about.
data['Title'] = data['Title'].str.replace(r'\d', '', regex=True)
data['Abstract'] = data['Abstract'].str.replace(r'\d', '', regex=True)

In [24]:
data.head(5)

Unnamed: 0,NewID,Category,Title,Abstract
0,N88753,lifestyle,brands queen elizabeth prince charles prince p...,shop notebooks jacket royal ca nt live without
1,N45436,news,walmart slash price lastgeneration ipad,apple new ipad release bring big deal last yea...
2,N23144,health,bad habit belly fat,seemingly harmless habit hold back keep shed u...
4,N93187,news,cost trumps aid freeze trench ukraines war,lt ivan molchanet peek parapet sand bag front ...
5,N75236,health,nba wife here affect mental health,feel like fraud nba wife did nt help fact near...


Removing single letters

In [25]:
# Delete stand-alone single-character words, then tidy the whitespace.
# Deleting a token leaves the spaces that surrounded it behind, so runs of
# whitespace are collapsed to one space and the ends are trimmed.
for text_column in ('Title', 'Abstract'):
    data[text_column] = (
        data[text_column]
        .str.replace(r'\b\w\b', '', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )

In [26]:
data.head(5)

Unnamed: 0,NewID,Category,Title,Abstract
0,N88753,lifestyle,brands queen elizabeth prince charles prince p...,shop notebooks jacket royal ca nt live without
1,N45436,news,walmart slash price lastgeneration ipad,apple new ipad release bring big deal last yea...
2,N23144,health,bad habit belly fat,seemingly harmless habit hold back keep shed u...
4,N93187,news,cost trumps aid freeze trench ukraines war,lt ivan molchanet peek parapet sand bag front ...
5,N75236,health,nba wife here affect mental health,feel like fraud nba wife did nt help fact near...


In [27]:
# Drop rows whose text was cleaned away entirely.
# The string operations above leave empty or whitespace-only strings, never
# NaN, so a bare dropna() would keep those rows (they would only turn into NaN
# once the saved CSV is read back). Convert blank strings to NaN first.
data = data.replace(r'^\s*$', float('nan'), regex=True).dropna()

In [28]:
# Write the cleaned frame to 'cleaned_data.csv' in the working directory.
# With to_csv's default index=True, the row index is written as an extra,
# unnamed first column.
# NOTE(review): consider index=False if no downstream reader relies on that column.
data.to_csv('cleaned_data.csv')