In [1]:
import pandas as pd

# Read the dataset from the CSV file and preview its first rows.
DATASET_PATH = 'cleaned_dataset.csv'
df = pd.read_csv(DATASET_PATH)

first_rows = df.head()
print(first_rows)

                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                             content  label  
0  WASHINGTON (Reuters) - The head of a conservat...    1.0  
1  WASHINGTON (Reuters) - Transgender people will...    1.0  
2  WASHINGTON (Reuters) - The special counsel inv...    1.0  
3  WASHINGTON (Reuters) - Trump campaign adviser ...    1.0  
4  SEATTLE/WASHINGTON (Reuters) - President Donal...    1.0  


In [3]:
import re


# Text normalisation for the article body: lowercase, delete every character
# that is neither a word character nor whitespace, then collapse whitespace
# runs into single spaces and trim both ends.
# NOTE(review): punctuation is deleted rather than replaced by a space, so
# words joined only by punctuation fuse together (for example
# "SEATTLE/WASHINGTON" ends up as "seattlewashington").
def _strip_special_chars(text):
    """Return ``text`` lowercased with non-word, non-whitespace characters removed."""
    return re.sub(r'[^\w\s]', '', text.lower())


def _squeeze_whitespace(text):
    """Return ``text`` with whitespace runs collapsed to one space and ends trimmed."""
    return re.sub(r'\s+', ' ', text).strip()


without_special_chars = df['content'].apply(_strip_special_chars)
df['clean_message'] = without_special_chars.apply(_squeeze_whitespace)

# Show the raw text next to its cleaned counterpart.
cleaning_preview = df[['content', 'clean_message']].head()
print(cleaning_preview)


                                             content  \
0  WASHINGTON (Reuters) - The head of a conservat...   
1  WASHINGTON (Reuters) - Transgender people will...   
2  WASHINGTON (Reuters) - The special counsel inv...   
3  WASHINGTON (Reuters) - Trump campaign adviser ...   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...   

                                       clean_message  
0  washington reuters the head of a conservative ...  
1  washington reuters transgender people will be ...  
2  washington reuters the special counsel investi...  
3  washington reuters trump campaign adviser geor...  
4  seattlewashington reuters president donald tru...  


In [5]:
# Display the DataFrame, which now carries the clean_message column added above.
df

Unnamed: 0,title,content,label,clean_message
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,1.0,washington reuters the head of a conservative ...
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,1.0,washington reuters transgender people will be ...
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,1.0,washington reuters the special counsel investi...
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,1.0,washington reuters trump campaign adviser geor...
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,1.0,seattlewashington reuters president donald tru...
...,...,...,...,...
39100,The White House and The Theatrics of ‘Gun Cont...,21st Century Wire says All the world s a stage...,0.0,21st century wire says all the world s a stage...
39101,Activists or Terrorists? How Media Controls an...,Randy Johnson 21st Century WireThe majority ...,0.0,randy johnson 21st century wirethe majority of...
39102,"BOILER ROOM – No Surrender, No Retreat, Heads ...",Tune in to the Alternate Current Radio Network...,0.0,tune in to the alternate current radio network...
39103,Federal Showdown Looms in Oregon After BLM Abu...,21st Century Wire says A new front has just op...,0.0,21st century wire says a new front has just op...


In [7]:
# Split each cleaned article into a list of word tokens.
import nltk
from nltk.tokenize import word_tokenize

# Fetch the 'punkt' resource before tokenizing.
nltk.download('punkt')

# Run the tokenizer over every cleaned message.
clean_text = df['clean_message']
df['tokenized_message'] = clean_text.map(word_tokenize)

# Preview the cleaned text next to its tokens.
token_preview = df[['clean_message', 'tokenized_message']].head()
print(token_preview)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prajw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                       clean_message  \
0  washington reuters the head of a conservative ...   
1  washington reuters transgender people will be ...   
2  washington reuters the special counsel investi...   
3  washington reuters trump campaign adviser geor...   
4  seattlewashington reuters president donald tru...   

                                   tokenized_message  
0  [washington, reuters, the, head, of, a, conser...  
1  [washington, reuters, transgender, people, wil...  
2  [washington, reuters, the, special, counsel, i...  
3  [washington, reuters, trump, campaign, adviser...  
4  [seattlewashington, reuters, president, donald...  


In [9]:
# Remove English stopwords from every token list.
from nltk.corpus import stopwords
nltk.download('stopwords')

# Held as a set so the membership checks in the filter below are fast.
stop_words = set(stopwords.words('english'))


def _without_stopwords(tokens):
    """Return the tokens that are not in ``stop_words``, keeping their order."""
    return [token for token in tokens if token not in stop_words]


df['filtered_message'] = df['tokenized_message'].apply(_without_stopwords)

# Preview the token lists before and after stopword removal.
stopword_preview = df[['tokenized_message', 'filtered_message']].head()
print(stopword_preview)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prajw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                   tokenized_message  \
0  [washington, reuters, the, head, of, a, conser...   
1  [washington, reuters, transgender, people, wil...   
2  [washington, reuters, the, special, counsel, i...   
3  [washington, reuters, trump, campaign, adviser...   
4  [seattlewashington, reuters, president, donald...   

                                    filtered_message  
0  [washington, reuters, head, conservative, repu...  
1  [washington, reuters, transgender, people, all...  
2  [washington, reuters, special, counsel, invest...  
3  [washington, reuters, trump, campaign, adviser...  
4  [seattlewashington, reuters, president, donald...  


In [11]:
# Lemmatize every remaining token with the WordNet lemmatizer.
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return a new list holding the lemmatized form of each token."""
    return list(map(lemmatizer.lemmatize, tokens))


df['lemmatized_message'] = df['filtered_message'].apply(_lemmatize_tokens)

# Preview the filtered tokens next to their lemmatized form.
lemma_preview = df[['filtered_message', 'lemmatized_message']].head()
print(lemma_preview)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prajw\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                    filtered_message  \
0  [washington, reuters, head, conservative, repu...   
1  [washington, reuters, transgender, people, all...   
2  [washington, reuters, special, counsel, invest...   
3  [washington, reuters, trump, campaign, adviser...   
4  [seattlewashington, reuters, president, donald...   

                                  lemmatized_message  
0  [washington, reuters, head, conservative, repu...  
1  [washington, reuters, transgender, people, all...  
2  [washington, reuters, special, counsel, invest...  
3  [washington, reuters, trump, campaign, adviser...  
4  [seattlewashington, reuters, president, donald...  


In [13]:
# Stem the lemmatized tokens with the English Snowball stemmer.
# NOTE(review): stemming runs on top of the already-lemmatized tokens, so
# content1 holds stems of lemmas rather than stems of the original words.
import nltk
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer('english')


def _stem_tokens(tokens):
    """Return a new list holding the Snowball stem of each token."""
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems


df['content1'] = df['lemmatized_message'].apply(_stem_tokens)

# Preview the lemmatized tokens next to their stems.
stem_preview = df[['lemmatized_message', 'content1']].head()
print(stem_preview)

                                  lemmatized_message  \
0  [washington, reuters, head, conservative, repu...   
1  [washington, reuters, transgender, people, all...   
2  [washington, reuters, special, counsel, invest...   
3  [washington, reuters, trump, campaign, adviser...   
4  [seattlewashington, reuters, president, donald...   

                                            content1  
0  [washington, reuter, head, conserv, republican...  
1  [washington, reuter, transgend, peopl, allow, ...  
2  [washington, reuter, special, counsel, investi...  
3  [washington, reuter, trump, campaign, advis, g...  
4  [seattlewashington, reuter, presid, donald, tr...  


In [15]:
# The raw text and the intermediate preprocessing columns are removed here;
# title, label and content1 remain.
# Note: re-running this cell raises KeyError because the columns are already gone.
intermediate_columns = [
    'content',
    'clean_message',
    'tokenized_message',
    'filtered_message',
    'lemmatized_message',
]
df.drop(columns=intermediate_columns, inplace=True)

In [17]:
# Final dataset after dropping the intermediate columns; content1 holds the
# stemmed token lists.
df

Unnamed: 0,title,label,content1
0,"As U.S. budget fight looms, Republicans flip t...",1.0,"[washington, reuter, head, conserv, republican..."
1,U.S. military to accept transgender recruits o...,1.0,"[washington, reuter, transgend, peopl, allow, ..."
2,Senior U.S. Republican senator: 'Let Mr. Muell...,1.0,"[washington, reuter, special, counsel, investi..."
3,FBI Russia probe helped by Australian diplomat...,1.0,"[washington, reuter, trump, campaign, advis, g..."
4,Trump wants Postal Service to charge 'much mor...,1.0,"[seattlewashington, reuter, presid, donald, tr..."
...,...,...,...
39100,The White House and The Theatrics of ‘Gun Cont...,0.0,"[21st, centuri, wire, say, world, stage, men, ..."
39101,Activists or Terrorists? How Media Controls an...,0.0,"[randi, johnson, 21st, centuri, wireth, major,..."
39102,"BOILER ROOM – No Surrender, No Retreat, Heads ...",0.0,"[tune, altern, current, radio, network, acr, a..."
39103,Federal Showdown Looms in Oregon After BLM Abu...,0.0,"[21st, centuri, wire, say, new, front, open, l..."


In [19]:
# Convert the float label values (1.0 / 0.0) to integers.
label_as_int = df['label'].astype(int)
df['label'] = label_as_int

In [21]:
df
# Final dataset, with the label column converted to integers.

Unnamed: 0,title,label,content1
0,"As U.S. budget fight looms, Republicans flip t...",1,"[washington, reuter, head, conserv, republican..."
1,U.S. military to accept transgender recruits o...,1,"[washington, reuter, transgend, peopl, allow, ..."
2,Senior U.S. Republican senator: 'Let Mr. Muell...,1,"[washington, reuter, special, counsel, investi..."
3,FBI Russia probe helped by Australian diplomat...,1,"[washington, reuter, trump, campaign, advis, g..."
4,Trump wants Postal Service to charge 'much mor...,1,"[seattlewashington, reuter, presid, donald, tr..."
...,...,...,...
39100,The White House and The Theatrics of ‘Gun Cont...,0,"[21st, centuri, wire, say, world, stage, men, ..."
39101,Activists or Terrorists? How Media Controls an...,0,"[randi, johnson, 21st, centuri, wireth, major,..."
39102,"BOILER ROOM – No Surrender, No Retreat, Heads ...",0,"[tune, altern, current, radio, network, acr, a..."
39103,Federal Showdown Looms in Oregon After BLM Abu...,0,"[21st, centuri, wire, say, new, front, open, l..."
