In [None]:
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

In [2]:
# Load the aggregated project dataset from its CSV file.
raw_csv_path = 'data_aggregation/dataset_for_project/cs5246_data_for_final_proj.csv'
df = pd.read_csv(raw_csv_path)

In [3]:
# Preview the first rows of the freshly loaded frame to check its columns
# (text, label, source, word_count, char_count).
df.head()

Unnamed: 0,text,label,source,word_count,char_count
0,Donald Trump just couldn t wish all Americans ...,0,NewsDataSet,495,2893
1,House Intelligence Committee Chairman Devin Nu...,0,NewsDataSet,305,1898
2,"On Friday, it was revealed that former Milwauk...",0,NewsDataSet,580,3597
3,"On Christmas day, Donald Trump announced that ...",0,NewsDataSet,444,2774
4,Pope Francis used his annual Christmas Day mes...,0,NewsDataSet,420,2346


In [4]:
# Summary statistics of the numeric columns before any filtering; the
# word_count / char_count ranges show how widely document length varies.
df.describe()

Unnamed: 0,label,word_count,char_count
count,97340.0,97340.0,97340.0
mean,0.602856,462.951377,2806.553298
std,0.489309,549.671312,3282.985534
min,0.0,0.0,1.0
25%,0.0,147.0,899.0
50%,1.0,358.0,2170.0
75%,1.0,582.0,3538.0
max,1.0,24234.0,142961.0


In [5]:
# Keep only mid-length documents: strictly more than 50 and strictly fewer
# than 800 words, judged by the precomputed word_count column.
word_counts = df["word_count"]
is_mid_length = (word_counts > 50) & (word_counts < 800)

# Renumber the surviving rows from 0, discarding the old index.
df = df[is_mid_length].reset_index(drop=True)

In [6]:
# Re-check the summary statistics after the length filter: word_count
# should now lie strictly between 50 and 800.
df.describe()

Unnamed: 0,label,word_count,char_count
count,74319.0,74319.0,74319.0
mean,0.630202,343.36529,2093.330629
std,0.482753,193.126066,1178.759544
min,0.0,51.0,264.0
25%,0.0,179.0,1091.0
50%,1.0,339.0,2055.0
75%,1.0,469.0,2845.0
max,1.0,799.0,7467.0


In [7]:
# Count the non-null documents per class label to gauge class balance.
df.groupby("label")[["text"]].count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,27483
1,46836


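The classes are imbalanced, as expected: label 1 has 46,836 documents versus 27,483 for label 0 (about 63% vs. 37%), which is consistent with there being more real news than fake news. We may need to undersample the majority class before training.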

### *Case Folding*


In [8]:
# Case-fold every document so that later token comparisons ignore case.
lowercased_text = df["text"].str.lower()
df["text"] = lowercased_text

### *Remove stop words*

In [9]:
def remove_stop_words(sentence):
    """Return ``sentence`` with English stop words removed.

    The sentence is split on whitespace and each token is lower-cased before
    being looked up in ``ENGLISH_STOP_WORDS``. Tokens are compared as-is, so
    a token with punctuation attached (e.g. ``"the,"``) is only dropped if
    that exact string is in the stop-word set.

    Args:
        sentence: The text to filter.

    Returns:
        The surviving tokens, in their original order and casing, joined by
        single spaces (any run of whitespace in the input is collapsed).
    """
    kept_tokens = []
    for token in sentence.split():
        if token.lower() in ENGLISH_STOP_WORDS:
            continue  # stop word: leave it out of the result
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Strip stop words from every document in the text column.
text_without_stop_words = df["text"].apply(remove_stop_words)
df["text"] = text_without_stop_words

### *Lemmatisation*

In [10]:
# Fetch the NLTK resources used by the lemmatisation step: WordNet and the
# Open Multilingual WordNet back the lemmatizer, and the Punkt models back
# word_tokenize. NLTK 3.9+ loads "punkt_tab" instead of the pickled "punkt"
# model, so both are requested to keep the notebook runnable on a fresh
# environment regardless of the installed NLTK version.
for resource in ("wordnet", "omw-1.4", "punkt", "punkt_tab"):
    nltk.download(resource)

# Initialise the WordNet lemmatizer used by lemmatize_text.
wnl = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/paopao_ch/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/paopao_ch/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/paopao_ch/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# Lemmatise a whole document, token by token.
def lemmatize_text(text):
    """Tokenise ``text`` and replace every token with its WordNet lemma.

    Tokens come from ``word_tokenize`` and each one is passed to
    ``wnl.lemmatize`` with no part-of-speech argument, so the lemmatizer's
    default part of speech applies.

    Args:
        text: The document to lemmatise.

    Returns:
        The lemmatised tokens joined by single spaces. Punctuation that the
        tokenizer splits off therefore appears as its own space-separated
        token in the result.
    """
    return ' '.join(map(wnl.lemmatize, word_tokenize(text)))

In [12]:
# Lemmatise every document in the text column.
lemmatised_text = df['text'].apply(lemmatize_text)
df['text'] = lemmatised_text

In [33]:
# Inspect the first rows after case folding, stop-word removal and
# lemmatisation to sanity-check the transformed text column.
df.head()

Unnamed: 0,text,label,source,word_count,char_count
0,donald trump just couldn t wish american happy...,0,NewsDataSet,495,2893
1,house intelligence committee chairman devin nu...,0,NewsDataSet,305,1898
2,"friday , revealed milwaukee sheriff david clar...",0,NewsDataSet,580,3597
3,"christmas day , donald trump announced work fo...",0,NewsDataSet,444,2774
4,pope francis used annual christmas day message...,0,NewsDataSet,420,2346


In [35]:
# Write the preprocessed frame to CSV, leaving out the row index.
lemmatised_csv_path = "data_aggregation/dataset_for_project/cs5246_data_for_final_proj_lemmatised.csv"
df.to_csv(lemmatised_csv_path, index=False)