In [1]:
import os
import re
import nltk
import sqlite3
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [2]:
# Download necessary NLTK resources
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /home/mr_bot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mr_bot/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mr_bot/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/mr_bot/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# 1. Connect to the database and fetch data
path = os.path.join('..', 'Database', 'news.db')
conn = sqlite3.connect(path)
df = pd.read_sql("SELECT id, text, label FROM news", conn)

In [4]:
df.head(10)

Unnamed: 0,id,text,label
0,1,WASHINGTON (Reuters) - The head of a conservat...,real
1,2,WASHINGTON (Reuters) - Transgender people will...,real
2,3,WASHINGTON (Reuters) - The special counsel inv...,real
3,4,WASHINGTON (Reuters) - Trump campaign adviser ...,real
4,5,SEATTLE/WASHINGTON (Reuters) - President Donal...,real
5,6,"WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...",real
6,7,"WEST PALM BEACH, Fla (Reuters) - President Don...",real
7,8,The following statements were posted to the ve...,real
8,9,The following statements were posted to the ve...,real
9,10,WASHINGTON (Reuters) - Alabama Secretary of St...,real


In [5]:
# 2. Initialize stopwords, stemmer, and lemmatizer
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer() 

In [6]:
def preprocess_text(text):
    # 3. Convert text to lowercase
    text = text.lower()
    
    # 4. Remove punctuation and numbers
    text = re.sub(r'[^a-z\s]', '', text) 
    
    # 5. Tokenize text into words
    words = word_tokenize(text)
    
    # 6. Remove stopwords
    words = [word for word in words if word not in stop_words]
    
    # 7. Apply lemmatization (or stemming as an alternative)
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # 8. Convert the list of words back to a string
    return " ".join(words)

In [7]:
# Apply preprocessing to all text data
df["cleaned_text"] = df["text"].apply(preprocess_text)

In [8]:
df.head()

Unnamed: 0,id,text,label,cleaned_text
0,1,WASHINGTON (Reuters) - The head of a conservat...,real,washington reuters head conservative republica...
1,2,WASHINGTON (Reuters) - Transgender people will...,real,washington reuters transgender people allowed ...
2,3,WASHINGTON (Reuters) - The special counsel inv...,real,washington reuters special counsel investigati...
3,4,WASHINGTON (Reuters) - Trump campaign adviser ...,real,washington reuters trump campaign adviser geor...
4,5,SEATTLE/WASHINGTON (Reuters) - President Donal...,real,seattlewashington reuters president donald tru...


In [9]:
df.drop(['text'], axis=1, inplace=True)
df = df.loc[:, ['id', 'cleaned_text', 'label']]

In [10]:
df.head()

Unnamed: 0,id,cleaned_text,label
0,1,washington reuters head conservative republica...,real
1,2,washington reuters transgender people allowed ...,real
2,3,washington reuters special counsel investigati...,real
3,4,washington reuters trump campaign adviser geor...,real
4,5,seattlewashington reuters president donald tru...,real


In [11]:
#df.to_sql('cleanedText', conn, if_exists="append", index=False)