In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from google.colab import drive
# Mount your Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
# Read the dataset from the specified path
df = pd.read_csv('/content/drive/MyDrive/ML project/fake_real.csv', sep=',', encoding='utf-8', quotechar='"')


In [19]:
# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,title,text,subject,date,type
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",True
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",True
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",True
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",True
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",True


In [20]:
# Drop all columns except 'text' and 'type'
df = df[['text', 'type']]

# Verify the resulting DataFrame
df.head()

Unnamed: 0,text,type
0,WASHINGTON (Reuters) - The head of a conservat...,True
1,WASHINGTON (Reuters) - Transgender people will...,True
2,WASHINGTON (Reuters) - The special counsel inv...,True
3,WASHINGTON (Reuters) - Trump campaign adviser ...,True
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,True


**Text processing function using NLTK**

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re

# Download NLTK resources if not already present
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Text processing function using NLTK
def preprocess_nltk(text):
    # Tokenization
    tokens = word_tokenize(text)
    # Remove punctuation marks, special characters, and digits
    tokens = [token for token in tokens if token not in string.punctuation and not token.isdigit()]
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

# Apply text processing to the 'text' column
df['Cleaned_Text'] = df['text'].apply(preprocess_nltk)

# Display processed 'text' column
df.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,text,type,Cleaned_Text
0,WASHINGTON (Reuters) - The head of a conservat...,True,WASHINGTON Reuters head conservative Republica...
1,WASHINGTON (Reuters) - Transgender people will...,True,WASHINGTON Reuters Transgender people allowed ...
2,WASHINGTON (Reuters) - The special counsel inv...,True,WASHINGTON Reuters special counsel investigati...
3,WASHINGTON (Reuters) - Trump campaign adviser ...,True,WASHINGTON Reuters Trump campaign adviser Geor...
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,True,SEATTLE/WASHINGTON Reuters President Donald Tr...


**Text Processing with Gensim**
* Gensim, a library for topic modeling and similarity detection, lacks built-in functionality for all text preprocessing tasks. It only provides tokenization, but not punctuation marks, special characters, numbers/digits, stopwords removal, lemmatization, or joining tokens back into a single string. Additional libraries are needed for more advanced tasks.

In [14]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.parsing.porter import PorterStemmer

# Text processing function using Gensim
def preprocess_gensim(text):
    # Tokenization and remove stopwords
    tokens = [token for token in simple_preprocess(text) if token not in STOPWORDS]
    # Remove punctuation marks, special characters, and digits
    tokens = [re.sub(r'\W+', '', token) for token in tokens if not token.isdigit()]
    # Lemmatization
    return ' '.join(tokens)
# Apply text processing to the 'text' column
df['Cleaned_Text'] = df['text'].apply(preprocess_gensim)

# Display processed 'text' column
df.head()

Unnamed: 0,text,type,Cleaned_Text
0,WASHINGTON (Reuters) - The head of a conservat...,True,washington reuters head conservative republica...
1,WASHINGTON (Reuters) - Transgender people will...,True,washington reuters transgender people allowed ...
2,WASHINGTON (Reuters) - The special counsel inv...,True,washington reuters special counsel investigati...
3,WASHINGTON (Reuters) - Trump campaign adviser ...,True,washington reuters trump campaign adviser geor...
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,True,seattle washington reuters president donald tr...


**Text Processing with spaCy**
* "en_core_web_sm" is a smaller,while "en_core_web_ln" is a larger

In [17]:
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Text processing function using spaCy
def preprocess_spacy(text):
    # Process text with spaCy
    doc = nlp(text)
    # Remove punctuation marks, special characters, and numbers/digits
    tokens = [token.lemma_ for token in doc if not token.is_punct and not token.is_space and not token.is_digit]
    # Join tokens back into a single string
    return ' '.join(tokens)

# Apply text processing to the 'text' column
df['Cleaned_Text'] = df['text'].apply(preprocess_nltk)

# Display processed 'text' column
df.head()

Unnamed: 0,title,text,subject,date,type,Cleaned_Text
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",True,WASHINGTON Reuters head conservative Republica...
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",True,WASHINGTON Reuters Transgender people allowed ...
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",True,WASHINGTON Reuters special counsel investigati...
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",True,WASHINGTON Reuters Trump campaign adviser Geor...
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",True,SEATTLE/WASHINGTON Reuters President Donald Tr...


1. **In general**, while **spaCy** may have slightly longer processing times then **nltk** due to its more comprehensive functionality and language model loading
2. If we prioritize speed, efficiency, and accuracy in text processing, spaCy may be the better choice.
3. However, if we require more flexibility, customization, and a wide range of NLP tools, **NLTK** might be a better fit.

In [21]:
import re
import string
import nltk
import spacy

# Download NLTK resources if not already present
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Initialize NLTK components
stop_words = set(nltk.corpus.stopwords.words('english'))
lemmatizer = nltk.stem.WordNetLemmatizer()

def preprocess_text(text):
    # Tokenization using spaCy
    doc = nlp(text)
    tokens_spacy = [token.lemma_.lower() for token in doc if not token.is_punct and not token.is_space and not token.is_digit]

    # Tokenization and remove stopwords using NLTK
    tokens_nltk = nltk.word_tokenize(text.lower())
    tokens_nltk = [token for token in tokens_nltk if token not in stop_words and token not in string.punctuation]

    # Combine tokens from both NLTK and spaCy
    tokens_combined = tokens_spacy + tokens_nltk

    # Lemmatization using NLTK
    tokens_combined = [lemmatizer.lemmatize(token) for token in tokens_combined]

    # Join tokens back into a single string
    processed_text = ' '.join(tokens_combined)

    return processed_text

# Apply text processing to the 'text' column
df['Cleaned_Text'] = df['text'].apply(preprocess_nltk)

# Display processed 'text' column
df.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,text,type,Cleaned_Text
0,WASHINGTON (Reuters) - The head of a conservat...,True,WASHINGTON Reuters head conservative Republica...
1,WASHINGTON (Reuters) - Transgender people will...,True,WASHINGTON Reuters Transgender people allowed ...
2,WASHINGTON (Reuters) - The special counsel inv...,True,WASHINGTON Reuters special counsel investigati...
3,WASHINGTON (Reuters) - Trump campaign adviser ...,True,WASHINGTON Reuters Trump campaign adviser Geor...
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,True,SEATTLE/WASHINGTON Reuters President Donald Tr...


In this function:

1. We use spaCy for tokenization and lemmatization to take advantage of its efficiency and accuracy.
2. We use NLTK for additional tokenization, stopwords removal, and lemmatization.
3. We combine tokens from both NLTK and spaCy to leverage the strengths of both libraries.
4. Finally, we join the processed tokens back into a single string.