In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from spacy import displacy
from spacy import tokenizer
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import gensim
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import LsiModel, TfidfModel
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
# Global plotting defaults: the bar colour reused by the pandas plots below,
# and the figure size applied to every chart.
default_plot_colour = "#00bfbf"
plt.rc('figure', figsize=(12, 8))


In [None]:
# Load the news dataset from a CSV path relative to the working directory
# and preview the first rows.
data = pd.read_csv('fake_news_data.csv')
data.head()

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
data.info()

In [None]:
# Bar chart of how many articles carry each `fake_or_factual` label.
category_counts = data['fake_or_factual'].value_counts()
ax = category_counts.plot(kind='bar', color=default_plot_colour)
ax.set_title('Number of Articles in Each Category')
ax.set_xlabel('Category')
ax.set_ylabel('Number of Articles')
plt.xticks(rotation=0)
plt.show()

# POS Tagging

**Part-of-Speech (POS) tagging** is the process of labeling each word in a text with its grammatical role, such as noun, verb, adjective, etc. POS tagging helps computers understand the structure and meaning of sentences by identifying how words function in context.

**Common POS tags include:**
- Noun (N): person, place, thing (e.g., "dog", "city")
- Verb (V): action or state (e.g., "run", "is")
- Adjective (ADJ): describes a noun (e.g., "happy", "blue")
- Adverb (ADV): modifies a verb, adjective, or adverb (e.g., "quickly", "very")
- Pronoun (PRON): replaces a noun (e.g., "he", "they")
- Preposition (PREP): shows relationship (e.g., "in", "on")
- Conjunction (CONJ): connects words or phrases (e.g., "and", "but")
- Determiner (DET): specifies a noun (e.g., "the", "some")

**Why is POS tagging important?**
- Enables deeper text analysis and understanding
- Useful for information extraction, named entity recognition, and syntactic parsing
- Helps improve the accuracy of downstream NLP tasks like sentiment analysis and machine translation




In [None]:
# Load the small English pipeline. Download it only when it is not installed,
# so re-running the notebook does not hit the network every time.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:  # spaCy raises OSError when the model package cannot be found
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

In [None]:
# Keep only the rows labelled as fake news.
is_fake = data['fake_or_factual'].eq('Fake News')
fake_news = data.loc[is_fake]
fake_news.head()


In [None]:

# Keep only the rows labelled as factual news.
is_factual = data['fake_or_factual'].eq('Factual News')
factual_news = data.loc[is_factual]
factual_news.head()

Using `nlp.pipe` is much faster than looping over each text with `nlp(text)`, because `nlp.pipe` processes texts in batches and is optimized for large datasets.

In [None]:
# Run the spaCy pipeline over every article text, materialising the results
# of nlp.pipe as lists of Doc objects.
fake_spacydocs = list(nlp.pipe(fake_news['text']))
factual_spacydocs = list(nlp.pipe(factual_news['text']))

# Inspect the first fake-news document: raw text, its entities, and the first entity.
first_doc = fake_spacydocs[0]
print(first_doc.text)
print(first_doc.ents)
if first_doc.ents:  # a document may contain no named entities; avoid IndexError
    print(first_doc.ents[0].label_)
    print(first_doc.ents[0].text)

In [None]:
def extract_token_tags(doc: "spacy.tokens.doc.Doc") -> list:
    """
    Extract the text, named-entity type and part-of-speech tag of every token.

    Args:
        doc (spacy.tokens.doc.Doc): The spaCy Doc object to extract from.

    Returns:
        list: One ``(token.text, token.ent_type_, token.pos_)`` tuple per token,
        in document order. The list is empty when the document has no tokens.
    """
    return [(token.text, token.ent_type_, token.pos_) for token in doc]

In [None]:
# Initialised empty here; a later cell replaces it with the concatenated tag DataFrame.
fake_tags_df = []
# Column names for the (token text, entity type, POS tag) tuples
# returned by extract_token_tags.
columns = ['token', 'ner_tag', 'pos_tag']


In [None]:

# Build one tag table per fake-news document, then stack them into a single frame.
# The per-document frames are collected in their own list so this cell can be
# re-run: appending to `fake_tags_df` after it has become a DataFrame would fail.
fake_tag_frames = [
    pd.DataFrame(extract_token_tags(doc), columns=columns)
    for doc in fake_spacydocs
]
fake_tags_df = pd.concat(fake_tag_frames, ignore_index=True)
fake_tags_df.head()

In [None]:
# Initialised empty here; the next cell rebuilds it as the factual-news tag table.
factual_tags_df = []
# the columns are the same as for the fake news

In [None]:
# One tag table per factual-news document, stacked into a single frame.
factual_tag_frames = []
for doc in factual_spacydocs:
    doc_tags = extract_token_tags(doc)
    factual_tag_frames.append(pd.DataFrame(doc_tags, columns=columns))
factual_tags_df = pd.concat(factual_tag_frames, ignore_index=True)
factual_tags_df.head()

# POS Frequency Analysis

This section analyzes the frequency of different part-of-speech tags in both fake and factual news articles. By comparing POS tag distributions, we can identify linguistic patterns that may distinguish fake news from factual reporting.

**What this analysis does:**
- Counts how often each grammatical category (NOUN, VERB, ADJ, etc.) appears in fake vs factual news
- Groups tokens by their part-of-speech tags to find the most common word types
- Compares linguistic patterns between fake and factual news articles
- Provides quantitative evidence of grammatical differences in news types

**Key insights:**
- Shows which grammatical categories (nouns, verbs, adjectives, etc.) are most common in each news type
- Helps identify potential linguistic markers for fake news detection
- Provides foundation for feature engineering in classification models

In [None]:
# Count every (token, POS tag) pair in fake news, most frequent first.
pos_counts_fake = (
    fake_tags_df
    .groupby(['token', 'pos_tag'])
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)
# Top 10 most common token / POS-tag pairs
pos_counts_fake.head(10)

In [None]:
# Count every (token, POS tag) pair in factual news, most frequent first.
pos_counts_factual = (
    factual_tags_df
    .groupby(['token', 'pos_tag'])
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)
# Top 10 most common token / POS-tag pairs
pos_counts_factual.head(10)

## POS Tag Frequency Analysis

This code analyzes the diversity of vocabulary within each part-of-speech category, first for fake news articles and then for factual news articles:

**What it does:**
- Groups the data by POS tags (NOUN, VERB, ADJ, etc.)
- Counts how many **unique token types** exist for each POS tag
- Sorts results in descending order to show categories with the most vocabulary diversity
- Displays the top 10 POS tags with the highest token variety

**What the results tell us:**
- Which grammatical categories have the richest vocabulary in fake news
- The linguistic complexity and diversity of different word types
- Potential indicators of writing style differences between fake and factual news

In [None]:
# pos_counts_fake holds one row per (token, POS tag) pair, so counting rows per
# tag gives the number of distinct tokens seen with that tag. Show the 10 largest.
(
    pos_counts_fake
    .groupby(['pos_tag'])['token']
    .count()
    .sort_values(ascending=False)
    .head(10)
)


In [None]:
# pos_counts_factual holds one row per (token, POS tag) pair, so counting rows per
# tag gives the number of distinct tokens seen with that tag. Show the 10 largest.
(
    pos_counts_factual
    .groupby(['pos_tag'])['token']
    .count()
    .sort_values(ascending=False)
    .head(10)
)


## Most Common Nouns Analysis

This analysis filters the data to show only NOUN tokens and identifies the most frequently used nouns, first in fake news articles and then in factual news articles. By examining the top 10 most common nouns in each, we can understand what topics and entities are most prominently discussed in each type of content.

In [None]:
# The 10 most frequent NOUN tokens in fake news.
(
    pos_counts_fake
    .loc[lambda counts: counts['pos_tag'] == 'NOUN']
    .sort_values(by='counts', ascending=False)
    .head(10)
)
    

In [None]:
# The 10 most frequent NOUN tokens in factual news.
(
    pos_counts_factual
    .loc[lambda counts: counts['pos_tag'] == 'NOUN']
    .sort_values(by='counts', ascending=False)
    .head(10)
)


# Named Entities 
This section analyzes named entities in the dataset, focusing on how often different types of entities (like PERSON, ORGANIZATION, GPE, etc.) appear in fake vs factual news articles. Named entity recognition (NER) helps identify specific people, organizations, locations, and other important entities mentioned in the text.

In [None]:
# Preview the token / ner_tag / pos_tag rows; tokens outside any entity have an empty ner_tag.
fake_tags_df.head(10)

In [None]:
# get the most common named entities in fake news
# Most common named-entity tokens in fake news: drop tokens with an empty
# entity type, then count each (entity type, token) pair.
fake_entity_tokens = fake_tags_df.loc[fake_tags_df['ner_tag'].ne('')]
fake_ner_counts = (
    fake_entity_tokens
    .groupby(['ner_tag', 'token'])
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)
fake_ner_counts.head(10)

In [None]:
# get the most common named entities in factual news
# Most common named-entity tokens in factual news: drop tokens with an empty
# entity type, then count each (entity type, token) pair.
factual_entity_tokens = factual_tags_df.loc[factual_tags_df['ner_tag'].ne('')]
factual_ner_counts = (
    factual_entity_tokens
    .groupby(['ner_tag', 'token'])
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)
factual_ner_counts.head(10)

In [None]:
# Bar chart of the 15 most frequent (entity type, token) pairs in fake news,
# with one colour per entity type.
top_fake_entities = fake_ner_counts.head(15)
ner_ax = sns.barplot(
    data=top_fake_entities,
    x='counts',
    y='token',
    hue='ner_tag',
    dodge=True,
    orient='horizontal',
    palette='Set2',
)
ner_ax.set_title('Most Common Named Entities in Fake News')



# Clean up the text

In [None]:
# Look at the raw factual-news rows before cleaning the text.
factual_news.head()

Remove the leading news-agency name and city name: everything from the start of the text up to and including the first dash that is followed by a space.

In [None]:
# Strip the leading "<anything without a dash>- " prefix from each article.
# The pattern is anchored at the start, so at most one prefix is removed per text.
leading_prefix = re.compile(r'^[^-]*-\s')
data['text clean'] = data['text'].apply(lambda article: leading_prefix.sub('', article))
data.head(10)


In [None]:
# Lowercase the cleaned text.
data['text clean'] = data['text clean'].str.lower()
# Remove all punctuation: every character that is neither a word character nor whitespace.
# Applied to the single column instead of row-wise over the whole frame
# (axis=1 builds a Series per row only to read one field).
data['text clean'] = data['text clean'].apply(lambda text: re.sub(r'[^\w\s]', '', text))


In [None]:
# Check the lowercased, punctuation-free 'text clean' column.
data.head(10)

In [None]:
# remove stopwords
try:
    en_stopwords = stopwords.words('english')
except LookupError:
    # The stopword corpus is not installed yet: fetch it once, then retry.
    nltk.download('stopwords')
    en_stopwords = stopwords.words('english')

# Membership tests against a set are constant-time; checking every word against
# the original list was a linear scan per word.
stopword_set = set(en_stopwords)
data['text clean'] = data['text clean'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stopword_set])
)


In [None]:
# Check 'text clean' after stopword removal.
data.head(10)

In [None]:
# Split each cleaned article into a list of word tokens with NLTK.
tokenized_text = data['text clean'].map(word_tokenize)
data['text clean'] = tokenized_text
data.head(10)

In [None]:
# Lemmatize every token with WordNet (default part-of-speech setting).
nltk.download('wordnet')
lemetizer = WordNetLemmatizer()
data['text clean'] = data['text clean'].map(
    lambda tokens: list(map(lemetizer.lemmatize, tokens))
)
data.head(10)

In [None]:
# Flatten the per-article token lists into one list of all tokens.
# A single comprehension is linear in the number of tokens; `sum(lists, [])`
# copies the growing result for every article, which is quadratic.
tokens_clean = [token for tokens in data['text clean'] for token in tokens]
print(f'Total number of tokens: {len(tokens_clean)}')
tokens_clean[:10]

In [None]:
# Frequency table of single tokens (1-grams) over the whole corpus.
unigram_tuples = list(nltk.ngrams(tokens_clean, 1))
unigrams = pd.Series(unigram_tuples).value_counts().reset_index()
unigrams.head(10)

In [None]:
# Frequency table of adjacent token pairs (2-grams) over the whole corpus.
bigram_tuples = list(nltk.ngrams(tokens_clean, 2))
bigrams = pd.Series(bigram_tuples).value_counts().reset_index()
bigrams.head(10)

# Sentiment Analysis of the fake and factual news articles

In [None]:
# VADER sentiment analyzer, created once and reused for every article.
vader_analyzer = SentimentIntensityAnalyzer()

In [None]:
def _compound_score(text):
    """Return the VADER 'compound' score for one raw article text."""
    return vader_analyzer.polarity_scores(text)['compound']


# Score the original (uncleaned) text of every article.
data['vader_sentiment'] = data['text'].map(_compound_score)
data.head(10)

In [None]:
# Bucket the compound score into three labels using edges at -1, -0.1, 0.1 and 1.
bins = [-1, -0.1, 0.1, 1]
# NOTE(review): "pozitive" is misspelled; kept unchanged in case other cells match on it.
bin_names = ["negative", "neutral", "pozitive"]
# include_lowest=True closes the first interval on the left, so a score of
# exactly -1 is labelled "negative" instead of becoming NaN.
data['vader_sentiment_label'] = pd.cut(
    data['vader_sentiment'], bins=bins, labels=bin_names, include_lowest=True
)
data.head(10)



In [None]:
# Bar chart of how many articles fall under each sentiment label.
label_counts = data['vader_sentiment_label'].value_counts()
sentiment_ax = label_counts.plot(kind='bar', color=default_plot_colour)
sentiment_ax.set_title('Distribution of Sentiment Labels')
sentiment_ax.set_xlabel('Sentiment Label')
sentiment_ax.set_ylabel('Number of Articles')
plt.xticks(rotation=0)
plt.show()

- Plot how many fake and factual news articles are positive
- Plot how many fake and factual news articles are neutral
- Plot how many fake and factual news articles are negative

In [None]:
# Grouped bar chart: for each news category, the number of articles per sentiment label.
composition_ax = sns.countplot(
    data=data,
    x='fake_or_factual',
    hue='vader_sentiment_label',
    palette='Set2',
)
composition_ax.set_title('Composition of Sentiment Labels in Each Category')
composition_ax.set_xlabel('Category')
composition_ax.set_ylabel('Number of Articles')
plt.xticks(rotation=0)
plt.legend(title='Sentiment Label')
plt.show()


# Topic Modeling
Topic modeling is a technique used to discover abstract topics within a collection of documents. It helps in understanding the underlying themes present in the text data. In this analysis, we will apply topic modeling to both fake and factual news articles to identify the main topics discussed in each category.

In [None]:
# Cleaned token lists of the fake-news articles, re-indexed from 0.
fake_rows = data['fake_or_factual'] == 'Fake News'
fake_news_text = data.loc[fake_rows, 'text clean'].reset_index(drop=True)
fake_news_text.head(10)


In [None]:
# gensim dictionary built from the fake-news token lists; used below as id2word.
dict_fake = corpora.Dictionary(fake_news_text)


In [None]:
# Bag-of-words representation of every fake-news article, computed once.
bow_fake = [dict_fake.doc2bow(text) for text in fake_news_text]
# Same corpus under the name used by the TF-IDF / LSI cells; the original
# recomputed an identical list here.
doc_term_fake = bow_fake

In [None]:
# Result containers for the topic-count sweep in the next cell.
coherence_scores = []
model_list = []

# Inclusive range of topic counts to evaluate.
min_topics = 2
max_topics = 11

In [None]:
# Train one LDA model per topic count and record its c_v coherence.
# The result lists are reset here so that re-running this cell does not append a
# second sweep, which would make the scores longer than the x-axis range when plotted.
coherence_scores = []
model_list = []
for num_topics_i in range(min_topics, max_topics + 1):
    print(f'Processing {num_topics_i} topics...')
    model = gensim.models.LdaModel(
        id2word=dict_fake,
        num_topics=num_topics_i,
        corpus=bow_fake,
        random_state=0,  # fixed seed so the sweep is reproducible across runs
    )
    model_list.append(model)
    coherence_model = CoherenceModel(
        model=model,
        texts=fake_news_text,
        dictionary=dict_fake,
        coherence='c_v'
    )
    coherence_scores.append(coherence_model.get_coherence())

In [None]:
# Coherence of the LDA models trained above, by number of topics.
topic_counts = range(min_topics, max_topics + 1)
plt.plot(topic_counts, coherence_scores, marker='o')
# The scores plotted here come from LdaModel, so the title says LDA
# (it previously said "LSI").
plt.title('Coherence Scores for LDA Models')
plt.xlabel('Number of Topics')
plt.ylabel('Coherence Score')
plt.xticks(topic_counts)
plt.grid()
plt.show()

In [None]:
# Final LDA model with the chosen number of topics.
num_topics_lda = 4
lda_model = gensim.models.LdaModel(
    id2word=dict_fake,
    num_topics=num_topics_lda,
    corpus=bow_fake,
    random_state=0,  # fixed seed so the printed topics are reproducible
)
# Top 5 words of each topic
print(lda_model.print_topics(num_words=5))

In [None]:
def tfidf_corpus(doc_term_matrix):
    """
    Fit a TF-IDF model on a bag-of-words corpus and apply it to that same corpus.

    Args:
        doc_term_matrix: The bag-of-words corpus to weight. In this notebook it is
            a list with one ``Dictionary.doc2bow`` result per document.

    Returns:
        The result of indexing the fitted ``TfidfModel`` with ``doc_term_matrix``,
        i.e. the TF-IDF-weighted corpus (normalisation enabled).
    """
    weighting = TfidfModel(corpus=doc_term_matrix, normalize=True)
    weighted_corpus = weighting[doc_term_matrix]
    return weighted_corpus

In [None]:
def get_coherence_scores(corpus, dictionary, texts, min_topics=2, max_topics=11):
    """
    Compute and plot c_v coherence scores for LSI models with a varying number of topics.

    One ``LsiModel`` is trained for every topic count in ``min_topics..max_topics``
    (inclusive), its coherence is measured with ``CoherenceModel``, and the scores
    are drawn as a line plot.

    Args:
        corpus: The corpus the LSI models are trained on.
        dictionary (corpora.Dictionary): The dictionary mapping of terms to IDs.
        texts: The tokenized documents used by the coherence measure.
        min_topics (int): Minimum number of topics to evaluate.
        max_topics (int): Maximum number of topics to evaluate (inclusive).

    Returns:
        list: Coherence scores, one per topic count, in increasing topic-count order.
    """
    topic_counts = range(min_topics, max_topics + 1)
    coherence_scores = []

    for num_topics_i in topic_counts:
        print(f'Processing {num_topics_i} topics...')
        model = LsiModel(
            id2word=dictionary,
            num_topics=num_topics_i,
            corpus=corpus)
        coherence_model = CoherenceModel(
            model=model,
            texts=texts,
            dictionary=dictionary,
            coherence='c_v'
        )
        coherence_scores.append(coherence_model.get_coherence())

    plt.plot(topic_counts, coherence_scores, marker='o')
    plt.title('Coherence Scores for LSI Models')
    plt.xlabel('Number of Topics')
    plt.ylabel('Coherence Score')
    plt.xticks(topic_counts)
    plt.grid()
    plt.show()

    # The docstring always promised the scores; actually hand them back to the caller.
    return coherence_scores

In [None]:
# TF-IDF-weight the fake-news bag-of-words corpus, then plot the coherence of
# LSI models with 2 to 11 topics trained on it.
corpus_tfidf_fake = tfidf_corpus(doc_term_fake)
get_coherence_scores(corpus_tfidf_fake, dict_fake, fake_news_text, min_topics=2, max_topics=11)

In [None]:
# Final LSI model with 3 topics on the TF-IDF-weighted fake-news corpus.
lsa_model = LsiModel(corpus=corpus_tfidf_fake, id2word=dict_fake, num_topics=3)
# Top 5 words of each topic
lsa_topics = lsa_model.print_topics(num_words=5)
print(lsa_topics)

# Custom classifier

In [None]:
# Recap of the frame, including the tokenized 'text clean' column used as classifier input.
data.head(10)

In [None]:
# Features: each article's token list joined back into one comma-separated string.
X = [','.join(map(str, text)) for text in data['text clean']]
# Target: the fake / factual label of each article.
y = data['fake_or_factual']
# Preview a few documents only; displaying all of X floods the cell output.
X[:3]

In [None]:
# Turn the joined documents into a document-term count matrix.
countvec = CountVectorizer()
countvec_fit = countvec.fit_transform(X)
feature_names = countvec.get_feature_names_out()

# Matrix dimensions and the first few vocabulary entries
print(f'Shape of the document-term matrix: {countvec_fit.shape}')
print(f'First 10 features: {feature_names[:10]}')


In [None]:
# For modelling, convert the count matrix to a dense DataFrame with one column
# per vocabulary term.
dense_counts = countvec_fit.toarray()
term_names = countvec.get_feature_names_out()
bag_of_words = pd.DataFrame(dense_counts, columns=term_names)
bag_of_words.head(10)

In [None]:
# Split the data into training (70%) and test (30%) sets.
# random_state is fixed so the split, and every metric computed from it,
# is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    bag_of_words, y, test_size=0.3, random_state=0)

# Logistic Regression Classifier
Logistic Regression is a statistical method used for binary classification tasks. In this analysis, we will build a logistic regression classifier to distinguish between fake and factual news articles based on their textual features.

In [None]:
# Fit logistic regression on the training split and evaluate on the test split.
lr = LogisticRegression(random_state=0)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

lr_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {lr_accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


# SGD Classifier
Stochastic Gradient Descent (SGD) Classifier is a linear classifier that uses stochastic gradient descent to optimize the model. It is particularly useful for large datasets and can handle both binary and multi-class classification tasks. In this analysis, we will implement an SGD classifier to classify news articles as fake or factual based on their content.

In [None]:
# Fit an SGD-trained linear classifier on the training split and evaluate on the test split.
svm = SGDClassifier(random_state=0)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

svm_accuracy = accuracy_score(y_test, y_pred_svm)
print(f'Accuracy: {svm_accuracy}')
print(classification_report(y_test, y_pred_svm))
print(confusion_matrix(y_test, y_pred_svm))
# create a TF-IDF vectorizer