In [67]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
from nltk.corpus import stopwords


In [68]:
# Read each source CSV and tag it with its class in the same step
# (label 0 = fake news, label 1 = real news)
fake_politifact = pd.read_csv('politifact_fake.csv').assign(label=0)
real_politifact = pd.read_csv('politifact_real.csv').assign(label=1)
fake_gossipcop = pd.read_csv('gossipcop_fake.csv').assign(label=0)
real_gossipcop = pd.read_csv('gossipcop_real.csv').assign(label=1)

# Stack the four frames row-wise; ignore_index discards the per-file row
# numbers so the combined frame gets one fresh 0..n-1 index
data = pd.concat(
    [fake_politifact, real_politifact, fake_gossipcop, real_gossipcop],
    axis=0,
    ignore_index=True,
)


In [69]:
# Lazily-built cache of the English stop-word set, shared by every clean_text call
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def clean_text(text):
    """Normalise a piece of text for bag-of-words style processing.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, English stop words are
    dropped, and the surviving words are re-joined with single spaces.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example the NaN/None that pandas
        uses for a missing field) is treated as empty text instead of
        raising ``TypeError``.

    Returns
    -------
    str
        The cleaned text; ``''`` when nothing is left after cleaning.
    """
    # Guard against missing values so a single empty field cannot abort an
    # element-wise apply over a whole column
    if not isinstance(text, str):
        return ''

    # Remove special characters and numbers
    letters_only = re.sub('[^a-zA-Z]', ' ', text)

    # Convert text to lowercase and tokenize words
    words = letters_only.lower().split()
    if not words:
        # Nothing to filter, so the stop-word list is not needed
        return ''

    # Remove stopwords (the set is cached instead of being rebuilt per call)
    stop_words = _english_stop_words()

    # Join the remaining words back into a single string
    return ' '.join(word for word in words if word not in stop_words)

# Clean every headline element-wise and keep the result next to the raw title
data['cleaned_title'] = data['title'].map(clean_text)


In [70]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed random_state makes the shuffle, and so the split, repeatable
train_data, test_data = train_test_split(
    data,
    random_state=42,
    test_size=0.2,
)

In [71]:
# (rows, columns) of the training split
train_data.shape

(18556, 6)

In [72]:
# Preview the first rows of the training split, including the cleaned titles
train_data.head()


Unnamed: 0,id,news_url,title,tweet_ids,label,cleaned_title
9198,gossipcop-873539,https://www.etonline.com/news/223794_samuel_l_...,Samuel L. Jackson Schools James Corden in 'Dro...,897450356900114432\t897450895167729664\t897451...,1,samuel l jackson schools james corden drop mic...
14985,gossipcop-853226,https://www.nydailynews.com/new-york/nyc-crime...,Alleged Taylor Swift stalker waited on roof of...,864840870826090498\t864840973523623936\t864841...,1,alleged taylor swift stalker waited roof manha...
15100,gossipcop-850005,https://medium.com/@AlexChaney303138N/from-hop...,From Hopeful Australia Beginnings to a Tragic ...,864785411520049153\t864786362523820038\t864786...,1,hopeful australia beginnings tragic hollywood ...
6186,gossipcop-4516431341,variety.com/2017/legit/awards/tony-awards-2017...,Tony Awards 2017: Complete Winners List,873270018309447680\t873271704667205632\t873272...,0,tony awards complete winners list
11038,gossipcop-891466,https://en.wikipedia.org/wiki/Disappearance_of...,Disappearance of Madeleine McCann,926802996821970944\t926803044611829760\t926804...,1,disappearance madeleine mccann


In [73]:
# (rows, columns) of the held-out test split
test_data.shape

(4640, 6)

In [74]:
# Preview the first rows of the held-out test split, including the cleaned titles
test_data.head()

Unnamed: 0,id,news_url,title,tweet_ids,label,cleaned_title
4291,gossipcop-3430607867,www.today.com/video/mariah-careys-twins-steal-...,Mariah Carey’s twins steal spotlight at Hollyw...,629276787600560128\t629286113706795009\t629326...,0,mariah carey twins steal spotlight hollywood w...
14922,gossipcop-930854,https://people.com/tv/teen-mom-bombshells/,Teen Mom's Most Bombshell and Dramatic Moments,991304765010333696\t991305082875662336\t991305...,1,teen mom bombshell dramatic moments
19868,gossipcop-911475,https://www.vanityfair.com/hollywood/2018/02/s...,S.N.L.: Watch Natalie Portman Rap About Star W...,960156003277856768\t960175040087150592\t960178...,1,n l watch natalie portman rap star wars preque...
12669,gossipcop-847921,https://www.springfieldspringfield.co.uk/view_...,The Arrangement (2017) s02e02 Episode Script,859786149417504770\t859786174889472000\t859787...,1,arrangement e episode script
12570,gossipcop-886689,https://en.paperblog.com/the-platinum-life-rec...,The Platinum Life Recap: The Ladies Take a Tri...,919764599016058880\t919764869817266176\t919765...,1,platinum life recap ladies take trip vegas aly...


In [75]:
def load_glove_vectors(glove_file):
    """Load whitespace-separated word embeddings from a text file.

    Each useful line is expected to look like ``word v1 v2 ... vn``: the first
    field is the token and the remaining fields are its vector components.

    Parameters
    ----------
    glove_file : str
        Path to the embeddings file; it is read as UTF-8 text.

    Returns
    -------
    dict
        Maps each word to a 1-D ``float32`` NumPy array. If a word appears on
        more than one line, the last occurrence wins.

    Raises
    ------
    ValueError
        If a component on a non-blank line cannot be parsed as a float.
    """
    word_vectors = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # Skip blank/whitespace-only lines (e.g. trailing newlines at the
            # end of the file) and lines with a token but no components;
            # indexing values[0] on an empty split would raise IndexError
            if len(values) < 2:
                continue
            word_vectors[values[0]] = np.asarray(values[1:], dtype='float32')
    return word_vectors

# Load the 50-dimensional GloVe 6B embeddings into a word -> vector lookup
glove_vectors = load_glove_vectors(glove_file='glove.6B.50d.txt')


In [76]:
def weighted_average_vectors(docs, vectorizer, word_vectors, dim=50, fit=None):
    """Embed each document as the TF-IDF-weighted mean of its word vectors.

    Parameters
    ----------
    docs : sequence of str
        Documents whose words are separated by whitespace (a list or a pandas
        Series both work; only ``len`` and iteration are used).
    vectorizer : object
        TF-IDF vectorizer exposing ``fit_transform``, ``transform`` and, once
        fitted, a ``vocabulary_`` mapping of word -> column index.
    word_vectors : mapping
        Word -> embedding vector of length ``dim``.
    dim : int, default 50
        Dimensionality of the embeddings and of the returned rows.
    fit : bool or None, default None
        ``True`` fits the vectorizer on ``docs`` before transforming them;
        ``False`` only transforms with the already-fitted vectorizer.
        ``None`` fits only if the vectorizer has not been fitted yet (it has
        no ``vocabulary_`` attribute). This lets one vectorizer be fitted on
        the training documents and then reused unchanged for held-out
        documents, instead of being silently refitted on them.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(docs), dim)``. A row stays all zeros when none
        of the document's words is present in both the vectorizer vocabulary
        and ``word_vectors``.

    Notes
    -----
    A word that occurs several times in a document contributes once per
    occurrence, each time with that word's TF-IDF weight for the document.
    """
    if fit is None:
        # Fitting creates ``vocabulary_``; its absence means "not fitted yet"
        fit = not hasattr(vectorizer, 'vocabulary_')

    # Calculate the tf-idf weights for the given documents. Refitting on
    # held-out documents would leak their statistics and change the
    # vocabulary/IDF between training and evaluation, so only fit on request.
    if fit:
        tfidf_matrix = vectorizer.fit_transform(docs)
    else:
        tfidf_matrix = vectorizer.transform(docs)

    vocabulary = vectorizer.vocabulary_

    # Initialize an empty matrix to store the weighted average vectors
    weighted_vectors = np.zeros((len(docs), dim))

    # Iterate through the documents and compute the weighted average vector for each
    for i, doc in enumerate(docs):
        weighted_sum = np.zeros(dim)
        total_weight = 0.0

        for word in doc.split():
            column = vocabulary.get(word)
            # Only words known to both the vectorizer and the embeddings count
            if column is None or word not in word_vectors:
                continue
            weight = tfidf_matrix[i, column]
            weighted_sum += weight * word_vectors[word]
            total_weight += weight

        # Leave the row at zero rather than dividing by zero
        if total_weight != 0:
            weighted_vectors[i] = weighted_sum / total_weight

    return weighted_vectors


In [77]:
# Class labels as plain NumPy arrays, one entry per row of each split
y_train = train_data['label'].to_numpy()
y_test = test_data['label'].to_numpy()

# TF-IDF vocabulary is capped at 10,000 terms; the same vectorizer object is
# handed to both calls, training split first
vectorizer = TfidfVectorizer(max_features=10000)
X_train = weighted_average_vectors(
    docs=train_data['cleaned_title'],
    vectorizer=vectorizer,
    word_vectors=glove_vectors,
)
X_test = weighted_average_vectors(
    docs=test_data['cleaned_title'],
    vectorizer=vectorizer,
    word_vectors=glove_vectors,
)
