In [1]:
import re
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the Dataset

train_stances = pd.read_csv("https://raw.githubusercontent.com/mightyTathagata/fake_news_project/main/fnc_dataset/train_stances.csv") #headlines
train_bodies = pd.read_csv("https://raw.githubusercontent.com/mightyTathagata/fake_news_project/main/fnc_dataset/train_bodies.csv") #body
test_stances = pd.read_csv("https://raw.githubusercontent.com/mightyTathagata/fake_news_project/main/fnc_dataset/test_stances.csv")
test_bodies = pd.read_csv("https://raw.githubusercontent.com/mightyTathagata/fake_news_project/main/fnc_dataset/test_bodies.csv")

# Inner Join on the Body ID
merged_train_data = pd.merge(train_stances, train_bodies, on="Body ID")
merged_test_data = pd.merge(test_stances, test_bodies, on="Body ID")

# Shuffle the Dataset
merged_train_data = merged_train_data.sample(frac=0.3)
merged_test_data = merged_test_data.sample(frac=0.3)

In [3]:
# Feature Label Split

y_train = merged_train_data['Stance']
x_train = merged_train_data.drop('Stance', axis=1)
y_test = merged_test_data['Stance']
x_test = merged_test_data.drop('Stance', axis=1)


In [4]:
# Train Test Split

#x_train = x[:int(0.8 * len(x))]
#x_test = x[int(0.8 * len(x)):]
#y_train =  y[:int(0.8 * len(x))]
#y_test = y[int(0.8 * len(x)):]

print(len(x_train), len(y_train))
print(len(x_test), len(y_test))

14992 14992
7624 7624


In [5]:
# Declaring the function
def process_text(text):

    # Lowercasing
    text = text.lower()

    # Remove some punctuations
    text = re.sub(r"[!?,'\"*)@#%(&$_.^-]", '', text)

    # Splitting on spaces
    text = text.split(' ')

    # Stemming and removing spaces
    stemmer_ps = nltk.stem.PorterStemmer()  
    text = [stemmer_ps.stem(word) for word in text if len(word)]

    return " ".join(text)


In [6]:
# Transforming Headlines
x_train['Headline'] = x_train['Headline'].apply(process_text)
x_test['Headline'] = x_test['Headline'].apply(process_text)

# Transforming Body
x_train['articleBody'] = x_train['articleBody'].apply(process_text)
x_test['articleBody'] = x_test['articleBody'].apply(process_text)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

tfv_headline = TfidfVectorizer(max_features=2500)

tfv_headline.fit((x_train['Headline']))
xtrain_tfv_headline =  tfv_headline.transform(x_train['Headline'])


xtest_tfv_headline =  tfv_headline.transform(x_test['Headline'])

tfv_body = TfidfVectorizer(max_features=2500)

tfv_body.fit((x_train['articleBody']))
xtrain_tfv_body =  tfv_body.transform(x_train['articleBody'])

xtest_tfv_body =  tfv_body.transform(x_test['articleBody'])



In [9]:
print(xtrain_tfv_headline.shape)
print(xtest_tfv_headline.shape)
print(xtrain_tfv_body.shape)
print(xtest_tfv_body.shape)

(14992, 2500)
(7624, 2500)
(14992, 2500)
(7624, 2500)


In [11]:
from scipy.sparse import hstack
xtrain_tfv = hstack([xtrain_tfv_headline, xtrain_tfv_body]).toarray()

xtest_tfv = hstack([xtest_tfv_headline, xtest_tfv_body]).toarray()


In [12]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(xtrain_tfv, y_train)
predictions = clf.predict(xtest_tfv)



In [13]:
from sklearn.metrics import f1_score
f1_score(y_test, predictions, average='weighted')

0.6104640322636589

In [14]:
from sklearn.svm import SVC
from sklearn import preprocessing, decomposition

svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xtest_svd = svd.transform(xtest_tfv)

scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_scl = scl.transform(xtrain_svd)
xtest_scl = scl.transform(xtest_svd)


clf = SVC(C=1.0, probability=True)
clf.fit(xtrain_scl, y_train)
predictions_svm = clf.predict(xtest_scl)

f1_score(y_test, predictions_svm, average='weighted')

0.6723799876315201

In [16]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators = 10, random_state = 42)
clf.fit(xtrain_tfv, y_train)
predictions_rf = clf.predict(xtest_tfv)


In [17]:
f1_score(y_test, predictions_rf, average='weighted')

0.6084206858799869