In [1]:
import re
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings('ignore')

In [4]:
# Load the Dataset

headlines = pd.read_csv("../fake_news_project/fnc_dataset/train_stances.csv")
body = pd.read_csv("../fake_news_project/fnc_dataset/train_bodies.csv")

# Inner Join on the Body ID
merged_raw_data = pd.merge(headlines, body, on="Body ID")

# Shuffle the Dataset
merged_raw_data = merged_raw_data.sample(frac=0.1)

In [5]:
# Feature Label Split

y = merged_raw_data['Stance']
x = merged_raw_data.drop('Stance', axis=1)

In [6]:
# Train Test Split

x_train = x[:int(0.8 * len(x))]
x_test = x[int(0.8 * len(x)):]
y_train =  y[:int(0.8 * len(x))]
y_test = y[int(0.8 * len(x)):]

print(len(x_train), len(y_train))
print(len(x_test), len(y_test))

3997 3997
1000 1000


In [7]:
# Declaring the function
def process_text(text):

    # Lowercasing
    text = text.lower()

    # Remove some punctuations
    text = re.sub(r"[!?,'\"*)@#%(&$_.^-]", '', text)

    # Splitting on spaces
    text = text.split(' ')

    # Stemming and removing spaces
    stemmer_ps = nltk.stem.PorterStemmer()  
    text = [stemmer_ps.stem(word) for word in text if len(word)]

    return text

# Transforming Headlines
x_train['Headline'] = x_train['Headline'].apply(process_text)
x_test['Headline'] = x_test['Headline'].apply(process_text)

# Transforming Body
x_train['articleBody'] = x_train['articleBody'].apply(process_text)
x_test['articleBody'] = x_test['articleBody'].apply(process_text)

In [8]:
# Convert to Numpy Arrays
x_train = x_train.to_numpy()
x_test = x_test.to_numpy()

y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [11]:
x_train

array([[list(['christian', 'bale', 'in', 'talk', 'to', 'play', 'steve', 'job', 'exclus']),
        1368,
        list(['jeni', 'chang', 'for', 'buzzfe', '/', 'via', 'drug', 'polici', 'allianc', 'data\n\nnew', 'york', 'citi', '—', 'the', 'new', 'york', 'polic', 'depart', 'will', 'no', 'longer', 'arrest', 'peopl', 'for', 'lowlevel', 'marijuana', 'possess', 'accord', 'to', 'peopl', 'with', 'knowledg', 'of', 'the', 'city’', 'drug', 'strategi', 'who', 'spoke', 'on', 'condit', 'of', 'anonymity\n\nth', 'nypd', 'will', 'issu', 'violat', 'summons', 'to', 'peopl', 'caught', 'with', 'marijuana', '—', 'instead', 'of', 'put', 'them', 'in', 'handcuff', 'and', 'take', 'them', 'to', 'a', 'precinct', 'the', 'summons', 'will', 'requir', 'peopl', 'to', 'appear', 'in', 'court', 'at', 'a', 'later', 'date', 'and', 'pay', 'a', 'fine\n\nfew', 'detail', 'on', 'the', 'new', 'polici', 'were', 'immedi', 'avail', 'but', 'an', 'offici', 'with', 'one', 'of', 'the', 'city’', 'five', 'district', 'attorney', 'offic', '

In [None]:
lr_model = LogisticRegression()


In [None]:
#using Tf-idf application from kaggle

tfv = TfidfVectorizer(
    min_df=3,  
    max_features=None, 
    strip_accents='unicode', 
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3), 
    use_idf=1,
    smooth_idf=1,
    sublinear_tf=1,
    stop_words = 'english'
)

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)