In [None]:
import pandas as pd
import numpy as np
import nltk
import string
from gensim.models import Word2Vec

In [None]:
# Fetch the NLTK resources used later in this notebook:
#   punkt / punkt_tab -> tokenizer models behind nltk.tokenize.word_tokenize
#   stopwords         -> English stopword list
#   wordnet           -> dictionary used by WordNetLemmatizer
# Recent NLTK releases load 'punkt_tab' for word_tokenize while older ones use
# 'punkt'; requesting both keeps the notebook runnable on either. A package id
# that the installed index does not know is reported by nltk.download and does
# not abort the loop.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:
# Load the fake-news articles and tag every row with label 0.
fake_data = pd.read_csv('data/Fake.csv').assign(label=0)
fake_data.info()

In [None]:
# Load the real-news articles and tag every row with label 1.
true_data = pd.read_csv('data/True.csv').assign(label=1)
true_data.info()

In [None]:
# Missing values per column in the fake-news frame.
fake_data.isna().sum()

In [None]:
# Missing values per column in the real-news frame.
true_data.isna().sum()

In [None]:
# Number of fake-news articles per 'subject' value.
fake_data['subject'].value_counts()

In [None]:
# Number of real-news articles per 'subject' value.
true_data['subject'].value_counts()

In [None]:
# Stack the fake and real articles into one frame. Each input carries its own
# default index from read_csv, so ignore_index=True gives the combined frame a
# fresh, unique index instead of duplicated row labels.
merged_data = pd.concat([fake_data, true_data], ignore_index=True)

In [None]:
# The publication date is not used in the rest of the notebook; discard it.
merged_data = merged_data.drop(columns='date')

In [None]:
# Fold subject and title into the article body (space-separated) so a single
# 'text' column carries all textual information, then drop the two source columns.
merged_data = (
    merged_data
    .assign(text=lambda frame: frame['subject'] + ' ' + frame['title'] + ' ' + frame['text'])
    .drop(columns=['subject', 'title'])
)

In [None]:
# Shuffle the rows so fake and real articles are interleaved. A seeded
# generator makes the shuffle, and therefore every result derived from it,
# reproducible across kernel restarts.
SHUFFLE_SEED = 42
random_permutation = np.random.default_rng(SHUFFLE_SEED).permutation(len(merged_data))
merged_data = merged_data.iloc[random_permutation]

In [None]:
# Continue with the first 1000 shuffled rows only (presumably to keep the
# remaining cells fast; confirm before running on the full data set).
# .copy() detaches the sample from the shuffled frame, so the column
# assignments made on merged_data further down modify an independent DataFrame
# and do not raise pandas' SettingWithCopyWarning.
merged_data = merged_data.head(1000).copy()

In [None]:
# Column names, dtypes and non-null counts of the sampled frame.
merged_data.info()

In [None]:
# Class balance of the sample: number of rows per label (0 = fake, 1 = real).
merged_data['label'].value_counts()

In [None]:
# English stopword list; tokens found in it are removed during preprocessing.
stopwords = nltk.corpus.stopwords.words('english')
# WordNet-based lemmatizer that reduces each remaining token to its lemma.
lemmatizer = nltk.stem.WordNetLemmatizer()

In [None]:
# Translation table that deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Cache for the stopword lookup set; rebuilt whenever the global `stopwords`
# list is rebound to a different object.
_stopword_cache = {'source': None, 'lookup': frozenset()}


def _stopword_lookup():
    """Return the stopwords as a set that also holds punctuation-stripped forms."""
    if _stopword_cache['source'] is not stopwords:
        stripped = (word.translate(_PUNCT_TABLE) for word in stopwords)
        _stopword_cache['lookup'] = frozenset(stopwords).union(stripped)
        _stopword_cache['source'] = stopwords
    return _stopword_cache['lookup']


def preprocess_text(text):
    """Turn raw article text into a list of cleaned, lemmatized tokens.

    Steps: delete ASCII punctuation, lowercase, tokenize with NLTK, drop
    stopwords, lemmatize the remaining tokens with the global ``lemmatizer``.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            the NaN pandas uses for a missing cell) is treated as an empty
            document.

    Returns:
        list[str]: The surviving tokens in their original order; empty when
        ``text`` is not a string or nothing remains after filtering.
    """
    if not isinstance(text, str):
        # A missing cell has no tokens; returning [] avoids an AttributeError
        # on .translate and lets callers handle the empty document.
        return []

    cleaned = text.translate(_PUNCT_TABLE).lower()
    tokens = nltk.tokenize.word_tokenize(cleaned)

    # Punctuation is already gone from the tokens, so contractions in the
    # stopword list (e.g. "don't") can only match in their stripped form
    # ("dont"); the lookup set contains both spellings. A set also makes each
    # membership test O(1) instead of scanning the list.
    stop_lookup = _stopword_lookup()
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stop_lookup]
    

In [None]:
# Replace each raw article string with its list of preprocessed tokens.
merged_data['text'] = merged_data['text'].map(preprocess_text)

In [None]:
# Train word embeddings on the tokenized articles: one token list per document,
# 100-dimensional vectors, 8 worker threads.
w2v_text = Word2Vec(
    sentences=merged_data['text'],
    vector_size=100,
    workers=8,
)

In [None]:
def vectorize(words, model):
    """Average the word vectors of ``words`` into a single document vector.

    Args:
        words: Iterable of tokens.
        model: Object exposing a ``wv`` attribute that supports ``token in wv``,
            ``wv[token]`` and ``wv.vector_size`` (for example a trained gensim
            ``Word2Vec`` model).

    Returns:
        numpy.ndarray: 1-D array of length ``model.wv.vector_size`` holding the
        element-wise mean of the vectors of all tokens known to ``model.wv``,
        or an all-zero vector when none of the tokens is known.
    """
    keyed_vectors = model.wv
    known_vecs = [keyed_vectors[word] for word in words if word in keyed_vectors]
    if not known_vecs:
        # Take the fallback size from the model instead of a literal so it
        # always matches the embedding dimensionality the model was trained with.
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(known_vecs, axis=0)

In [None]:
# Collapse each token list into one averaged embedding vector from w2v_text.
merged_data['text'] = merged_data['text'].apply(vectorize, args=(w2v_text,))

In [None]:
# Preview the first entries of 'text', which now holds one document vector per article.
merged_data['text'].head()

In [None]:
# Preview the first rows of the processed frame (document vector plus label).
merged_data.head()