In [13]:
import pandas as pd
import numpy as np
import nltk
import string
from gensim.models import Word2Vec

In [4]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/pepino/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/pepino/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/pepino/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [22]:
fake_data = pd.read_csv('data/Fake.csv')
fake_data['label'] = 0
fake_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
 4   label    23481 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 917.4+ KB


In [28]:
fake_data['label'].value_counts()

label
0    23481
Name: count, dtype: int64

In [23]:
true_data = pd.read_csv('data/True.csv')
true_data['label'] = 1
true_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
 4   label    21417 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 836.7+ KB


In [18]:
true_data.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [8]:
fake_data.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [10]:
fake_data['subject'].value_counts()

subject
News               9050
politics           6841
left-news          4459
Government News    1570
US_News             783
Middle-east         778
Name: count, dtype: int64

In [32]:
merged_data = pd.concat((fake_data, true_data))

In [33]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44898 entries, 0 to 21416
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 2.1+ MB


In [34]:
merged_data['label'].value_counts()

label
0    23481
1    21417
Name: count, dtype: int64

In [35]:
text_data = merged_data['text']

In [36]:
lemmatizer = nltk.stem.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')

In [37]:
def preprocess_text(text):
    ret_text = text.translate(text.maketrans({x:'' for x in string.punctuation})).lower()
    ret_text = nltk.tokenize.word_tokenize(ret_text)
    
    ret_text = [word for word in ret_text if word not in stopwords]
    
    ret_text = [lemmatizer.lemmatize(word) for word in ret_text]
    return  ret_text
    

In [44]:
text_data = text_data.apply(preprocess_text)

In [45]:
w2v_model = Word2Vec(text_data, workers=8)

In [46]:
def vectorize(words):
    words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if len(words_vecs) == 0:
        return np.zeros(100)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

In [47]:
text_data = text_data.apply(vectorize)

In [48]:
text_data.head()

0    [-0.1945455, 0.49657086, 0.45725614, 0.0299222...
1    [-0.18309607, 0.2572843, 0.3594153, -0.3372905...
2    [-0.087496184, 0.19574566, 0.19793516, 0.05111...
3    [0.12665501, 0.29168501, 0.28672382, 0.0294175...
4    [-0.546719, -0.15392952, -0.16862026, 0.208190...
Name: text, dtype: object

In [49]:
merged_data['text'] = text_data

In [50]:
merged_data.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,"[-0.1945455, 0.49657086, 0.45725614, 0.0299222...",News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,"[-0.18309607, 0.2572843, 0.3594153, -0.3372905...",News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"[-0.087496184, 0.19574566, 0.19793516, 0.05111...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"[0.12665501, 0.29168501, 0.28672382, 0.0294175...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,"[-0.546719, -0.15392952, -0.16862026, 0.208190...",News,"December 25, 2017",0


In [51]:
merged_data['title'] = merged_data['title'].apply(preprocess_text)

In [53]:
merged_data.head()

Unnamed: 0,title,text,subject,date,label
0,"[donald, trump, sends, embarrassing, new, year...","[-0.1945455, 0.49657086, 0.45725614, 0.0299222...",News,"December 31, 2017",0
1,"[drunk, bragging, trump, staffer, started, rus...","[-0.18309607, 0.2572843, 0.3594153, -0.3372905...",News,"December 31, 2017",0
2,"[sheriff, david, clarke, becomes, internet, jo...","[-0.087496184, 0.19574566, 0.19793516, 0.05111...",News,"December 30, 2017",0
3,"[trump, obsessed, even, obama, ’, name, coded,...","[0.12665501, 0.29168501, 0.28672382, 0.0294175...",News,"December 29, 2017",0
4,"[pope, francis, called, donald, trump, christm...","[-0.546719, -0.15392952, -0.16862026, 0.208190...",News,"December 25, 2017",0


In [54]:
w2v_title = Word2Vec(merged_data['title'], workers=8)

In [55]:
def vectorize_title(words):
    words_vecs = [w2v_title.wv[word] for word in words if word in w2v_title.wv]
    if len(words_vecs) == 0:
        return np.zeros(100)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

In [56]:
merged_data['title'] = merged_data['title'].apply(vectorize_title)

In [57]:
merged_data.head()

Unnamed: 0,title,text,subject,date,label
0,"[-0.42373195, -0.109749556, 0.25004047, 0.4188...","[-0.1945455, 0.49657086, 0.45725614, 0.0299222...",News,"December 31, 2017",0
1,"[-0.15220425, 0.024585318, 0.07699719, 0.36577...","[-0.18309607, 0.2572843, 0.3594153, -0.3372905...",News,"December 31, 2017",0
2,"[-0.26125762, -0.018562233, 0.23732944, 0.3288...","[-0.087496184, 0.19574566, 0.19793516, 0.05111...",News,"December 30, 2017",0
3,"[-0.34957463, -0.17358625, 0.37578568, 0.54758...","[0.12665501, 0.29168501, 0.28672382, 0.0294175...",News,"December 29, 2017",0
4,"[-0.40124622, -0.19426323, 0.33676836, 0.53367...","[-0.546719, -0.15392952, -0.16862026, 0.208190...",News,"December 25, 2017",0
