In [1]:
import pandas as pd
import numpy as np
import nltk
import string
from gensim.models import Word2Vec

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/pepino/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/pepino/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/pepino/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
fake_data = pd.read_csv('data/Fake.csv')
fake_data['label'] = 0
fake_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
 4   label    23481 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 917.4+ KB


In [4]:
fake_data['label'].value_counts()

label
0    23481
Name: count, dtype: int64

In [5]:
true_data = pd.read_csv('data/True.csv')
true_data['label'] = 1
true_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
 4   label    21417 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 836.7+ KB


In [6]:
true_data.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [7]:
fake_data.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [8]:
fake_data['subject'].value_counts()

subject
News               9050
politics           6841
left-news          4459
Government News    1570
US_News             783
Middle-east         778
Name: count, dtype: int64

In [9]:
merged_data = pd.concat((fake_data, true_data))

In [10]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44898 entries, 0 to 21416
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 2.1+ MB


In [11]:
merged_data['label'].value_counts()

label
0    23481
1    21417
Name: count, dtype: int64

In [12]:
text_data = merged_data['text']

In [13]:
lemmatizer = nltk.stem.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')

In [14]:
def preprocess_text(text):
    ret_text = text.translate(text.maketrans({x:'' for x in string.punctuation})).lower()
    ret_text = nltk.tokenize.word_tokenize(ret_text)
    
    ret_text = [word for word in ret_text if word not in stopwords]
    
    ret_text = [lemmatizer.lemmatize(word) for word in ret_text]
    return  ret_text
    

In [15]:
text_data = text_data.apply(preprocess_text)

In [16]:
w2v_model = Word2Vec(text_data, workers=8)

In [17]:
def vectorize(words):
    words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if len(words_vecs) == 0:
        return np.zeros(100)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

In [18]:
text_data = text_data.apply(vectorize)

In [19]:
text_data.head()

0    [0.00069583696, 0.26985732, 0.30838522, 0.2379...
1    [-0.22554708, 0.37048438, 0.45297512, -0.35747...
2    [-0.27931008, 0.23285268, 0.08940452, 0.102074...
3    [0.027688442, 0.33475715, 0.16851616, 0.062404...
4    [-0.7529672, -0.09911489, -0.15522246, 0.09069...
Name: text, dtype: object

In [20]:
merged_data['text'] = text_data

In [21]:
merged_data.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,"[0.00069583696, 0.26985732, 0.30838522, 0.2379...",News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,"[-0.22554708, 0.37048438, 0.45297512, -0.35747...",News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"[-0.27931008, 0.23285268, 0.08940452, 0.102074...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"[0.027688442, 0.33475715, 0.16851616, 0.062404...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,"[-0.7529672, -0.09911489, -0.15522246, 0.09069...",News,"December 25, 2017",0


In [22]:
merged_data['title'] = merged_data['title'].apply(preprocess_text)

In [23]:
merged_data.head()

Unnamed: 0,title,text,subject,date,label
0,"[donald, trump, sends, embarrassing, new, year...","[0.00069583696, 0.26985732, 0.30838522, 0.2379...",News,"December 31, 2017",0
1,"[drunk, bragging, trump, staffer, started, rus...","[-0.22554708, 0.37048438, 0.45297512, -0.35747...",News,"December 31, 2017",0
2,"[sheriff, david, clarke, becomes, internet, jo...","[-0.27931008, 0.23285268, 0.08940452, 0.102074...",News,"December 30, 2017",0
3,"[trump, obsessed, even, obama, ’, name, coded,...","[0.027688442, 0.33475715, 0.16851616, 0.062404...",News,"December 29, 2017",0
4,"[pope, francis, called, donald, trump, christm...","[-0.7529672, -0.09911489, -0.15522246, 0.09069...",News,"December 25, 2017",0


In [24]:
w2v_title = Word2Vec(merged_data['title'], workers=8)

In [25]:
def vectorize_title(words):
    words_vecs = [w2v_title.wv[word] for word in words if word in w2v_title.wv]
    if len(words_vecs) == 0:
        return np.zeros(100)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

In [26]:
merged_data['title'] = merged_data['title'].apply(vectorize_title)

In [27]:
one_hot_subject = pd.get_dummies(merged_data['subject'], dtype=int)
merged_data = pd.concat([merged_data, one_hot_subject], axis=1)
merged_data = merged_data.drop('subject', axis=1)
merged_data.head()

Unnamed: 0,title,text,date,label,Government News,Middle-east,News,US_News,left-news,politics,politicsNews,worldnews
0,"[-0.3422175, -0.0039491192, 0.27373362, 0.5655...","[0.00069583696, 0.26985732, 0.30838522, 0.2379...","December 31, 2017",0,0,0,1,0,0,0,0,0
1,"[-0.17137605, 0.09518932, 0.1198794, 0.4953371...","[-0.22554708, 0.37048438, 0.45297512, -0.35747...","December 31, 2017",0,0,0,1,0,0,0,0,0
2,"[-0.1602251, 0.02819395, 0.17087828, 0.4432479...","[-0.27931008, 0.23285268, 0.08940452, 0.102074...","December 30, 2017",0,0,0,1,0,0,0,0,0
3,"[-0.2713425, -0.058272712, 0.378775, 0.7219259...","[0.027688442, 0.33475715, 0.16851616, 0.062404...","December 29, 2017",0,0,0,1,0,0,0,0,0
4,"[-0.26579103, -0.12048226, 0.4016078, 0.688106...","[-0.7529672, -0.09911489, -0.15522246, 0.09069...","December 25, 2017",0,0,0,1,0,0,0,0,0
