In [13]:
import pandas as pd
import numpy as np
import nltk
import string
from gensim.models import Word2Vec

In [4]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/pepino/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/pepino/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/pepino/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
fake_data = pd.read_csv('data/Fake.csv')
fake_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB


In [8]:
fake_data.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [10]:
fake_data['subject'].value_counts()

subject
News               9050
politics           6841
left-news          4459
Government News    1570
US_News             783
Middle-east         778
Name: count, dtype: int64

In [6]:
text_data = fake_data['text']

In [16]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
lemmatizer = nltk.stem.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')

In [10]:
def preprocess_text(text):
    ret_text = text.translate(text.maketrans({x:'' for x in string.punctuation})).lower()
    ret_text = nltk.tokenize.word_tokenize(ret_text)
    
    ret_text = [word for word in ret_text if word not in stopwords]
    
    ret_text = [lemmatizer.lemmatize(word) for word in ret_text]
    return  ret_text
    

In [11]:
text_data.apply(preprocess_text)

0        [donald, trump, wish, american, happy, new, ye...
1        [house, intelligence, committee, chairman, dev...
2        [friday, revealed, former, milwaukee, sheriff,...
3        [christmas, day, donald, trump, announced, wou...
4        [pope, francis, used, annual, christmas, day, ...
                               ...                        
23476    [21st, century, wire, say, 21wire, reported, e...
23477    [21st, century, wire, say, familiar, theme, wh...
23478    [patrick, henningsen, 21st, century, wireremem...
23479    [21st, century, wire, say, al, jazeera, americ...
23480    [21st, century, wire, say, 21wire, predicted, ...
Name: text, Length: 23481, dtype: object

In [12]:
w2v_model = Word2Vec(text_data, workers=8)

In [14]:
def vectorize(words):
    words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if len(words_vecs) == 0:
        return np.zeros(100)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

In [15]:
vectorized_text = np.array([vectorize(text) for text in text_data])

In [16]:
vectorized_text[0]

array([-0.20599341, -0.18279746,  0.06426024,  0.02924897,  0.01051747,
        0.07881936, -0.00945119, -0.08691448, -0.11760957,  0.13824245,
        0.337986  , -0.25673094,  0.08301435,  0.35641083, -0.02837672,
       -0.16140357, -0.09759199,  0.08263341, -0.2565607 ,  0.15675977,
       -0.20730709, -0.07702197, -0.03360847,  0.06426428,  0.04803052,
        0.04713533, -0.01464768, -0.18048653, -0.06347931, -0.02550424,
       -0.27732596, -0.28717333,  0.10904673, -0.03640343, -0.16679528,
        0.09269678, -0.20375031,  0.298335  ,  0.02986906,  0.10674539,
        0.17615448,  0.01299509,  0.14655872, -0.01648579, -0.16239044,
       -0.13980223, -0.02039683,  0.05612905,  0.30446732, -0.04834163,
       -0.02777252, -0.02853289, -0.13975878,  0.2670723 , -0.19308819,
       -0.18816082,  0.3539325 , -0.09065714, -0.11162925, -0.09930728,
        0.02661319,  0.17520085,  0.21521918, -0.23841184,  0.05120436,
       -0.12134799, -0.03087408, -0.12464127,  0.12926085, -0.06