In [25]:
import numpy as np
import pandas as pd

### Data loading & preprocessing

In [26]:
fake_news = pd.read_csv('./data/Fake.csv')
true_news = pd.read_csv('./data/True.csv')

fake_news.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [102]:
# label dataset
fake_news['label'] = 0
true_news['label'] = 1

# concat two dataset
df = pd.concat([fake_news,true_news])

# drop 'subject' column
df.drop(['subject'], axis=1)

df.shape
df.reset_index(inplace=True)

In [103]:
df.head()

Unnamed: 0,index,title,text,subject,date,label
0,0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [104]:
import nltk
import re
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /Users/nayem/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/nayem/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [105]:
# list of stopwords for english
stopwords = nltk.corpus.stopwords.words("english")

In [106]:
from nltk.tokenize import word_tokenize
ps = nltk.stem.porter.PorterStemmer()
lem = nltk.stem.wordnet.WordNetLemmatizer()

def cleaning_and_processing_text(txt):
    
    # lower cased the text
    text = re.sub('[^a-zA-Z]',' ',txt)
    text = text.lower()
    
    # remove stopwords, Stemming (remove -ing, -ly, ...) and Lemmatisation
    text = text.split()
    clean_text = [lem.lemmatize(word) for word in text if word not in stopwords]
    
    text = " ".join(clean_text)
    
    return text

In [107]:
df['clean_text'] = df['text'].apply(lambda x: cleaning_and_processing_text(x))

In [108]:
df.head()

Unnamed: 0,index,title,text,subject,date,label,clean_text
0,0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0,donald trump wish american happy new year leav...
1,1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0,house intelligence committee chairman devin nu...
2,2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0,friday revealed former milwaukee sheriff david...
3,3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0,christmas day donald trump announced would bac...
4,4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0,pope francis used annual christmas day message...


In [158]:
# sparse feature matrix from text using TfidfVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words='english') # params: max_features=500 can be used
vector_df = vectorizer.fit_transform(df['clean_text'])

In [159]:
vector_df.shape

(44898, 106802)

In [160]:
#print(vectorizer.get_feature_names())

In [168]:
# important features using TruncatedSVD

from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=30, n_iter=7)
X = svd.fit_transform(vector_df)

In [169]:
X.shape

(44898, 30)

In [170]:
X

array([[ 0.17855113, -0.07576648,  0.04626329, ...,  0.02306624,
        -0.00756762,  0.02251456],
       [ 0.19371667, -0.06702891,  0.06899161, ..., -0.0093607 ,
        -0.00594185, -0.02933295],
       [ 0.10170041, -0.03715734,  0.01019687, ...,  0.04489363,
         0.03153795,  0.03349172],
       ...,
       [ 0.09119573,  0.04771724, -0.032919  , ...,  0.00766565,
        -0.01197312,  0.00384044],
       [ 0.11200431,  0.08713839,  0.03691338, ...,  0.01114025,
         0.00346946,  0.03648835],
       [ 0.10890611,  0.11157611,  0.00343885, ...,  0.03327088,
        -0.03055892,  0.01073539]])

### Model

### Training Model