In [12]:
import numpy as pd
import pandas as pd
from sklearn import model_selection, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn import decomposition, ensemble
import textblob, string

In [13]:
df = pd.read_csv("./data/raw.csv")
df = df.set_index("id")

In [3]:
## Get the Independent Features
X = df.drop("label",axis=1)
X.head()

Unnamed: 0_level_0,title,author,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ..."
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...


In [4]:
## Get the Dependent features
y = df["label"]
y.head()

id
0    1
1    0
2    1
3    1
4    1
Name: label, dtype: int64

In [5]:
# drop null values
df = df.dropna()
df.isnull().sum()

title     0
author    0
text      0
label     0
dtype: int64

In [6]:
#creating a variable called length for the length of the text
length = []
[length.append(len(str(text))) for text in df['text']]
df['length'] = length
df.head()

#dropping length less than 50
df['text'][df['length'] < 50]

id
82                                                        
169                                                       
173                                        Guest   Guest  
295                                                       
470                                                       
                               ...                        
20264                                                     
20348    \n\nMindblowing Reason Elites Fear Donald Trum...
20418                                      Guest   Guest  
20431         \nOctober 28, 2016 The Mothers by stclair by
20513                                                     
Name: text, Length: 107, dtype: object

In [7]:
messages=df.copy()
messages.reset_index(inplace=True)
messages.head(10)
messages['title'][6]

'Benoît Hamon Wins French Socialist Party’s Presidential Nomination - The New York Times'

In [8]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    review = re.sub("[^a-zA-Z]", " ", messages["title"][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words("english")]
    review = " ".join(review)
    corpus.append(review)

# Bag of Words

In [9]:
#bag of words with ngram for more meaning from the document
cv = CountVectorizer(max_features=5000,ngram_range=(1,3))
X_bow = cv.fit_transform(corpus).toarray()
X_bow.shape
y = messages["label"]

In [10]:
## Divide the dataset into Train and Test
from sklearn.model_selection import train_test_split
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(X_bow, y, test_size=0.25, random_state=0)
cv.get_feature_names()[:20]

['abandon',
 'abc',
 'abc news',
 'abduct',
 'abe',
 'abedin',
 'abl',
 'abort',
 'abroad',
 'absolut',
 'abstain',
 'absurd',
 'abus',
 'abus new',
 'abus new york',
 'academi',
 'accept',
 'access',
 'access pipelin',
 'access pipelin protest']

In [11]:
## gives you the various parameters of count vectorizers
cv.get_params()
train_bow_df = pd.DataFrame(X_train_bow, columns=cv.get_feature_names())
train_bow_df.head()

Unnamed: 0,abandon,abc,abc news,abduct,abe,abedin,abl,abort,abroad,absolut,...,zero,zika,zika viru,zionist,zone,zone new,zone new york,zoo,zu,zuckerberg
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
test_bow_df = pd.DataFrame(X_test_bow, columns=cv.get_feature_names())
test_bow_df.head()

Unnamed: 0,abandon,abc,abc news,abduct,abe,abedin,abl,abort,abroad,absolut,...,zero,zika,zika viru,zionist,zone,zone new,zone new york,zoo,zu,zuckerberg
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# TFIDF

In [13]:
#using TFIDF which down weights unimportant words that appear with high frequency
tfidf_v = TfidfVectorizer(max_features=5000,ngram_range=(1,3))
X_tfid = tfidf_v.fit_transform(corpus).toarray()
y = messages["label"]

In [14]:
#splitting
X_train_tfid, X_test_tfid, y_train_tfid, y_test_tfid = train_test_split(X_tfid, y, test_size=0.25, random_state=0)
tfidf_v.get_feature_names()[:20]

['abandon',
 'abc',
 'abc news',
 'abduct',
 'abe',
 'abedin',
 'abl',
 'abort',
 'abroad',
 'absolut',
 'abstain',
 'absurd',
 'abus',
 'abus new',
 'abus new york',
 'academi',
 'accept',
 'access',
 'access pipelin',
 'access pipelin protest']

In [15]:
#visualise
train_tfidf_df = pd.DataFrame(X_train_tfid, columns=cv.get_feature_names())
train_tfidf_df.head()

Unnamed: 0,abandon,abc,abc news,abduct,abe,abedin,abl,abort,abroad,absolut,...,zero,zika,zika viru,zionist,zone,zone new,zone new york,zoo,zu,zuckerberg
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.31335,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
#visualise
test_tfidf_df = pd.DataFrame(X_test_tfid, columns=cv.get_feature_names())
test_tfidf_df.head()

Unnamed: 0,abandon,abc,abc news,abduct,abe,abedin,abl,abort,abroad,absolut,...,zero,zika,zika viru,zionist,zone,zone new,zone new york,zoo,zu,zuckerberg
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Word embeddings

In [17]:
#word embeddings are an improvement over more the traditional bag-of-word model encoding schemes where large sparse vectors were used to represent each word or to score each word within a vector to represent an entire vocabulary. These representations were sparse because the vocabularies were vast and a given word or document would be represented by a large vector comprised mostly of zero values.
#instead, in an embedding, words are represented by dense vectors where a vector represents the projection of the word into a continuous vector space.
#better fit for cnn otherwise input would be a 1x5000 matrix for bow
#this splits data into positive and negative, train and test files to be used in word embeddings later
df["data"] = 'title: ' + df['title'].astype(str) \
        + ' author: ' + df['author'].astype(str) \
        + ' text: ' + df['text'].astype(str) 
positive_examples = df.query('label==1').drop(['title', 'author', 'text', 'label'], axis=1)
from sklearn.model_selection import train_test_split
positive_examples_train, positive_examples_test = train_test_split(positive_examples, test_size=0.25)

In [18]:
negative_examples = df.query('label==0').drop(['title', 'author', 'text', 'label'], axis=1)
from sklearn.model_selection import train_test_split
negative_examples_train, negative_examples_test = train_test_split(negative_examples, test_size=0.25)

In [19]:
_, positive_examples_small = train_test_split(positive_examples_test, test_size=0.05)
_, negative_examples_small = train_test_split(negative_examples_test, test_size=0.05)

# Output

In [21]:
y_train_bow = y_train_bow.reset_index().drop("index", axis = 1)
train_bow_df["isFakeNews"] = y_train_bow
train_bow_df.to_csv("./data/train_bow.csv", index = False)

In [22]:
y_train_tfid = y_train_tfid.reset_index().drop("index", axis = 1)
train_tfidf_df["isFakeNews"] = y_train_tfid
train_tfidf_df.to_csv("./data/train_tfidf.csv", index = False)

In [23]:
y_test_bow = y_test_bow.reset_index().drop("index", axis = 1)
test_bow_df["isFakeNews"] = y_test_bow
test_bow_df.to_csv("./data/test_bow.csv", index = False)

In [None]:
positive_examples.to_csv("./data/positive_examples.csv", index = False)
negative_examples.to_csv("./data/negative_examples.csv", index = False)

In [None]:
train_tfidf_df.to_csv("./data/negative_examples.csv", index = False)

In [24]:
y_test_tfid = y_test_tfid.reset_index().drop("index", axis = 1)
test_tfidf_df["isFakeNews"] = y_test_tfid
test_tfidf_df.to_csv("./data/test_tfidf.csv", index = False)

In [20]:
positive_examples_train.to_csv("./data/positive_examples_train.txt", sep=',', encoding='utf-8', header=None, index=False)
positive_examples_test.to_csv("./data/positive_examples_test.txt", sep=',', encoding='utf-8', header=None, index=False)
negative_examples_train.to_csv("./data/negative_examples_train.txt", sep=',', encoding='utf-8', header=None, index=False)
negative_examples_test.to_csv("./data/negative_examples_test.txt", sep=',', encoding='utf-8', header=None, index=False)

In [21]:
positive_examples_small.to_csv("./data/positive_examples_small.txt", sep=',', encoding='utf-8', header=None, index=False)
negative_examples_small.to_csv("./data/negative_examples_small.txt", sep=',', encoding='utf-8', header=None, index=False)

Instructions for the modelling: we have 2 sets of training and validation data, 1 is count and the other is tfid, use the respective X and Y train/valid data sets.

I have chosen a 75/25 train test split according the size of the data and googling online. Change it if you find something more suitable.

Preprocessing summary: removed null values, removed blank/short meaningless text. Removed stop words and non-english characters. We end up with 2 set of vectors that we can work with.