In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import string

%matplotlib inline
plt.rcParams['figure.figsize'] = (5.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

### Data loading & preprocessing

In [2]:
fake_news = pd.read_csv('./data/Fake.csv')
true_news = pd.read_csv('./data/True.csv')

fake_news.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [3]:
# label dataset
fake_news['label'] = 0
true_news['label'] = 1

# concat two dataset
df = pd.concat([fake_news,true_news], axis=0)

# drop 'subject' column
df.drop('subject', axis=1, inplace=True)

df = df.sample(frac=1).reset_index(drop=True)

df['text'] = df['title'] + ' ' + df['text']

In [4]:
# df.shape
df.head()

Unnamed: 0,title,text,date,label
0,Turkish foreign minister says joint operation ...,Turkish foreign minister says joint operation ...,"September 26, 2017",1
1,Bush Once Apologized To China In Order To Fre...,Bush Once Apologized To China In Order To Fre...,"January 15, 2016",0
2,LOL! NEW VIDEO Emerges Of Central Park Trump A...,LOL! NEW VIDEO Emerges Of Central Park Trump A...,"Jul 6, 2017",0
3,Trump says Mexican imports tax one option but ...,Trump says Mexican imports tax one option but ...,"January 27, 2017",1
4,Madeleine Albright: Trump Needs To Stay The F...,Madeleine Albright: Trump Needs To Stay The F...,"May 2, 2017",0


In [5]:
import nltk
import re
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Home\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Home\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# list of stopwords for english
stopwords = nltk.corpus.stopwords.words("english")

In [7]:
ps = nltk.stem.porter.PorterStemmer()
lem = nltk.stem.wordnet.WordNetLemmatizer()

def delete_punctuation(text):
    chars = []
    for char in text:
        if char not in string.punctuation:
            chars.append(char)
        else:
            chars.append(' ')
    return ''.join(chars)




def cleaning_and_processing_text(txt):
    
    # lower cased the text
    text = re.sub('[^a-zA-Z]',' ',txt)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('\\W', ' ', text)
    text = re.sub('\n', '', text)
    text = re.sub(' +', ' ', text)
    text = re.sub(' $', '', text)
    text = delete_punctuation(text)
    text = text.lower()
    
    # remove stopwords, Stemming (remove -ing, -ly, ...) and Lemmatisation
    text = text.split()
    clean_text = [lem.lemmatize(word) for word in text if word not in stopwords]
    clean_text = [ps.stem(word) for word in clean_text]
    
    text = " ".join(clean_text)
    
    return text

In [8]:
df['clean_text'] = df['text'].apply(lambda x: cleaning_and_processing_text(x))

In [9]:
df.head()

Unnamed: 0,title,text,date,label,clean_text
0,Turkish foreign minister says joint operation ...,Turkish foreign minister says joint operation ...,"September 26, 2017",1,turkish foreign minist say joint oper iraq tab...
1,Bush Once Apologized To China In Order To Fre...,Bush Once Apologized To China In Order To Fre...,"January 15, 2016",0,bush apolog china order free detain u soldier ...
2,LOL! NEW VIDEO Emerges Of Central Park Trump A...,LOL! NEW VIDEO Emerges Of Central Park Trump A...,"Jul 6, 2017",0,lol new video emerg central park trump assassi...
3,Trump says Mexican imports tax one option but ...,Trump says Mexican imports tax one option but ...,"January 27, 2017",1,trump say mexican import tax one option other ...
4,Madeleine Albright: Trump Needs To Stay The F...,Madeleine Albright: Trump Needs To Stay The F...,"May 2, 2017",0,madelein albright trump need stay f ck away ki...


In [10]:
df['clean_text'][:1].values

array(['turkish foreign minist say joint oper iraq tabl referendum ankara reuter turkey ass request made iraqi central govern wake iraqi kurdish independ referendum includ joint oper iraq turkish foreign minist mevlut cavusoglu said tuesday evalu request iraq everyth includ joint oper tabl said interview broadcast kanal ad reason turkey close habur border gate iraq'],
      dtype=object)

In [11]:
# sparse feature matrix from text using TfidfVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words='english') # params: max_features=500 can be used
vector_df = vectorizer.fit_transform(df['clean_text'])

In [12]:
vector_df.shape

(44898, 89469)

In [13]:
# print(vectorizer.get_feature_names())

In [14]:
# important features using TruncatedSVD

from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=1000, n_iter=7)
truncated_x = pd.DataFrame(svd.fit_transform(vector_df))

In [15]:
truncated_x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.091031,0.132482,-0.029735,-0.072650,0.037826,-0.069727,0.093701,0.017928,-0.107694,0.011955,...,0.022111,0.017383,-0.009884,0.009920,0.021859,0.015478,-0.002446,0.015010,-0.009024,-0.011870
1,0.209207,0.085917,0.021894,-0.049114,-0.055749,0.036453,-0.027331,0.004205,-0.025145,0.024601,...,-0.002890,-0.004037,-0.018505,-0.020299,-0.003295,0.003077,-0.015787,0.002142,0.010265,-0.014022
2,0.165406,-0.095752,0.082599,-0.041463,-0.061021,-0.035573,-0.032436,0.027905,0.017745,-0.030353,...,0.004260,-0.010542,0.006757,-0.003340,0.005709,0.022158,-0.003485,-0.015867,-0.001940,-0.010859
3,0.270206,-0.047351,0.077081,0.097433,-0.099924,-0.100079,0.074297,0.096302,0.028359,0.109901,...,-0.014890,-0.001317,-0.003443,-0.033675,0.012974,-0.008207,0.046135,0.004722,-0.016815,-0.028489
4,0.212460,0.012887,0.157728,0.033680,-0.083061,0.027632,-0.012635,-0.003381,0.006598,-0.010978,...,0.014134,0.014665,-0.003674,0.008843,0.000285,0.011260,0.009747,0.006747,0.007141,-0.014397
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44893,0.073633,0.040172,0.005879,-0.019897,0.039950,-0.015253,-0.011610,0.011194,-0.013124,0.032894,...,-0.003643,0.003609,-0.006583,0.020252,0.019157,-0.016508,-0.015691,-0.002139,-0.004252,-0.005073
44894,0.153520,-0.076832,-0.015291,-0.073766,0.311429,0.156731,-0.077113,0.080659,0.108633,0.086829,...,0.003137,-0.015648,0.006894,-0.002009,0.021484,-0.004356,-0.015263,0.000187,-0.022967,0.019397
44895,0.368518,-0.234429,0.069732,0.008039,0.051450,0.191439,0.210409,0.022763,-0.085624,0.112158,...,-0.004880,-0.001634,-0.021154,-0.001816,-0.010452,0.000137,0.015298,0.009101,0.002958,0.011659
44896,0.162447,-0.016473,-0.042122,-0.045544,-0.060031,0.031685,-0.035133,0.013472,-0.003036,-0.046678,...,0.012457,0.001782,0.009167,0.001312,0.002301,0.009072,0.009053,0.012590,-0.010859,-0.022344


In [324]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(truncated_x, df['label'], test_size=0.20, random_state=10)

# display(X_train.head())
# print('\n')
# display(y_train.head())
y_train = y_train.values.reshape([y_train.shape[0], -1]).T
# X_train_flatten = X_train.values.reshape([X_train.shape[0], -1]).T
# X_train = X_train_flatten/255
# y_test = pd.DataFrame(y_test.values.reshape(y_test.shape[0], -1).T)
print("\ny_train shape {}. ".format(y_train.shape))
print("\ny_train type {}. ".format(type(y_train)))
print("\ny_train dimension {}. ".format(y_train.ndim))
print("\nX_train shape {}. ".format(X_train.shape))
print("\nX_train type {}. ".format(type(X_train)))
print("\nX_train dimension {}. ".format(X_train.ndim))

# print("\nThere are {} documents in the training data.".format(len(X_train)))
# print("\nThere are {} documents in the test data.".format(len(X_test)))


y_train shape (1, 35918). 

y_train type <class 'numpy.ndarray'>. 

y_train dimension 2. 

X_train shape (35918, 1000). 

X_train type <class 'pandas.core.frame.DataFrame'>. 

X_train dimension 2. 


### Model

In [345]:
# Leaky Relu implementation
def leaky_relu(Z):
    A = np.where(Z > 0, Z, Z * 0.01)

# Faster method than the above but will test later
#     A1 = ((Z>0) * Z)
#     A2 = ((Z<=0) * Z * 0.01)
#     A = A1 + A2

    assert(A.shape == Z.shape)

    cache = Z 
    return A, cache


def leaky_relu_backward(dA, cache):
    Z = cache
    dZ = np.array(dA, copy=True) # just converting dz to a correct object.
 
    dZ[Z <= 0] = 0.01
    assert (dZ.shape == Z.shape)

    return dZ

# Sigmoid 
def sigmoid(Z):
    A = 1/(1+np.exp(-Z))
    cache = Z

    return A, cache

def sigmoid_backward(dA, cache):
    Z = cache

    s = 1/(1+np.exp(-Z))
    dZ = dA * s * (1-s)

    assert (dZ.shape == Z.shape)

    return dZ

# Relu
def relu(Z):
#     print("\nshape of Z is {}".format(Z.shape))
#     print("type of Z: " + str(type(Z)))
#     print("dimension of Z: " + str(Z.ndim))
    A = np.maximum(0,Z)

    assert(A.shape == Z.shape)

    cache = Z 
    return A, cache


def relu_backward(dA, cache):
    Z = cache
    dZ = np.array(dA, copy=True) # just converting dz to a correct object.

    # When z <= 0, you should set dz to 0 as well. 
    dZ[Z <= 0] = 0

    assert (dZ.shape == Z.shape)

    return dZ

In [346]:
# Initializing Parameters W and b

def initialize_parameters(layer_dims):
    np.random.seed(3)
    parameters = {}
    L = len(layer_dims)
    
    for layer in range(1, L):
        parameters["W" + str(layer)] = np.random.randn(layer_dims[layer], layer_dims[layer-1]) * 0.01
        parameters["b" + str(layer)] = np.zeros((layer_dims[layer], 1))
        
        assert(parameters["W" + str(layer)].shape == (layer_dims[layer], layer_dims[layer-1]))
        assert(parameters["b" + str(layer)].shape == (layer_dims[layer], 1))
    
    return parameters



# Forward Propagation with Linear function, activation functions

def linear_func_forward(A, W, b):
#     print("\nshape of A is {}".format(A.shape))
#     print("type of A: " + str(type(A)))
#     print("dimension of A: " + str(A.ndim))
#     print("\nshape of W is {}".format(W.shape))
#     print("type of W: " + str(type(W)))
#     print("dimension of W: " + str(W.ndim))
    Z= np.dot(W, A) + b
    cache = (A, W, b)
    
    return Z, cache

## Activation functions Sigmoid/leaky_Relu based on the layers
def activation_func(A_prev, W, b, actviation_choice):

    if actviation_choice == "sigmoid":
        Z, linear_cache = linear_func_forward(A_prev, W, b)
        A, activation_cache = sigmoid(Z)
        
    elif actviation_choice == "relu":
        Z, linear_cache = linear_func_forward(A_prev, W, b)
        A, activation_cache = relu(Z)
        
    elif actviation_choice == "leaky_relu":
        Z, linear_cache = linear_func_forward(A_prev, W, b)
        A, activation_cache = leaky_relu(Z)
        
    cache = (linear_cache, activation_cache)
    
    return A, cache

def forward_propagation(X, parameters):
    caches = []
    A = X
    L = len(parameters) // 2
    print("\nnumber of layers is {}".format(L))
    
    for layer in range(1, L):
        A_prev = A
        A, cache = activation_func(A_prev, parameters['W'+str(layer)], parameters['b'+str(layer)], actviation_choice="relu")
        caches.append(cache)
#         print("\nshape of A is {}".format(A.shape))
    
    AL, cache = activation_func(A, parameters['W'+str(L)], parameters['b'+str(L)], actviation_choice="sigmoid")
    caches.append(cache)
    
    return AL, caches


# Computing cost after the forward propagtion
def compute_cost(AL, Y):
    m = Y.shape[1]
    
#     print("\ntype of AL {}".format(type(AL)))
#     print("\ntype of Y {}".format(type(Y)))
    cost = -1/m* np.sum((np.dot(Y, np.log(AL).T)) + (np.dot((1-Y), np.log(1-AL).T)))
    cost = np.squeeze(cost)
    
    return cost

# Backward Propagation with linear function, activation functions
def linear_func_backward(dZ, cache):
    A_prev, W, b = cache
    m = A_prev.shape[1]
    
    dW = 1/m * np.dot(dZ, A_prev.T)
    db = 1/m * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)
    
    return dA_prev, dW, db

def actvtion_func_backward(dA, cache, activation):
    linear_cache, activation_cache = cache
    
    if activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
        dA_prev, dW, db = linear_func_backward(dZ, linear_cache)
        
    elif activation == "relu":
        dZ = relu_backward(dA, activation_cache)
        dA_prev, dW, db = linear_func_backward(dZ, linear_cache)
    
    elif activation == "leaky_relu":
        dZ = leaky_relu_backward(dA, activation_cache)
        dA_prev, dW, db = linear_func_backward(dZ, linear_cache)
        
    return dA_prev, dW, db

def backward_propagation(AL, Y, caches):
    grads = {}
    L = len(caches) 
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)
    
    dAL = - (np.divide(Y, AL) - np.divide(1-Y, 1-AL))
    
    current_cache = caches[L-1]
    dA_prev_temp, dW_temp, db_temp = actvtion_func_backward(dAL, current_cache, "sigmoid")
    grads["dA" + str(L-1)] = dA_prev_temp
    grads["dW" + str(L)] = dW_temp
    grads["db" + str(L)] = db_temp
    
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = actvtion_func_backward(grads["dA" + str(l+1)], current_cache, "relu")
        grads["dA" + str(l)] = dA_prev_temp
        grads["dW" + str(l+1)] = dW_temp
        grads["db" + str(l+1)] = db_temp
    
    return grads

# After backward propagation update parameters function is called
def update_parameters(params, grads, learning_rate):
    parameters = params.copy()
    L = len(parameters) // 2
    
    for l in range(L):
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)]-learning_rate*grads["dW" + str(l + 1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)]-learning_rate*grads["db" + str(l + 1)]
        
        
    return parameters


In [347]:
def plot_costs(costs, learning_rate=0.0075):
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (per hundreds)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

### Training Model

In [348]:
layers_dims = [35918, 40, 20, 10, 5, 1]

In [349]:
def training_model(X, Y, layer_dims, learning_rate = 0.0075, num_iterations = 3000, print_cost = False):
    np.random.seed(1)
    costs=[]
    
    parameters = initialize_parameters(layer_dims)
    
    for i in range(0, num_iterations):
        AL, caches = forward_propagation(X, parameters)
        print("\nshape of AL is {}".format(AL.shape))
        print("type of AL: " + str(type(AL)))
        print("dimension of AL: " + str(AL.ndim))
        
        cost = compute_cost(AL, Y)
        
        grads = backward_propagation(AL, Y, caches)
        
        parameters = update_parameters(parameters, grads, learning_rate)
        
        if print_cost and i % 100 == 0 or i == num_iterations - 1:
            print("Cost after iteration {}: {}".format(i, np.squeeze(cost)))
        if i % 100 == 0 or i == num_iterations:
            costs.append(cost)
    
    return parameters, costs

In [350]:
parameters, costs = training_model(X_train, y_train, layers_dims, num_iterations = 200, print_cost = True)


number of layers is 5

shape of AL is (1, 1000)
type of AL: <class 'numpy.ndarray'>
dimension of AL: 2


ValueError: shapes (1,35918) and (1000,1) not aligned: 35918 (dim 1) != 1000 (dim 0)

In [289]:
y_train.shape

(1, 35918)

In [194]:
parameters = initialize_parameters(layers_dims)
# parameters["b1"].shape