## Import


In [38]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import keras
from keras import backend as K
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Embedding, Input, RepeatVector
from keras.optimizers import SGD
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from utils import getEmbeddings
import os
import nltk
nltk.download('popular')
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

## Load and prepare data

In [0]:
# The embedding arrays are cached on disk as .npy files; getEmbeddings is only
# invoked when at least one of the four cache files is missing.
cache_paths = ('./xtr.npy', './xte.npy', './ytr.npy', './yte.npy')
cache_complete = all(os.path.isfile(path) for path in cache_paths)

if not cache_complete:
    xtr, xte, ytr, yte = getEmbeddings("fakeReal.csv")
    # np.save appends the ".npy" suffix to file names that lack it.
    for stem, array in (('./xtr', xtr), ('./xte', xte), ('./ytr', ytr), ('./yte', yte)):
        np.save(stem, array)

# Load the arrays from the cache files in both cases.
xtr, xte, ytr, yte = [np.load(path) for path in cache_paths]

# Naive Bayes Model

In [29]:
# Gaussian Naive Bayes baseline: train on (xtr, ytr), predict on xte.
gnb = GaussianNB().fit(xtr, ytr)
y_pred = gnb.predict(xte)

# Accuracy = share of test samples whose prediction equals the label in yte.
total = yte.shape[0]
wrong = (yte != y_pred).sum()
accuracy_pct = (total - wrong) / total * 100
print("Accuracy = " + format(accuracy_pct, '.2f') + "%")

Accuracy = 79.79%


# Support Vector Machine (SVM)

In [30]:
# Support-vector classifier with default SVC() arguments, trained on
# (xtr, ytr) and scored on (xte, yte).
clf = SVC().fit(xtr, ytr)
# y_pred is reassigned here with the SVM predictions.
y_pred = clf.predict(xte)

n_samples = yte.shape[0]
n_errors = (yte != y_pred).sum()
accuracy_pct = (n_samples - n_errors) / n_samples * 100
print("Accuracy = " + format(accuracy_pct, '.2f') + "%")



Accuracy = 85.40%


# Neural net with Keras

## Construct model

In [0]:
def create_model(input_dim=300, num_classes=2):
    '''Build and compile a feed-forward classifier with 3 hidden layers.

    Architecture: Dense(256) -> Dropout(0.3) -> Dense(256) -> Dropout(0.5)
    -> Dense(80) -> Dense(num_classes, softmax). The hidden layers use ReLU
    and every Dense layer uses the 'normal' kernel initializer.

    Args:
        input_dim: Number of features per input sample. Defaults to 300,
            the value that used to be hard-coded here.
        num_classes: Number of units in the softmax output layer.
            Defaults to 2, the value that used to be hard-coded here.

    Returns:
        The compiled Sequential model (categorical cross-entropy loss,
        SGD optimizer with Nesterov momentum, 'accuracy' metric).
    '''
    model = Sequential()
    model.add(Dense(256, input_dim=input_dim, activation='relu', kernel_initializer='normal'))
    model.add(Dropout(0.3))
    model.add(Dense(256, activation='relu', kernel_initializer='normal'))
    model.add(Dropout(0.5))
    model.add(Dense(80, activation='relu', kernel_initializer='normal'))
    # One output unit per class; softmax pairs with the categorical
    # cross-entropy loss configured below.
    model.add(Dense(num_classes, activation="softmax", kernel_initializer='normal'))

    # gradient descent
    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)

    # configure the learning process of the model
    model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
    return model

## Model initialization

In [32]:
# Build a freshly compiled network and print its layer-by-layer summary.
model = create_model()
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 256)               77056     
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 80)                20560     
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 162       
Total params: 163,570
Trainable params: 163,570
Non-trainable params: 0
_________________________________________________________________


## Split data for train and test, label encoding

In [33]:
# Hold out 20% of the training arrays (xtr, ytr); random_state=42 keeps the
# split reproducible.
# NOTE(review): x_test / y_test are a slice of xtr / ytr, not the xte / yte
# arrays that the Naive Bayes and SVM cells predict on.
x_train, x_test, y_train, y_test = train_test_split(xtr, ytr, test_size=0.2, random_state=42)
label_encoder = LabelEncoder()
# Fit the encoder on the training labels. As the cell's last expression,
# the call's return value is echoed as the cell output.
label_encoder.fit(y_train)

LabelEncoder()

## Convert the class vectors (integers) to binary class matrices (one-hot encoding)

In [0]:
# One-hot encode the labels for the softmax output layer.
# The encoder was already fit on y_train in the split cell, so it is only
# *applied* here. Re-fitting it on y_test could assign different integer
# codes if the two label sets differ, silently misaligning the train and
# test targets.
# num_classes is pinned so both matrices have the same number of columns
# even if a class happens to be absent from one of the splits.
n_classes = len(label_encoder.classes_)
encoded_y = np_utils.to_categorical(label_encoder.transform(y_train), num_classes=n_classes)
encoded_y_test = np_utils.to_categorical(label_encoder.transform(y_test), num_classes=n_classes)

## Model fitting

In [35]:
# Train on the 80% split for 20 epochs with a batch size of 64.
# The return value of fit() is kept in `estimator`.
estimator = model.fit(x_train, encoded_y, epochs=20, batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Check results

In [36]:
# score[1] is read as the accuracy: the model was compiled with
# metrics=['accuracy'], so evaluate() is expected to return [loss, accuracy].
# NOTE(review): this is measured on the 20% slice held out from xtr / ytr,
# whereas the Naive Bayes and SVM accuracies are measured on xte / yte, so
# the three figures are not computed on the same samples.
score = model.evaluate(x_test, encoded_y_test)
print("Accuracy = " + format(score[1]*100, '.2f') + "%")   

Accuracy = 90.43%


In [66]:
# Shape probe: one sample viewed as a column vector is (n_features, 1).
x_test[100].reshape(-1,1).shape

(300, 1)

In [70]:
# Display sample 85 reshaped to a single-row 2-D array, the same form that
# is passed to the model in the prediction cell.
x_test[85].reshape(1,-1)

array([[ 1.53250873e-01, -4.43218686e-02, -3.31966043e-01,
        -1.37849927e-01, -3.70268196e-01, -2.12997824e-01,
        -1.06189713e-01, -1.87523533e-02,  7.77944252e-02,
        -2.73529440e-01,  2.64761239e-01,  1.71496272e-01,
        -5.68335801e-02, -1.09186783e-01,  1.66402847e-01,
         1.16373330e-01,  3.52148980e-01, -1.38787895e-01,
        -3.43667120e-01, -3.94975811e-01, -2.02459767e-01,
         2.54456811e-02, -8.30528364e-02, -4.76050451e-02,
        -1.13141023e-01, -2.82045722e-01, -2.28973299e-01,
        -2.44740695e-01, -2.67489165e-01,  5.27436852e-01,
         1.73873872e-01, -2.06270233e-01, -1.98520869e-01,
         2.44539246e-01, -6.07778072e-01, -2.29894802e-01,
        -1.21521614e-01, -4.76822108e-01, -8.37116539e-02,
         1.72706619e-01, -6.71446174e-02, -1.39223590e-01,
        -2.55039990e-01,  5.82374215e-01,  3.54930282e-01,
        -5.54421186e-01, -2.71880720e-03, -2.63425767e-01,
         3.68882507e-01,  3.94606501e-01, -1.40710577e-0

In [0]:
# NOTE(review): `df` is not used by any cell visible here; confirm it is
# still needed.
df = pd.read_csv('fakeReal.csv')
# Repeats the earlier split with identical arguments (same test_size and
# random_state).
# NOTE(review): the first target is `x_train_` (trailing underscore), so
# `x_train` is not reassigned by this line. This looks like a typo; confirm.
x_train_, x_test, y_train, y_test = train_test_split(xtr, ytr, test_size=0.2, random_state=42)

In [69]:
# Predict the class index of one held-out sample (row 85), reshaped to a
# single-row batch. Sequential.predict_classes is deprecated and removed in
# newer Keras/TensorFlow releases; taking the argmax over the softmax output
# of predict() returns the same class index and works across versions.
np.argmax(model.predict(x_test[85].reshape(1,-1), batch_size=64, verbose=1), axis=-1)



array([1])