In [None]:
import io
import os
import pickle
import zipfile

import numpy as np
import pandas as pd
import requests
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
# GridSearchCV is imported from sklearn.model_selection: the legacy
# sklearn.grid_search module was removed in scikit-learn 0.20.
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

### Import data

In [55]:
# Download the GloVe 6B archive and unpack it into the directory the
# embedding-loading cell reads from (../models/embeddings/glove.6B.<dims>d.txt).
GLOVE_URL = "https://nlp.stanford.edu/data/glove.6B.zip"
GLOVE_DIR = "../models/embeddings"

if os.path.exists(os.path.join(GLOVE_DIR, "glove.6B.300d.txt")):
    # The archive is large; skip the download when the vectors are already on disk
    # so the notebook can be re-run end to end.
    print('embeddings already present, skipping download')
else:
    r = requests.get(GLOVE_URL, timeout=60)
    r.raise_for_status()  # fail loudly instead of trying to unzip an HTTP error page
    print('passed request')
    z = zipfile.ZipFile(io.BytesIO(r.content))
    print('got zip')
    # extractall() takes a target *directory*. The original passed
    # 'glove.6B.50d.txt', which created a folder of that name instead of
    # placing the vector files where they are later opened.
    z.extractall(GLOVE_DIR)

KeyboardInterrupt: 

In [16]:
# Load the labelled news articles and use the CSV's "Unnamed: 0" column as the index.
df = pd.read_csv("../nlp_dataset/raw/fake_or_real_news.csv").set_index("Unnamed: 0")
df.head()

Unnamed: 0_level_0,title,text,label
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


### Map words to GloVe embeddings

In [17]:
embeddings_dims = 300

# Each line of the GloVe text file is "<token> <v1> ... <vN>"; it is read as
# raw bytes and the tokens are decoded to str only after parsing.
glove_path = '../models/embeddings/glove.6B.' + str(embeddings_dims) + 'd.txt'
with open(glove_path, 'rb') as f:
    word_entries = f.readlines()

# One matrix row per vocabulary entry, filled in file order.
glove_weights = np.zeros((len(word_entries), embeddings_dims))
words = []
for row, line in enumerate(word_entries):
    fields = line.split()
    words.append(fields[0])
    glove_weights[row] = np.array(list(map(float, fields[1:])))

word_vocab = [token.decode("utf-8") for token in words]
# token -> embedding row lookup used by transform()
word2vec = {token: vector for token, vector in zip(word_vocab, glove_weights)}

#### Text preprocessing

In [18]:
def _normalize_text(column):
    """Lower-case a text column and normalise its punctuation and whitespace.

    Steps, in order:
      1. lower-case;
      2. drop every character that is not a word character, whitespace or '.';
      3. collapse runs of periods into a single period;
      4. surround each period with spaces so it becomes its own token;
      5. collapse runs of two or more whitespace characters into one space;
      6. strip leading/trailing whitespace.

    ``regex=True`` is passed explicitly: since pandas 2.0 ``Series.str.replace``
    treats the pattern as a literal string by default, which would make every
    substitution below silently match nothing.
    """
    return (
        column.str.lower()
        .str.replace(r'[^\.\w\s]', '', regex=True)
        .str.replace(r'\.\.+', '.', regex=True)
        .str.replace(r'\.', ' . ', regex=True)
        .str.replace(r'\s\s+', ' ', regex=True)
        .str.strip()
    )


# Apply the same cleaning pipeline to both text columns.
df.title = _normalize_text(df.title)
df.text = _normalize_text(df.text)

def tokenize(text):
    """Split ``text`` into a list of tokens on runs of whitespace."""
    return text.split()
    
def remove_stopwords(tokens):
    """Return ``tokens`` without English stop words, preserving order.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter.

    Returns
    -------
    list of str
        The tokens that are not in NLTK's English stop-word list.
    """
    # Look the stop words up once per call and keep them in a set. The
    # original evaluated stopwords.words('english') for every single token
    # and then scanned the returned list linearly.
    english_stopwords = set(stopwords.words('english'))
    return [token for token in tokens if token not in english_stopwords]

def transform(df, word2vec):
    """Embed each document as the mean of its known word vectors.

    Parameters
    ----------
    df : mapping or DataFrame
        Must provide ``df['text_tokens']``: one iterable of tokens per document.
    word2vec : mapping
        Token -> embedding vector lookup (supports ``in`` and ``[]``).

    Returns
    -------
    numpy.ndarray
        One row per document. Documents with no token present in
        ``word2vec`` get an all-zero vector.
    """
    doc_means = []
    vector_shape = None
    for tokens in df['text_tokens']:
        vectors = [word2vec[token] for token in tokens if token in word2vec]
        if vectors:
            doc_mean = np.mean(vectors, axis=0)
            vector_shape = doc_mean.shape
        else:
            doc_mean = None  # filled with zeros once the vector width is known
        doc_means.append(doc_mean)

    unmatched = [i for i, doc_mean in enumerate(doc_means) if doc_mean is None]
    if unmatched:
        # Size the zero vectors from the embeddings actually seen so the rows
        # always stack into a rectangular array. The notebook-level
        # `embeddings_dims` is only consulted when no document matched at all.
        fallback_shape = vector_shape if vector_shape is not None else embeddings_dims
        for i in unmatched:
            doc_means[i] = np.zeros(fallback_shape)
    return np.array(doc_means)

### Build features

In [19]:
# Tokenise the article bodies, embed every article via transform(), and
# encode the label as 1 for 'REAL' and 0 otherwise.
df['text_tokens'] = df['text'].map(tokenize)
#df['title_tokens'] = df['title'].apply(tokenize).apply(remove_stopwords)
X = transform(df, word2vec)
y = df['label'].eq('REAL').astype(int)

In [20]:
# Sanity check: features and labels should have the same number of rows.
feature_shape, label_shape = X.shape, y.shape
print('Input data dimensions', feature_shape, label_shape)

Input data dimensions (6335, 300) (6335,)


In [37]:
# Hold out 25% of the articles for evaluation; the fixed seed keeps the split repeatable.
split = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=53,
)
X_train, X_test, y_train, y_test = split

### Fit and evaluate models

In [22]:
# Candidate models to compare. Estimators with internal randomness get a
# fixed seed so the reported accuracies are reproducible across runs.
RANDOM_STATE = 53

# Name and estimator are kept side by side so the two lists derived below
# can never drift out of alignment.
named_classifiers = [
    ("Nearest Neighbors", KNeighborsClassifier(3)),
    ("Linear SVM", SVC(kernel="linear", C=0.025)),
    ("RBF SVM", SVC(gamma=2, C=1)),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE)),
    ("Random Forest", RandomForestClassifier(max_depth=8, n_estimators=100,
                                             random_state=RANDOM_STATE)),
    ("Neural Net", MLPClassifier(alpha=1, random_state=RANDOM_STATE)),
    ("ExtraTreesClassifier", ExtraTreesClassifier(n_estimators=200,
                                                  random_state=RANDOM_STATE)),
]

names = [name for name, _ in named_classifiers]
classifiers = [estimator for _, estimator in named_classifiers]

In [23]:
# Fit each candidate on the training split and report its held-out accuracy.
for label, model in zip(names, classifiers):
    fitted = model.fit(X_train, y_train)
    accuracy = metrics.accuracy_score(y_test, fitted.predict(X_test))
    print("Classifier {0}, Accuracy {1}".format(label, round(accuracy, 2)))

Classifier Nearest Neighbors, Accuracy 0.84
Classifier Linear SVM, Accuracy 0.79
Classifier RBF SVM, Accuracy 0.9
Classifier Decision Tree, Accuracy 0.75
Classifier Random Forest, Accuracy 0.86
Classifier Neural Net, Accuracy 0.85
Classifier ExtraTreesClassifier, Accuracy 0.87


### RBF Hyperparameter Selection

In [51]:
# Exhaustive 5-fold cross-validated search over C and gamma for the SVC
# (default kernel), fitted on the training split only.
gamma_range = [1, 2, 4]
C_range = [1.5, 2, 3, 4]
parameters = [{'gamma': gamma_range, 'C': C_range}]
# n_jobs=-1 evaluates the parameter/fold combinations in parallel on all
# cores; the serial search was slow enough to be interrupted by hand.
grid = GridSearchCV(estimator=SVC(), param_grid=parameters, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
best_gamma = grid.best_params_['gamma']
best_C = grid.best_params_['C']
# SVC is a classifier; the original message called it a "Regressor".
print('Best parameters for Support Vector Classifier: C =',best_C,'gamma =',best_gamma)

KeyboardInterrupt: 

In [38]:
# Model Training
# Fit the RBF SVM with probability estimates enabled and score it on the
# held-out split.
SVM_rbf = SVC(C=2, gamma=2, probability=True)
SVM_rbf.fit(X_train, y_train)
pred = SVM_rbf.predict(X_test)
score = metrics.accuracy_score(y_test, pred)
print('Accuracy',score)

Accuracy 0.905303030303


### Serialize RBF classifier

In [39]:
# Refit the chosen model on the full data set (train + test) and persist it.
SVM_rbf = SVC(C=2,gamma=2)
SVM_rbf.probability=True
SVM_rbf.fit(X,y)
# The file name (including its 'emdedding' spelling) is kept unchanged so
# anything that loads the model by this path keeps working.
filename = '../models/classifiers/rbf_emdedding_300.pkl'
# Use a context manager so the file handle is flushed and closed; the
# original left the handle returned by open() dangling.
with open(filename, 'wb') as model_file:
    pickle.dump(SVM_rbf, model_file)

# NOTE(review): `probs` is not defined in any cell visible here, so this line
# fails on a fresh kernel. Presumably it held SVM_rbf.predict_proba(...) output
# from a deleted cell — confirm and either define it or remove this line.
probs[3]