In [20]:
import pandas as pd
import numpy as np
import itertools
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline

In [21]:
# Read the raw news CSV and promote its unnamed leading column to the index
# in a single chained expression, then preview the first rows.
df = (
    pd.read_csv("../nlp_dataset/raw/fake_or_real_news.csv")
    .set_index("Unnamed: 0")
)
df.head()

Unnamed: 0_level_0,title,text,label
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


### Text Preprocessing

In [22]:
# Normalise the article bodies. Every pattern below is a regular expression, so
# `regex=True` is passed explicitly: pandas >= 2.0 defaults to literal matching,
# which would silently leave the text unchanged.
df['text'] = (
    df['text']
    .str.lower()
    # drop every character that is not a word character, whitespace or a period
    .str.replace(r'[^\.\w\s]', '', regex=True)
    # collapse runs of two or more periods into a single period
    .str.replace(r'\.\.+', '.', regex=True)
    # pad each period with spaces so it is split off as its own token
    .str.replace(r'\.', ' . ', regex=True)
    # collapse runs of whitespace into a single space
    .str.replace(r'\s\s+', ' ', regex=True)
    # trim leading/trailing whitespace
    .str.strip()
)

In [23]:
def get_text_length(x):
    """Return the number of whitespace-separated tokens in the string ``x``."""
    tokens = x.split()
    return len(tokens)

# Store the per-article token count, then report summary statistics for it.
df['text_lenght'] = df['text'].apply(get_text_length)

word_counts = df['text_lenght']
print('Average number of words for input text', word_counts.mean())
print('Minimum number of words for input text', word_counts.min())
print('Maximum number of words for input text', word_counts.max())

Average number of words for input text 812.597790055
Minimum number of words for input text 0
Maximum number of words for input text 22157


In [24]:
# Hold out 30% of the articles for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.3,
    random_state=53,
)

print('Train data dims', X_train.shape, y_train.shape)
print('Test data dims', X_test.shape, y_test.shape)

Train data dims (4434,) (4434,)
Test data dims (1901,) (1901,)


In [7]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a short status line and plot the confusion matrix as a heatmap.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix; rows are indexed as true labels and columns as
        predicted labels (matching the axis labels drawn below).
    classes : sequence
        Tick labels, used for both axes.
    normalize : bool, default False
        When True, each row is divided by its sum before plotting, so both
        the colours and the cell annotations show per-row fractions.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing so the heatmap colours and the text annotations
    # are computed from the same values.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows with no samples stay at 0 instead of producing NaN/inf.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is; fractions are rounded to two decimals
    # so the annotations stay readable.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Called last so the axis labels are taken into account by the layout.
    plt.tight_layout()

#### Preprocessing: Count vectors and TF-IDF vectors

In [25]:
# Bag-of-words counts: the vocabulary is learned on the training split only
# and then reused to encode the test split.
count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

# TF-IDF weights: same fit-on-train / transform-on-test pattern, with
# max_df=0.7 passed to the vectorizer.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [26]:
# Report the (documents, vocabulary) shape of both training matrices.
tfidf_dims = tfidf_train.shape
count_dims = count_train.shape
print('tfidf dataset dims', tfidf_dims)
print('count dataset dims', count_dims)

tfidf dataset dims (4434, 72739)
count dataset dims (4434, 72739)


In [27]:
# Peek at both ends of the learned vocabulary.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
# The vocabulary is fetched once instead of once per print.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    # The newer API returns an ndarray; convert to a plain list of str so the
    # printed output keeps the familiar list form.
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

print(tfidf_feature_names[-10:])
print(tfidf_feature_names[:10])

['تنجح', 'حلب', 'عربي', 'عن', 'لم', 'ما', 'محاولات', 'من', 'هذا', 'والمرضى']
['00', '000', '0000', '00000031', '000035', '00006', '0001pt', '0008', '000oz', '0011']


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Seed for the estimators that use randomness internally, so that the reported
# accuracies are reproducible across runs (same value as the train/test split).
RANDOM_STATE = 53

# Display names, index-aligned with `classifiers` below.
names = ["Nearest Neighbors", "Multinomial Naive Bayes", "Linear SVM", "RBF SVM",
         "Decision Tree", "Random Forest", "Neural Net"]

# Candidate models; hyper-parameters are unchanged apart from the added
# `random_state` on the stochastic estimators.
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    MultinomialNB(),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1,
                           random_state=RANDOM_STATE),
    MLPClassifier(alpha=1, random_state=RANDOM_STATE)]

# Guard against the two parallel lists drifting apart.
assert len(names) == len(classifiers)

In [29]:
# Train every classifier twice -- once on raw counts, once on TF-IDF weights --
# and report the held-out accuracy for each representation.
for name, clf in zip(names, classifiers):
    # Count features.
    clf.fit(count_train, y_train)
    pred_count = clf.predict(count_test)
    score_count = metrics.accuracy_score(y_test, pred_count)

    # TF-IDF features (calling fit again re-trains the same estimator object).
    clf.fit(tfidf_train, y_train)
    pred_tdfidf = clf.predict(tfidf_test)
    score_tdidf = metrics.accuracy_score(y_test, pred_tdfidf)

    print("Classifier {0}, Accuracy (count): {1}, Accuracy (tdfidf): {2}".format(name, score_count,score_tdidf))
    # Optional: plot the confusion matrix for the TF-IDF predictions.
    #cm = metrics.confusion_matrix(y_test, pred_tdfidf, labels=['FAKE', 'REAL'])
    #plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])

Classifier Nearest Neighbors, Accuracy (count): 0.8106259863229879, Accuracy (tdfidf): 0.5476065228826933
Classifier Multinomial Naive Bayes, Accuracy (count): 0.9005786428195687, Accuracy (tdfidf): 0.855339295107838
Classifier Linear SVM, Accuracy (count): 0.9153077327722251, Accuracy (tdfidf): 0.7201472908995266
Classifier RBF SVM, Accuracy (count): 0.5165702261967385, Accuracy (tdfidf): 0.9032088374539716
Classifier Decision Tree, Accuracy (count): 0.7822198842714361, Accuracy (tdfidf): 0.7985270910047344
Classifier Random Forest, Accuracy (count): 0.4960547080483956, Accuracy (tdfidf): 0.6012624934245134
Classifier Neural Net, Accuracy (count): 0.9237243556023146, Accuracy (tdfidf): 0.9163598106259864
