In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
from keras.preprocessing.text import hashing_trick
from keras.preprocessing.text import text_to_word_sequence
from keras.models import Sequential, load_model
from keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Fake news detection using the Passive Aggressive Classifier

### Data preparation

In [2]:
# Load the first news dataset ("news.csv", resolved relative to the working directory)
# and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer="news.csv")
df.head(n=10)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
5,6903,"Tehran, USA","\nI’m not an immigrant, but my grandparents ...",FAKE
6,7341,Girl Horrified At What She Watches Boyfriend D...,"Share This Baylee Luciani (left), Screenshot o...",FAKE
7,95,‘Britain’s Schindler’ Dies at 106,A Czech stockbroker who saved more than 650 Je...,REAL
8,4869,Fact check: Trump and Clinton at the 'commande...,Hillary Clinton and Donald Trump made some ina...,REAL
9,2909,Iran reportedly makes new push for uranium con...,Iranian negotiators reportedly have made a las...,REAL


In [3]:
# Encode the string labels as integers: REAL -> 0, FAKE -> 1.
conversion_dict = dict(REAL=0, FAKE=1)
df['label'] = df['label'].replace(to_replace=conversion_dict)

# Class balance after encoding.
df['label'].value_counts()

0    3171
1    3164
Name: label, dtype: int64

In [4]:
# Preview the frame again now that `label` holds the integer codes.
df.head(n=10)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",1
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,1
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0
5,6903,"Tehran, USA","\nI’m not an immigrant, but my grandparents ...",1
6,7341,Girl Horrified At What She Watches Boyfriend D...,"Share This Baylee Luciani (left), Screenshot o...",1
7,95,‘Britain’s Schindler’ Dies at 106,A Czech stockbroker who saved more than 650 Je...,0
8,4869,Fact check: Trump and Clinton at the 'commande...,Hillary Clinton and Donald Trump made some ina...,0
9,2909,Iran reportedly makes new push for uranium con...,Iranian negotiators reportedly have made a las...,0


### Word vectorization and model building

In [5]:
# Hold out 25% of the first dataset for testing; the fixed seed keeps the split repeatable.
x_train_first, x_test_first, y_train_first, y_test_first = train_test_split(
    df['text'],
    df['label'],
    test_size=0.25,
    random_state=7,
    shuffle=True,
)

# TF-IDF features: English stop words are removed and terms that occur in more than
# 75% of the documents are ignored.
tfidf_vectorizer = TfidfVectorizer(max_df=0.75, stop_words='english')

In [6]:
# Cast the texts to unicode strings so that any non-string entries can be vectorized.
train_docs_first = x_train_first.values.astype('U')
test_docs_first = x_test_first.values.astype('U')

# The vocabulary and IDF weights are learned from the training texts only and then
# reused, unchanged, for the test texts.
first_vec_train = tfidf_vectorizer.fit_transform(train_docs_first)
first_vec_test = tfidf_vectorizer.transform(test_docs_first)

pac = PassiveAggressiveClassifier(max_iter=50)
pac.fit(first_vec_train, y_train_first)

PassiveAggressiveClassifier(max_iter=50)

### Results

In [7]:
# Score the fitted classifier on the held-out split of the first dataset.
y_pred = pac.predict(first_vec_test)
score = accuracy_score(y_true=y_test_first, y_pred=y_pred)
print(f'PAC Accuracy: {round(score*100,2)}%')

PAC Accuracy: 92.68%


In [8]:
# Rows are the true labels, columns the predicted labels (0 = REAL, 1 = FAKE).
confusion_matrix(y_true=y_test_first, y_pred=y_pred)

array([[716,  60],
       [ 56, 752]])

In [9]:
# 5-fold cross-validation of the classifier over the complete first dataset.
# NOTE(review): the vectorizer used here was fitted on the training split only, so its
# vocabulary/IDF weights are not re-learned inside each fold.
all_docs_first = df['text'].values.astype('U')
X = tfidf_vectorizer.transform(all_docs_first)
scores = cross_val_score(estimator=pac, X=X, y=df['label'].values, cv=5)
print(f'K Fold Accuracy: {round(scores.mean()*100,2)}%')

K Fold Accuracy: 93.95%


### Trying another dataset

In [10]:
# Second dataset: the first 2500 rows of True.csv (label 0) and of Fake.csv (label 1).
df_true = pd.read_csv('True.csv', nrows=2500)
df_true['label'] = 0

# Remove the Reuters dateline prefixes from the real articles; the three patterns are
# stripped in this order for every article.
df_true_rep = [
    article.replace('WASHINGTON (Reuters) - ', '')
           .replace('LONDON (Reuters) - ', '')
           .replace('(Reuters) - ', '')
    for article in df_true['text']
]
df_true['text'] = df_true_rep

df_fake = pd.read_csv("Fake.csv", nrows=2500)
df_fake['label'] = 1

# Stack both halves (each keeps its own 0..2499 index) and drop the unused columns.
df_final = pd.concat([df_true, df_fake]).drop(['subject', 'date'], axis=1)

In [11]:
# First ten rows of the combined frame (taken from True.csv, label 0).
df_final.head(n=10)

Unnamed: 0,title,text,label
0,"As U.S. budget fight looms, Republicans flip t...",The head of a conservative Republican faction ...,0
1,U.S. military to accept transgender recruits o...,Transgender people will be allowed for the fir...,0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,The special counsel investigation of links bet...,0
3,FBI Russia probe helped by Australian diplomat...,Trump campaign adviser George Papadopoulos tol...,0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/President Donald Trump called on the U...,0
5,"White House, Congress prepare for talks on spe...","WEST PALM BEACH, Fla./The White House said on ...",0
6,"Trump says Russia probe will be fair, but time...","WEST PALM BEACH, Fla President Donald Trump sa...",0
7,Factbox: Trump on Twitter (Dec 29) - Approval ...,The following statements were posted to the ve...,0
8,Trump on Twitter (Dec 28) - Global Warming,The following statements were posted to the ve...,0
9,Alabama official to certify Senator-elect Jone...,Alabama Secretary of State John Merrill said h...,0


In [12]:
# Last ten rows of the combined frame (taken from Fake.csv, label 1).
df_final.tail(n=10)

Unnamed: 0,title,text,label
2490,REPORT: Even Republicans Are ‘Disturbed’ By T...,Donald Trump s days in the White House could b...,1
2491,Eric Trump Just Asked The Labor Department To...,"If any other American company did this, Donald...",1
2492,REPORT: Trump Considering Using National Guar...,Donald Trump reportedly wants to use the Natio...,1
2493,WATCH: Trump Assumes Black Reporter Can Set U...,After claiming that he s the least racist per...,1
2494,Trump Thinks Rush Limbaugh Is Real News And T...,Rush Limbaugh praised Donald Trump s insanely ...,1
2495,"WATCH: Fox News Host DEFENDS CNN Reporter, Te...",Even Fox News is calling Donald Trump out for ...,1
2496,GOP Senator DESPERATELY Worried About Trump’s...,"On Thursday afternoon, the world was treated t...",1
2497,WATCH: NBC Reporter Calls Trump Out For Lying...,Donald Trump finally held his first solo press...,1
2498,Tapper SLAMS Trump For BIZARRE Behavior At ‘W...,There is certainly no love lost between CNN an...,1
2499,LOL: Putin Is Angry Now Because Trump Gets Mo...,Vladimir Putin is super pissed because Russian...,1


### Model building and word vectorization

In [13]:
# Hold out 25% of the second dataset for testing. The seed is fixed (same value as for
# the first dataset's split) so that the split, and every metric computed from it,
# is identical after Restart & Run All.
x_train_second, x_test_second, y_train_second, y_test_second = train_test_split(
    df_final['text'],
    df_final['label'],
    test_size=0.25,
    random_state=7,
    shuffle=True,
)

# NOTE: this rebinds `tfidf_vectorizer`; from here on the name refers to the vectorizer
# of the second dataset, not the one fitted on the first dataset.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.75)

In [None]:
# Learn the vocabulary / IDF weights on the second dataset's training texts only and
# reuse them, unchanged, for its test texts.
second_vec_train = tfidf_vectorizer.fit_transform(x_train_second.values.astype('U'))
second_vec_test = tfidf_vectorizer.transform(x_test_second.values.astype('U'))

# The classifier shuffles the training data between epochs; seeding it makes the fitted
# model (and the scores reported below) reproducible from run to run.
# NOTE: this rebinds `pac`, which previously held the model trained on the first dataset.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=7)
pac.fit(second_vec_train, y_train_second)

### Results

In [None]:
# Score the classifier on the held-out split of the second dataset.
y_pred = pac.predict(second_vec_test)
score = accuracy_score(y_true=y_test_second, y_pred=y_pred)
print(f'PAC Accuracy: {round(score*100,2)}%')

In [None]:
# Rows are the true labels, columns the predicted labels (0 = real, 1 = fake).
confusion_matrix(y_true=y_test_second, y_pred=y_pred)

In [None]:
# 5-fold cross-validation over the complete SECOND dataset.
# The texts and labels must both come from `df_final`: the vectorizer and classifier in
# this section belong to the second dataset, so evaluating them on `df` (the first
# dataset) would report a score for the wrong data.
X = tfidf_vectorizer.transform(df_final['text'].values.astype('U'))
scores = cross_val_score(pac, X, df_final['label'].values, cv=5)
print(f'K Fold Accuracy: {round(scores.mean()*100,2)}%')

### Trying another method (Random Forest Classifier)

In [None]:
# Random forest trained on the TF-IDF features of the first dataset's training split;
# the seed makes the forest repeatable.
RFC = RandomForestClassifier(random_state=0)
RFC.fit(X=first_vec_train, y=y_train_first)

In [None]:
# Score the random forest on the held-out split of the first dataset.
y_pred = RFC.predict(first_vec_test)
score = accuracy_score(y_test_first, y_pred)
# The message names the model that was actually evaluated (it previously said "PAC").
print(f'RFC Accuracy: {round(score*100,2)}%')

In [None]:
# Rows are the true labels, columns the predicted labels (0 = REAL, 1 = FAKE).
confusion_matrix(y_true=y_test_first, y_pred=y_pred)

In [None]:
# 5-fold cross-validation of the random forest over the complete FIRST dataset.
# By this point `tfidf_vectorizer` has been re-created and fitted on the second dataset,
# so using it here would project the first dataset onto the wrong vocabulary. Rebuild the
# first dataset's vectorizer with the same settings and the same training texts instead
# (TF-IDF fitting is deterministic, so this reproduces the original feature space).
first_tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.75)
first_tfidf_vectorizer.fit(x_train_first.values.astype('U'))

X = first_tfidf_vectorizer.transform(df['text'].values.astype('U'))
scores = cross_val_score(RFC, X, df['label'].values, cv=5)
print(f'K Fold Accuracy: {round(scores.mean()*100,2)}%')

### Second dataset

In [None]:
# Fresh random forest for the second dataset, trained on its TF-IDF training features.
# This rebinds `RFC`, replacing the model trained on the first dataset.
RFC = RandomForestClassifier(random_state=0)
RFC.fit(X=second_vec_train, y=y_train_second)

In [None]:
# Score the random forest on the held-out split of the second dataset.
y_pred = RFC.predict(second_vec_test)
score = accuracy_score(y_test_second, y_pred)
# The message names the model that was actually evaluated (it previously said "PAC").
print(f'RFC Accuracy: {round(score*100,2)}%')

In [None]:
# Rows are the true labels, columns the predicted labels (0 = real, 1 = fake).
confusion_matrix(y_true=y_test_second, y_pred=y_pred)

In [None]:
# 5-fold cross-validation of the random forest over the complete SECOND dataset.
# Both the texts and the labels come from `df_final`; using `df` here would evaluate the
# first dataset with the second dataset's vectorizer.
X = tfidf_vectorizer.transform(df_final['text'].values.astype('U'))
scores = cross_val_score(RFC, X, df_final['label'].values, cv=5)
print(f'K Fold Accuracy: {round(scores.mean()*100,2)}%')

In [None]:
class LSTM_Text_Classifier(BaseEstimator, ClassifierMixin):
    """
    Binary text classifier with a Scikit-learn style interface around a Keras network:
    Embedding -> optional Conv1D/MaxPooling1D stack -> one or more LSTM layers -> sigmoid unit.

    Raw texts are converted to integer sequences either with a Keras Tokenizer (default) or
    with the hashing trick (use_hash=True) and are padded/truncated to max_seq_length before
    they reach the network.
    """

    def __init__(self, embedding_vector_length, max_seq_length, lstm_layers, batch_size=32, num_epochs=3, use_hash=False,
                 dropout=None, conv_params=None):
        """
        __init__ method: creates the tokenizer for the model and saves all of the parameters
        embedding_vector_length - the length of the word vectors that will be learned by the embedding layer
        max_seq_length - the longest sequence of words that will be taken into account by the classifier (ie. 500 words)
        lstm_layers - a list with the number of LSTM units in each recurrent layer (at least one entry)
        batch_size - the batch size used for training the model
        num_epochs - the maximum number of epochs to train for
        use_hash - whether or not to use the hashing trick for word indexing
        dropout - the dropout rate used in the dropout layers of the model (None disables dropout)
        conv_params - a dictionary with parameters for the convolutional part of the model; keys read are
                      'n_layers', 'filters', 'kernel_size' and, optionally, 'pool_size' (None disables the conv part)
        """

        self.embedding_vector_length = embedding_vector_length
        self.max_seq_length = max_seq_length
        self.lstm_layer_sizes = lstm_layers
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.use_hashing_trick = use_hash
        # The tokenizer is only needed when words are indexed through a learned vocabulary.
        if not self.use_hashing_trick:
            self.tokenizer = Tokenizer()
        self.dropout = dropout
        self.conv_params = conv_params

    def _get_word_index(self, word):
        """
        This function retrieves the index for a given word using the tokenizer.
        Returns None when the word is unknown or when no tokenizer exists (hashing mode).
        """

        tokenizer = getattr(self, 'tokenizer', None)
        if tokenizer is None:
            return None
        return tokenizer.word_index.get(word)

    def _text_to_int_sequence(self, text):
        """
        This function converts a string of text into an integer sequence, using the same word indexing
        scheme (hashing trick or tokenizer vocabulary) that was chosen at construction time.
        """

        if self.use_hashing_trick:
            # Hashing needs no fitted vocabulary, only the table size computed in fit().
            return hashing_trick(text, self.max_vocab, hash_function='md5')

        seq = [self._get_word_index(word) for word in text_to_word_sequence(text)]
        # Words missing from the tokenizer vocabulary map to None and are dropped.
        return [index for index in seq if index]

    def _prepare_sequences(self, X):
        """
        This function converts raw text input into the padded integer matrix fed to the network.
        X may be a single string, a pandas Series, a single-column DataFrame or any iterable of strings.
        The input object itself is never modified.
        """

        if isinstance(X, str):
            # A lone string is one sample, not a sequence of samples.
            texts = [X]
        elif isinstance(X, pd.DataFrame):
            if X.shape[1] != 1:
                raise ValueError('Expected a DataFrame with exactly one text column, got %d columns' % X.shape[1])
            texts = X.iloc[:, 0]
        else:
            texts = X

        int_sequences = [self._text_to_int_sequence(text) for text in texts]
        return sequence.pad_sequences(int_sequences, maxlen=self.max_seq_length)

    def fit(self, X, y, validation_data):
        """
        This fit function is analagous to the Scikit-learn fit function used for its estimator API.
        X (pandas Series) - features (text data)
        y (array-like) - target (class labels)
        validation_data - a tuple with the validation features (pandas Series) and targets

        Returns the fitted estimator (self).
        """

        if len(self.lstm_layer_sizes) == 0:
            raise ValueError('lstm_layers must contain at least one layer size')

        X_valid, y_valid = validation_data[0], validation_data[1]

        # The vocabulary is derived from the training and validation texts together.
        all_X = pd.concat([X, X_valid])
        if self.use_hashing_trick:
            all_words = set()
            for text in all_X:
                all_words.update(text_to_word_sequence(text))
            # Hash table size: 30% more slots than distinct words (presumably to limit hash
            # collisions). It must be an integer because it is used both as the hash modulus and
            # as the embedding input dimension.
            self.max_vocab = max(int(len(all_words) * 1.3), 2)
        else:
            print('Fitting Tokenizer...')
            self.tokenizer.fit_on_texts(all_X)
            # Embedding input dimension: vocabulary size plus a safety margin of 20.
            self.max_vocab = len(self.tokenizer.word_index) + 20

        X_pad = self._prepare_sequences(X)
        X_valid_pad = self._prepare_sequences(X_valid)

        self.model = Sequential()
        self.model.add(Embedding(self.max_vocab, self.embedding_vector_length, input_length=self.max_seq_length))

        if self.conv_params is not None:
            pool_size = self.conv_params.get('pool_size')

            # The number of filters grows with depth: 2*filters, 4*filters, 6*filters, ...
            for i in range(self.conv_params['n_layers']):
                self.model.add(Conv1D(filters=2*(i+1)*self.conv_params['filters'],
                                      kernel_size=self.conv_params['kernel_size'],
                                      padding='same', activation='relu'))
                if pool_size is not None:
                    self.model.add(MaxPooling1D(pool_size=pool_size))

        # Every LSTM layer except the last returns the full sequence so the next one can consume it.
        for lstm_layer_size in self.lstm_layer_sizes[:-1]:
            self.model.add(LSTM(lstm_layer_size, return_sequences=True))
            if self.dropout is not None:
                self.model.add(Dropout(self.dropout))
        self.model.add(LSTM(self.lstm_layer_sizes[-1]))
        if self.dropout is not None:
            self.model.add(Dropout(self.dropout))

        self.model.add(Dense(1, activation='sigmoid'))
        self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        # Stop as soon as the validation accuracy fails to improve for one epoch.
        early_stopping = EarlyStopping(monitor='val_accuracy',
                                       min_delta=0,
                                       patience=1,
                                       verbose=2, mode='max')

        # Keep the weights of the best epoch on disk.
        # NOTE(review): newer Keras releases appear to require this path to end in `.keras` or
        # `.h5` - confirm against the installed version before changing it.
        checkpoint = ModelCheckpoint(filepath='best_model',
                                     monitor='val_accuracy',
                                     mode='max',
                                     save_best_only=True)

        callbacks_list = [early_stopping, checkpoint]
        # summary() prints the table itself; wrapping it in print() only adds a stray "None".
        self.model.summary()

        print('Fitting model...')
        self.model.fit(X_pad, y, validation_data=(X_valid_pad, y_valid),
                       epochs=self.num_epochs, batch_size=self.batch_size, callbacks=callbacks_list)

        return self

    def predict(self, X):
        """
        This function is analagous to the Scikit-learn predict function used for its estimator API. It first preprocesses the text
        data and converts it into an integer sequence.
        X (str, pandas Series/DataFrame or iterable of str) - input text data

        Returns the raw output of the Keras model (the sigmoid activations), not class labels;
        use predict_classes for hard 0/1 predictions.
        """

        return self.model.predict(self._prepare_sequences(X))

    def predict_classes(self, X):
        """
        This function returns hard class predictions (0 or 1) for the given texts. It first preprocesses the text data and
        converts it into an integer sequence, then thresholds the sigmoid output at 0.5. The thresholding is done here
        rather than through the Keras model's own predict_classes method, which newer Keras releases no longer provide.
        """

        probabilities = self.model.predict(self._prepare_sequences(X))
        return (probabilities > 0.5).astype('int32')

    def predict_proba(self, X):
        """
        This function returns the sigmoid output of the network for the given texts. It first preprocesses the text data
        and converts it into an integer sequence. The Keras model's predict method is used directly, since newer Keras
        releases no longer provide predict_proba on the model.
        """

        return self.model.predict(self._prepare_sequences(X))

    def load_model(self, file_path):
        """
        This function is a wrapper over the Keras load_model function.
        Only the network is restored; the tokenizer vocabulary / hash table size set by fit() is not.
        """

        self.model = load_model(file_path)

    def score(self, X, y):
        "This function computes the accuracy of the hard class predictions for X against the labels y."

        # Accuracy needs discrete labels, so score on the thresholded predictions (flattened
        # from shape (n, 1) to (n,)) instead of the raw probabilities returned by predict().
        pred = np.asarray(self.predict_classes(X)).ravel()
        return accuracy_score(y, pred)