# Classifiers

## Preparation

In [1]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, LSTM, SimpleRNN, GRU, Bidirectional
# from transformers import BertTokenizer, TFBertForSequenceClassification
# from transformers import InputExample, InputFeatures

# NOTE(review): machine-specific absolute path; on any other machine the NLTK
# corpora must be discoverable through NLTK's default search locations instead.
NLTK_DATA_DIR = "D:\\Environment\\nltk_data"
nltk.data.path.append(NLTK_DATA_DIR)

### Data reading/cleaning

In [2]:
# Load the pre-split train / test CSVs (paths are relative to the notebook's
# working directory), then preview the first training rows.
train_csv_path = '../data/WELFake_clean_train.csv'
test_csv_path = '../data/WELFake_clean_test.csv'

df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

df_train.head(5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,45905,Exclusive: Foreign Isis Fighters Defend Mosul ...,\nForeign fighters for Isis are choosing to ...,1
1,37291,JUDGE JEANINE UNLOADS On Hillary: “How Did You...,You don t want to miss a second of Judge Jeani...,1
2,46730,Gunman attacks Saudi security forces at gate o...,RIYADH (Reuters) - Two Saudi guards were shot ...,0
3,66327,Indian Software Mogul: Hire Americans Now Beca...,A leading Indian software entrepreneur says In...,0
4,58329,Rep. Diaz-Balart: Liberals Against Trump Who F...,Florida Congressman Mario attacked the “doub...,0


In [3]:
# Preview the test split; DataFrame.head() shows five rows by default.
df_test.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,50990,BLM Rapper & Bill’s Alleged Son Have Nasty Sur...,BLM Rapper & Bill’s Alleged Son Have Nasty Sur...,1
1,41146,US Airstrike Killed Five Al-Qaeda Members in Y...,Get short URL 0 2 0 0 The US military killed f...,1
2,48389,"Zach Johnson, Pieters share lead at Firestone",(Reuters) - Late birdies from Thomas Pieters a...,0
3,55759,Re: WOW! What Josh Earnest admitted about Obam...,WOW! What Josh Earnest admitted about Obamacar...,1
4,21412,Memorial Day provides respite from VA controve...,Memorial Day is a time to remember those who g...,0


In [4]:
def clean_text(text):
    """Normalise one raw document into lowercase, letters-only text.

    Steps, in order:
      1. drop anything that looks like an HTML tag (``<...>`` on a single line),
      2. drop tokens starting with ``http`` or ``www`` (URLs),
      3. drop every character that is not an ASCII letter or whitespace,
      4. lowercase the result.

    Removed characters are deleted, not replaced by a space, so ``don't``
    becomes ``dont`` and whitespace around removed tokens is left as-is.

    Parameters
    ----------
    text : str or missing value
        Raw document. Non-string input (e.g. the float NaN or None that pandas
        produces for an empty CSV cell) is treated as an empty document.

    Returns
    -------
    str
        The cleaned text, or ``''`` for non-string input.
    """
    # Guard: re.sub raises TypeError on NaN/None, which would abort the whole
    # DataFrame.apply over a column that contains a single missing cell.
    if not isinstance(text, str):
        return ''
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    # `http\S+` already covers https URLs, so no separate alternative is needed.
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove special characters and numbers
    return text.lower()

In [5]:
# Stop-word list and lemmatizer are built once and shared by every document.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def preprocess_tokens(raw_text):
    """Turn one raw document into a list of lemmatized, stop-word-free tokens.

    Applies, in order: ``clean_text`` -> ``word_tokenize`` -> stop-word removal
    -> WordNet lemmatization. A value that is already a list is returned
    unchanged, so re-running this cell on already-processed columns does not
    fail or process the tokens a second time.
    """
    if isinstance(raw_text, list):
        return raw_text
    tokens = word_tokenize(clean_text(raw_text))
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]


# Same treatment for both splits and both text columns. As before, 'title' and
# 'text' end up holding token lists, and 'joined' holds the space-joined title
# followed by the space-joined body.
for frame in (df_train, df_test):
    for column in ('title', 'text'):
        frame[column] = frame[column].apply(preprocess_tokens)
    frame['joined'] = frame['title'].apply(' '.join) + ' ' + frame['text'].apply(' '.join)

X_train = df_train['joined']
y_train = df_train['label']
X_test = df_test['joined']
y_test = df_test['label']

# Fit the TF-IDF vocabulary on the training split only; the test split is
# transformed with that same vocabulary.
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

X_train

## Standard Machine Learning Methods

### Logistic Regression

In [None]:
def train_logistic_regression(X_train, y_train):
    """Fit and return a LogisticRegression classifier (max_iter=1000).

    X_train is the feature matrix and y_train the matching labels.
    """
    # scikit-learn estimators return themselves from fit().
    return LogisticRegression(max_iter=1000).fit(X_train, y_train)

def evaluate_model(model, X_test, y_test):
    """Print accuracy and the per-class classification report for `model`.

    Predictions come from ``model.predict(X_test)`` and are scored against
    ``y_test``. Nothing is returned; results are printed only.
    """
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions)
    print('Accuracy:', accuracy)
    print(report)

In [None]:
# Train on the TF-IDF training matrix, then score on the held-out test matrix.
lr_model = train_logistic_regression(X_train=X_train_tfidf, y_train=y_train)
evaluate_model(model=lr_model, X_test=X_test_tfidf, y_test=y_test)

Accuracy: 0.9538809237940532
              precision    recall  f1-score   support

           0       0.96      0.95      0.95      6907
           1       0.95      0.96      0.96      7252

    accuracy                           0.95     14159
   macro avg       0.95      0.95      0.95     14159
weighted avg       0.95      0.95      0.95     14159



### Naive Bayes

In [None]:
def train_naive_bayes(X_train, y_train):
    """Fit and return a MultinomialNB classifier with default settings.

    X_train is the feature matrix and y_train the matching labels.
    """
    # scikit-learn estimators return themselves from fit().
    return MultinomialNB().fit(X_train, y_train)

In [None]:
# Train on the TF-IDF training matrix, then score on the held-out test matrix.
nb_model = train_naive_bayes(X_train=X_train_tfidf, y_train=y_train)
evaluate_model(model=nb_model, X_test=X_test_tfidf, y_test=y_test)

Accuracy: 0.8529557172116675
              precision    recall  f1-score   support

           0       0.86      0.83      0.85      6907
           1       0.85      0.87      0.86      7252

    accuracy                           0.85     14159
   macro avg       0.85      0.85      0.85     14159
weighted avg       0.85      0.85      0.85     14159



### Support Vector Machines

In [None]:
def train_svm(X_train, y_train):
    """Fit and return a support-vector classifier with a linear kernel.

    X_train is the feature matrix and y_train the matching labels.

    NOTE(review): scikit-learn's SVC documentation warns that fit time grows at
    least quadratically with the number of samples; if this cell is too slow on
    the full training set, LinearSVC is the documented alternative (it is a
    different estimator, so results may differ slightly) - confirm before
    switching.
    """
    # scikit-learn estimators return themselves from fit().
    return SVC(kernel='linear').fit(X_train, y_train)

In [None]:
# Train on the TF-IDF training matrix, then score on the held-out test matrix.
svm_model = train_svm(X_train=X_train_tfidf, y_train=y_train)
evaluate_model(model=svm_model, X_test=X_test_tfidf, y_test=y_test)

### Random Forest

In [None]:
def train_random_forest(X_train, y_train, n_estimators=100, random_state=42, n_jobs=-1):
    """Fit and return a RandomForestClassifier.

    Parameters
    ----------
    X_train : feature matrix.
    y_train : labels matching the rows of ``X_train``.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int, default 42
        Seed that makes the fitted forest reproducible.
    n_jobs : int or None, default -1
        Number of parallel workers used to build the trees; -1 uses every
        available core. Pass ``None`` or ``1`` to train single-threaded.
        Parallelism changes only wall-clock time: each tree's seed is derived
        from ``random_state`` before the trees are built.

    Returns
    -------
    The fitted RandomForestClassifier.
    """
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=random_state,
        n_jobs=n_jobs,
    )
    model.fit(X_train, y_train)
    return model

In [None]:
# Train on the TF-IDF training matrix, then score on the held-out test matrix.
rf_model = train_random_forest(X_train=X_train_tfidf, y_train=y_train)
evaluate_model(model=rf_model, X_test=X_test_tfidf, y_test=y_test)

KeyboardInterrupt: 

## Deep Learning

### Data prep

In [None]:
# Fixed length of every sequence after padding / truncation.
maxlen = 500

# Build the word index from the training texts only; num_words limits the
# vocabulary that texts_to_sequences will emit.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

# Texts -> integer id sequences -> fixed-length 2-D arrays, for both splits.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(documents) for documents in (X_train, X_test)
)
X_train_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=maxlen) for sequences in (X_train_seq, X_test_seq)
)

### Convolutional NN

In [None]:
def train_cnn(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2, vocab_size=10000):
    """Build, compile and fit a 1-D convolutional binary text classifier.

    Architecture: Embedding(128) -> Conv1D(128 filters, width 5, ReLU) ->
    MaxPooling1D(2) -> Flatten -> Dense(10, ReLU) -> Dense(1, sigmoid).

    Parameters
    ----------
    X_train : 2-D sequence of padded integer token ids, one row per document.
        The row length is used as the embedding's input length, so the function
        no longer depends on a notebook-level ``maxlen`` variable.
    y_train : binary labels matching the rows of ``X_train``.
    epochs : int, default 10
        Number of passes over the training data.
    batch_size : int, default 64
    validation_split : float, default 0.2
        Fraction of the training data Keras holds out for validation.
    vocab_size : int, default 10000
        Embedding ``input_dim``; must be greater than the largest token id in
        ``X_train`` (i.e. match the tokenizer's ``num_words``).

    Returns
    -------
    The fitted Keras ``Sequential`` model.
    """
    # Every row has the same length after padding, so the first row defines it.
    sequence_length = len(X_train[0])

    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=sequence_length))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(
        X_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=validation_split,
    )
    return model

def evaluate_dl_model(model, X_test, y_test):
    """Print accuracy and the classification report for a sigmoid-output model.

    ``model.predict(X_test)`` is expected to return scores that are turned into
    0/1 labels by thresholding strictly above 0.5. Nothing is returned; results
    are printed only.
    """
    scores = model.predict(X_test)
    predicted_labels = (scores > 0.5).astype(int)
    accuracy = accuracy_score(y_test, predicted_labels)
    report = classification_report(y_test, predicted_labels)
    print('Accuracy:', accuracy)
    print(report)

In [None]:
# Train on the padded training sequences, then score on the padded test sequences.
cnn_model = train_cnn(X_train=X_train_pad, y_train=y_train)
evaluate_dl_model(model=cnn_model, X_test=X_test_pad, y_test=y_test)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Accuracy: 0.9334698778162299
              precision    recall  f1-score   support

           0       0.92      0.94      0.93      6907
           1       0.94      0.92      0.93      7252

    accuracy                           0.93     14159
   macro avg       0.93      0.93      0.93     14159
weighted avg       0.93      0.93      0.93     14159



### Recurrent NN

In [None]:
def train_rnn(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2, vocab_size=10000):
    """Build, compile and fit a simple-RNN binary text classifier.

    Architecture: Embedding(128) -> SimpleRNN(128) -> Dense(10, ReLU) ->
    Dense(1, sigmoid).

    Parameters
    ----------
    X_train : 2-D sequence of padded integer token ids, one row per document.
        The row length is used as the embedding's input length, so the function
        no longer depends on a notebook-level ``maxlen`` variable.
    y_train : binary labels matching the rows of ``X_train``.
    epochs : int, default 10
        Number of passes over the training data.
    batch_size : int, default 64
    validation_split : float, default 0.2
        Fraction of the training data Keras holds out for validation.
    vocab_size : int, default 10000
        Embedding ``input_dim``; must be greater than the largest token id in
        ``X_train`` (i.e. match the tokenizer's ``num_words``).

    Returns
    -------
    The fitted Keras ``Sequential`` model.
    """
    # Every row has the same length after padding, so the first row defines it.
    sequence_length = len(X_train[0])

    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=sequence_length))
    model.add(SimpleRNN(128))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(
        X_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=validation_split,
    )
    return model

In [None]:
# Train on the padded training sequences, then score on the padded test sequences.
rnn_model = train_rnn(X_train=X_train_pad, y_train=y_train)
evaluate_dl_model(model=rnn_model, X_test=X_test_pad, y_test=y_test)

Epoch 1/15
 74/708 [==>...........................] - ETA: 3:15 - loss: 0.6210 - accuracy: 0.6590

KeyboardInterrupt: 

### LSTM NN

In [None]:
def train_lstm(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2, vocab_size=10000):
    """Build, compile and fit an LSTM binary text classifier.

    Architecture: Embedding(128) -> LSTM(128) -> Dense(10, ReLU) ->
    Dense(1, sigmoid).

    Parameters
    ----------
    X_train : 2-D sequence of padded integer token ids, one row per document.
        The row length is used as the embedding's input length, so the function
        no longer depends on a notebook-level ``maxlen`` variable.
    y_train : binary labels matching the rows of ``X_train``.
    epochs : int, default 10
        Number of passes over the training data.
    batch_size : int, default 64
    validation_split : float, default 0.2
        Fraction of the training data Keras holds out for validation.
    vocab_size : int, default 10000
        Embedding ``input_dim``; must be greater than the largest token id in
        ``X_train`` (i.e. match the tokenizer's ``num_words``).

    Returns
    -------
    The fitted Keras ``Sequential`` model.
    """
    # Every row has the same length after padding, so the first row defines it.
    sequence_length = len(X_train[0])

    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=sequence_length))
    model.add(LSTM(128))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(
        X_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=validation_split,
    )
    return model

In [None]:
# Train on the padded training sequences, then score on the padded test sequences.
lstm_model = train_lstm(X_train=X_train_pad, y_train=y_train)
evaluate_dl_model(model=lstm_model, X_test=X_test_pad, y_test=y_test)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 0.9200508510488029
              precision    recall  f1-score   support

           0       0.91      0.93      0.92      6907
           1       0.93      0.91      0.92      7252

    accuracy                           0.92     14159
   macro avg       0.92      0.92      0.92     14159
weighted avg       0.92      0.92      0.92     14159

