# Importing libraries

In [None]:
!pip install catboost
!pip install texthero
!pip install scikit-plot
!pip install --upgrade tables
!pip install keras_bert
!pip install livelossplot
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
seed = 0
np.random.seed(0)
import math
import tensorflow as tf
from tensorflow import keras

import catboost as ctb
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

from keras_bert import load_trained_model_from_checkpoint
from transformers import BertTokenizer

import tensorflow.keras.backend as K
from tensorflow import keras
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv2D, MaxPool2D,Input
from tensorflow.keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPool1D, Dropout, BatchNormalization, Bidirectional, Flatten
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.metrics import AUC, Accuracy
import tensorflow.keras.preprocessing.text as kpt 
from tensorflow.keras.utils import to_categorical
from livelossplot import PlotLossesKeras
import scikitplot as skplt
import texthero as hero
import joblib
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, cross_validate, cross_val_predict

from sklearn.metrics import f1_score, auc
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt

from sklearn.metrics import auc, roc_auc_score
from sklearn.dummy import DummyClassifier
from gensim.models import FastText, Word2Vec
from gensim.utils import simple_preprocess, to_utf8
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
%matplotlib inline

# Reading data

In [None]:
# Precomputed feature arrays for titles and article texts (train and test), stored as .npy files.
# File names suggest BERT-large uncased embeddings — TODO confirm how they were produced.
X_train_title = np.load('input/train_X_title_uncased_L-24_H-1024_A-16.npy')
X_train_text = np.load('input/train_X_text_uncased_L-24_H-1024_A-16.npy')

X_test_title = np.load('input/test_X_title_uncased_L-24_H-1024_A-16.npy')
X_test_text = np.load('input/test_X_text_uncased_L-24_H-1024_A-16.npy')

# Raw article frames and the training labels.
# NOTE(review): `path` is not defined in the visible part of the notebook, and the .npy files
# above are read relative to the working directory instead of `path` — confirm both resolve
# to the same input folder.
test_fake = pd.read_hdf(path + '/input/test_fake.h5')
train_fake = pd.read_hdf(path + '/input/train_fake.h5')
y = pd.read_csv(path + '/input/y_train.csv')

In [None]:
# Replace every missing value in both frames with the placeholder string 'empty'.
train_fake = train_fake.fillna('empty')
test_fake = test_fake.fillna('empty')

# EDA

## Training

In [None]:
# Show at most 50 columns and 50 rows when a frame is rendered.
pd.options.display.max_columns = 50
pd.options.display.max_rows = 50

In [None]:
def isnan(value):
    """Return True when `value` converts to a float NaN, False otherwise.

    Values that cannot be converted to a float at all (ordinary text, None,
    integers too large for a float, ...) are reported as "not NaN".
    """
    try:
        return math.isnan(float(value))
    except (TypeError, ValueError, OverflowError):
        # Only failed float conversions mean "not NaN"; a bare `except:` would also
        # swallow KeyboardInterrupt / SystemExit, which must propagate.
        return False

In [None]:
def fun(x):
    """Return the length of `x`, or 0 when `x` is a missing (NaN) value.

    Strings are always measured directly: text such as "nan" or "NaN" parses as a
    float NaN, so routing it through `isnan` would wrongly report length 0.
    """
    if isinstance(x, str):
        return len(x)
    if isnan(x):
        return 0
    return len(x)

In [None]:
# Length features (via `fun`) for the article text and the title of both splits.
for frame in (train_fake, test_fake):
    frame['len_text'] = frame['text'].map(fun)
    frame['len_title'] = frame['title'].map(fun)

In [None]:
# NOTE(review): this cell recomputes the same four length columns as the preceding cell;
# it is redundant and can be dropped.
train_fake['len_text'] = train_fake['text'].map(lambda x: fun(x))
train_fake['len_title'] = train_fake['title'].map(lambda x: fun(x))

test_fake['len_text'] = test_fake['text'].map(lambda x: fun(x))
test_fake['len_title'] = test_fake['title'].map(lambda x: fun(x))

In [None]:
# NOTE(review): neither `counter` nor a `title_v2` column is defined anywhere in the visible
# notebook — confirm where they come from; on a fresh kernel this cell would fail.
counter(train_fake, 'title_v2', 40)

In [None]:
# Frequency table of the tokens in all test titles, reduced to the 40 most frequent ones.
# NOTE(review): `token_space` and `nltk` are not defined/imported in the visible notebook — TODO confirm.
all_words = ' '.join([text for text in test_fake['title_v2']])
token_phrase = token_space.tokenize(all_words)
frequency = nltk.FreqDist(token_phrase)
df_frequency = pd.DataFrame({"Word": list(frequency.keys()),
                                "Frequency": list(frequency.values())})
df_frequency = df_frequency.nlargest(columns = "Frequency", n = 40)

In [None]:
# Tokens that get_df_topn() later turns into binary "title contains this token" features.
# Matching there is case-sensitive, hence both 'killed' and 'Killed'.
# Presumably copied from the frequency tables above — TODO confirm the selection.
words = ['COVID-19', 'health-care', 'Syria', 'Aleppo', 'Syrian',
       'elections', 'taxes', 'education', 'killed', 'The', 'immigration',
       'Trump', 'kills', '-', 'Syrias', 'A', 'candidates-biography',
       'civilians', 'kill', 'Damascus', 'Killed', 'economy', 'guns',
       'Monitor', 'attack', 'federal-budget', 'Is', 'President', 'Obama',
       'To', 'New', 'economy,jobs', 'How', 'jobs', 'Civilians',
       'Terrorist', 'Russian', 'coronavirus', 'Can', 'In']

# BERT model

In [None]:
# NOTE(review): missing values were already replaced with 'empty' right after loading, so
# unless NaNs were reintroduced in between, this second fill changes nothing — confirm and drop.
train_fake = train_fake.fillna('blank')
test_fake = test_fake.fillna('blank')

In [None]:
def init_tokenizer_and_load_bert(model_name='uncased_L-2_H-128_A-2', do_lower_case=True, model_trainable=False):
    """Build a BertTokenizer and load a keras_bert model from one checkpoint directory.

    The directory `path + '/' + model_name` must contain `bert_config.json`,
    `bert_model.ckpt` and `vocab.txt` (`path` is a notebook-level variable).

    Parameters
    ----------
    model_name : str
        Name of the checkpoint directory below `path`.
    do_lower_case : bool
        Forwarded to BertTokenizer; previously this argument was accepted but ignored.
    model_trainable : bool
        Forwarded as `training=` to load_trained_model_from_checkpoint.
        NOTE(review): in keras_bert `training` presumably also selects whether the
        pre-training heads are kept — confirm this is the intended switch.

    Returns
    -------
    (tokenizer, model)
    """
    model_dir = path + '/{}'.format(model_name)

    config_path = model_dir + '/bert_config.json'
    checkpoint_path = model_dir + '/bert_model.ckpt'
    vocab_path = model_dir + '/vocab.txt'

    print("loading: {}".format(model_name))

    # Honour the caller's casing choice instead of silently using the tokenizer default.
    tokenizer = BertTokenizer(vocab_path, do_lower_case=do_lower_case)
    print("vocab size: {}".format(len(tokenizer.vocab)))

    model = load_trained_model_from_checkpoint(config_path, checkpoint_path, training=model_trainable)
    print("loaded: {}".format(model_name))

    return tokenizer, model

In [None]:
# Load tokenizer and model from the 'uncased_L-8_H-512_A-8' checkpoint directory
# (overrides the function's default model name).
tokenizer, bert_model = init_tokenizer_and_load_bert(model_name='uncased_L-8_H-512_A-8')

In [None]:
# Encode each string to exactly 512 positions: shorter inputs are padded, longer ones truncated.
tokenize = lambda sent: tokenizer.encode_plus(sent, max_length=512, padding='max_length', truncation=True)
%time train_fake['tokens_text'] = train_fake['text'].map(tokenize)
%time train_fake['tokens_title'] = train_fake['title'].map(tokenize)

In [None]:
train_fake.sample(5)

In [None]:
# Split the encoded article text into one column per encoder field.
train_fake['input_ids'] = train_fake['tokens_text'].map(lambda t: t['input_ids'] )
train_fake['token_type_ids'] = train_fake['tokens_text'].map(lambda t: t['token_type_ids'] )
train_fake['attention_mask'] = train_fake['tokens_text'].map(lambda t: t['attention_mask'] )

In [None]:
# Same split for the encoded titles.
train_fake['input_ids_title'] = train_fake['tokens_title'].map(lambda t: t['input_ids'] )
train_fake['token_type_ids_title'] = train_fake['tokens_title'].map(lambda t: t['token_type_ids'] )
train_fake['attention_mask_title'] = train_fake['tokens_title'].map(lambda t: t['attention_mask'] )

In [None]:
# Stack the per-row lists into 2-D arrays (one row per article) for the text inputs.
input_ids = np.stack(train_fake['input_ids'])
token_type_ids = np.stack(train_fake['token_type_ids'])
attention_mask = np.stack(train_fake['attention_mask'])

input_ids.shape, token_type_ids.shape, attention_mask.shape

In [None]:
# Same stacking for the title inputs.
input_ids_title = np.stack(train_fake['input_ids_title'])
token_type_ids_title = np.stack(train_fake['token_type_ids_title'])
attention_mask_title = np.stack(train_fake['attention_mask_title'])

input_ids_title.shape, token_type_ids_title.shape, attention_mask_title.shape

In [None]:
# Run the loaded model over the encoded article texts.
# NOTE(review): keras_bert models loaded with training=False presumably take only two inputs
# (token ids, segment ids); passing the attention mask as a third input may fail — verify.
%time predicts = bert_model.predict([input_ids, token_type_ids, attention_mask], verbose=1)

In [None]:
# Same for the titles. NOTE(review): `predicts_title` is not used again in the visible notebook.
%time predicts_title = bert_model.predict([input_ids_title, token_type_ids_title, attention_mask_title], verbose=1)

In [None]:
predicts.shape

In [None]:
# Keep only position 0 of the second axis for every row
# (presumably the [CLS] token vector — TODO confirm).
# NOTE(review): `X` is not used again in the visible notebook.
X = predicts[:, 0 , :]
X.shape

In [None]:
y.shape

In [None]:
# NOTE(review): X_train is the whole train_fake frame (raw text, token lists, length columns),
# not the model output `X` computed above — confirm this is intended.
X_train = train_fake

In [None]:
X_train.shape

In [None]:
# Keep every column except the first one.
# NOTE(review): a later cell reads train_fake['is_fake'], so the label column appears to be
# among these features (target leakage) — verify before trusting the scores below.
feats = X_train.columns[1:]
X_train = X_train[feats]

In [None]:
# Reduce the label frame to its `is_fake` column. Guarded so that re-running the cell
# (when `y` is already a Series) is a no-op instead of a KeyError.
if isinstance(y, pd.DataFrame):
    y = y['is_fake']

In [None]:
# 3-fold cross-validated ROC AUC for three shallow gradient-boosting baselines on X_train.
# NOTE(review): the three cells differ only in the estimator — a small helper taking the model
# would remove the copy-paste.

# LightGBM
model = lgb.LGBMClassifier(max_depth=3, n_estimators=30, random_state=0)
%time scores = cross_val_score(model, X_train, y, cv=3, scoring='roc_auc')

np.mean(scores), np.std(scores)

In [None]:
# XGBoost
model = xgb.XGBClassifier(max_depth=3, n_estimators=50, random_state=0)
%time scores = cross_val_score(model, X_train, y, cv=3, scoring='roc_auc')

np.mean(scores), np.std(scores)

In [None]:
# CatBoost
model = ctb.CatBoostClassifier(max_depth=3, n_estimators=50, verbose=0, random_state=0)
%time scores = cross_val_score(model, X_train, y, cv=3, scoring='roc_auc')

np.mean(scores), np.std(scores)

# Testing

In [None]:
X_train_title

In [None]:
# NOTE(review): `asd` is not used anywhere else in the visible notebook.
asd = train_fake['len_text'].values

In [None]:
X_train.shape

In [None]:
# From here on X_train / X_test are the precomputed title and text arrays joined column-wise
# (this overwrites the DataFrame-based X_train used above).
X_train = np.concatenate((X_train_title, X_train_text), axis = 1)
X_test = np.concatenate((X_test_title, X_test_text), axis = 1)

In [None]:
X_train_df = pd.DataFrame(X_train)

In [None]:
# Append the two length features.
# NOTE(review): column assignment aligns on index labels; X_train_df has the default 0..n-1
# index, so this presumably relies on train_fake having the same index — verify, otherwise
# the new columns contain NaN.
X_train_df['text_len'] = train_fake['len_text']
X_train_df['title_len'] = train_fake['len_title']

In [None]:
# `y` has normally been reduced to the `is_fake` Series by an earlier cell; indexing that
# Series with 'is_fake' again raises KeyError. Only reduce when `y` is still a DataFrame.
if isinstance(y, pd.DataFrame):
    y = y['is_fake']

In [None]:
# LightGBM on the embedding frame including the two length columns (X_train_df).
model = lgb.LGBMClassifier(max_depth = 3, n_estimators=20, random_state=0)
%time scores = cross_val_score(model, X_train_df, y, cv=3, scoring='roc_auc')

np.mean(scores), np.std(scores)

In [None]:
# XGBoost, scored on the plain embedding array.
# NOTE(review): this cell uses X_train (no length columns) while the cells around it use
# X_train_df — confirm which feature set was intended.
model = xgb.XGBClassifier(max_depth=3, n_estimators=40, random_state=0)
%time scores = cross_val_score(model, X_train, y, cv=3, scoring='roc_auc')

np.mean(scores), np.std(scores)

In [None]:
# Learning curve for whichever estimator `model` currently refers to (the XGBoost one above).
skplt.estimators.plot_learning_curve(model, X_train_df, y, figsize = (7, 4), cv = 3, scoring = 'roc_auc')

In [None]:
# All test_fake columns except the first one.
# NOTE(review): these are raw frame columns, not the embedding features the model below is fitted on.
feats = test_fake.columns[1:]
test_fake_1 = test_fake[feats]

In [None]:
test_fake_1

In [None]:
# Cross-validate LightGBM on the embedding array ...
model = lgb.LGBMClassifier(max_depth=3, n_estimators=40, random_state=0)
%time scores = cross_val_score(model, X_train, y, cv=3, scoring='roc_auc')

np.mean(scores), np.std(scores)

In [None]:
# ... then fit it on the full training set (cross_val_score works on clones and leaves `model` unfitted).
model.fit(X_train, y)

In [None]:
# Predict on the concatenated test embeddings: the model was fitted on the matching
# training embeddings (X_train), not on the raw columns of test_fake.
y_pred = model.predict(X_test)

In [None]:
# Attach the predicted labels to the test frame.
test_fake['is_fake'] = y_pred

In [None]:
# Write the submission file (id + predicted label).
# NOTE(review): a later cell writes to the same 'bert.csv' path and overwrites this file.
test_fake[ ['id', 'is_fake'] ].to_csv('bert.csv', index=False)

In [None]:
# Class balance of the predictions.
np.unique(y_pred, return_counts=True)

In [None]:
test_fake

# Neural Network

In [None]:
def g(x, word):
    """Return 1 when `word` is contained in `x` (as decided by `in`), else 0."""
    return 1 if word in x else 0

In [None]:
# NOTE(review): the mapped Series is never assigned or collected, so this loop has no effect.
# get_df_topn() further down builds the per-token indicator columns that appear to be intended.
for word in words:
    train_fake['title'].map(lambda x: g(x,word))

In [None]:
# Rebuild the title+text embedding arrays (same concatenation as in the "Testing" section).
X_train = np.concatenate((X_train_title, X_train_text), axis = 1)

In [None]:
X_test = np.concatenate((X_test_title, X_test_text), axis = 1)

In [None]:
X_train.shape

In [None]:
# Fully connected classifier on the 2048-dimensional input: four
# Dense -> BatchNormalization -> Dropout stages followed by a 2-unit softmax output.
model = Sequential()

model.add(Dense(1400, input_dim = 2048, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.8))

model.add(Dense(600, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.7))

model.add(Dense(200, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.7))

model.add(Dense(40, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.6))

model.add(Dense(2, activation='softmax'))

In [None]:
# Compile with Adam and track AUC.
# NOTE(review): the output layer is a 2-unit softmax fed with one-hot targets;
# 'categorical_crossentropy' is the conventional loss for that setup — confirm the choice.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['AUC'])
model.summary()

In [None]:
# Training callbacks. All three now watch the validation loss: fit() is given validation
# data and ModelCheckpoint(save_best_only=True) already defaults to 'val_loss', so stopping
# and LR scheduling on the *training* loss was inconsistent and cannot detect overfitting.
callbacks1 = [
    EarlyStopping(monitor='val_loss', patience=7),       # stop after 7 epochs without improvement
    ReduceLROnPlateau(monitor='val_loss', patience=3),   # lower the learning rate after 3 stale epochs
    ModelCheckpoint('../model.best.hdf5', save_best_only=True)  # saving the best model
]

In [None]:
# Fixed hold-out split: first 4000 rows for training, the rest for validation; labels one-hot encoded.
# NOTE(review): the split is positional (no shuffling or stratification) and 4000 is hard-coded —
# confirm the row order is random, or use train_test_split.
X_1 = X_train[:4000]
X_2 = X_train[4000:]
y_1 = to_categorical(y[:4000])
y_2 = to_categorical(y[4000:])

In [None]:
X_1.shape, y_1.shape

In [None]:
# NOTE(review): `y_train` is not used anywhere else in the visible notebook.
y_train = to_categorical(y)

In [None]:
X_2.shape, y_2.shape

In [None]:
# Train on the first split, validate on the second; callbacks handle early stopping,
# learning-rate reduction and checkpointing.
learning_history = model.fit(X_1, y_1,
          batch_size = 128, epochs = 100, verbose = 1,
          callbacks = callbacks1,
          validation_data = (X_2, y_2) )

In [None]:
# Replace the in-memory model with the checkpoint written by ModelCheckpoint.
model = load_model('../model.best.hdf5')

In [None]:
def draw_learning_curve(history, keys=('auc', 'loss')):
    """Plot training and validation curves for each metric in `keys`, side by side.

    Parameters
    ----------
    history :
        Object returned by `model.fit`; its `.epoch` list and `.history` dict are read.
    keys : sequence of str
        Metric names; `history.history` must contain both `key` and `'val_' + key`.
    """
    keys = list(keys)
    plt.figure(figsize=(20, 8))
    for i, key in enumerate(keys):
        # One panel per metric — the grid width follows the number of keys
        # (it used to be hard-coded to 2, which broke for three or more metrics).
        plt.subplot(1, len(keys), i + 1)
        sns.lineplot(x = history.epoch, y = history.history[key])
        sns.lineplot(x = history.epoch, y = history.history['val_' + key])
        plt.title('Learning Curve')
        plt.ylabel(key.title())
        plt.xlabel('Epoch')
        plt.legend(['train', 'test'], loc='best')
    plt.show()

In [None]:
draw_learning_curve(learning_history)

In [None]:
# Loss and AUC of the reloaded checkpoint on the validation split.
model.evaluate(X_2, y_2)

In [None]:
# Class probabilities for every training row (includes the rows the network was trained on).
y_pred = model.predict(X_train)

In [None]:
# Convert the two-column probabilities into class indices.
y_pred = np.argmax(y_pred, axis = 1)

In [None]:
# Keep the network's train-set class predictions under their own name.
# `df` is first created further down by get_df_topn(...), so writing into it here raised
# NameError on a fresh kernel — and that later assignment replaces `df` anyway.
y_pred_train = y_pred

In [None]:
def isnan(value):
    """Return True when `value` converts to a float NaN, False otherwise.

    NOTE(review): this helper is already defined near the top of the notebook;
    the repeated definition can be removed.
    """
    try:
        return math.isnan(float(value))
    except (TypeError, ValueError, OverflowError):
        # Only failed float conversions mean "not NaN"; a bare `except:` would also
        # swallow KeyboardInterrupt / SystemExit, which must propagate.
        return False

In [None]:
def isnan(value):
    """Return True when `value` converts to a float NaN, False otherwise.

    NOTE(review): another definition of this helper exists earlier in the notebook;
    the repeated definition can be removed.
    """
    try:
        return math.isnan(float(value))
    except (TypeError, ValueError, OverflowError):
        # Only failed float conversions mean "not NaN"; a bare `except:` would also
        # swallow KeyboardInterrupt / SystemExit, which must propagate.
        return False

In [None]:
# NOTE(review): the same four length columns were already computed in the EDA section;
# this cell only recomputes them.
train_fake['len_text'] = train_fake['text'].map(lambda x: fun(x))
train_fake['len_title'] = train_fake['title'].map(lambda x: fun(x))

test_fake['len_text'] = test_fake['text'].map(lambda x: fun(x))
test_fake['len_title'] = test_fake['title'].map(lambda x: fun(x))

In [None]:
train_fake.sample(50)

In [None]:
# NOTE(review): displays the whole frame; a .head() or .sample() keeps the output small.
train_fake

In [None]:
def get_df_topn(text, top_tokens=None):
    """Build a 0/1 indicator frame: one row per string in `text`, one column per token.

    Each string is split on single spaces; a cell is 1 when the token is one of the
    resulting pieces (exact, case-sensitive match) and 0 otherwise.

    Parameters
    ----------
    text : pandas.Series of str
        Strings to scan (e.g. titles). The result keeps this Series' index.
    top_tokens : sequence of str, optional
        Tokens to look for. Defaults to the notebook-level `words` list.

    Returns
    -------
    pandas.DataFrame
        Indicator frame with `top_tokens` as columns. Built in one step from a list of
        rows, which is faster than `.apply(pd.Series)` and also works for empty input.
    """
    if top_tokens is None:
        top_tokens = words
    top_tokens = list(top_tokens)

    token_sets = text.str.split(" ").map(set)
    rows = [
        [int(token in sent_tokens) for token in top_tokens]
        for sent_tokens in token_sets
    ]

    return pd.DataFrame(rows, index=text.index, columns=top_tokens)

In [None]:
# Keyword-indicator features for the training titles plus the network's predicted class.
# The test-time frame gets the same `y_pred` column before model_v1.predict(), so the frame
# model_v1 is fitted on must carry it too; otherwise the feature counts differ.
# NOTE(review): these are in-sample predictions for the rows the network was trained on —
# out-of-fold predictions would be the cleaner stacking input.
df = get_df_topn(train_fake['title'])
df['y_pred'] = np.argmax(model.predict(X_train), axis=1)

In [None]:
df

In [None]:
# Second-stage LightGBM on the title indicator frame, scored with 3-fold ROC AUC.
# Labels are read from train_fake['is_fake'] here rather than from `y`.
y_v2 = train_fake['is_fake']
model_v1 = lgb.LGBMClassifier(max_depth = 3, n_estimators=20, random_state=0)
%time scores = cross_val_score(model_v1, df, y_v2, cv=3, scoring='roc_auc')

np.mean(scores), np.std(scores)

In [None]:
skplt.estimators.plot_learning_curve(model_v1, df, y_v2, figsize = (7, 4), cv = 3, scoring = 'roc_auc')

In [None]:
# Fit on all training rows for the final prediction.
model_v1.fit(df, y_v2)

# Predictions for test_fake

In [None]:
# Network class probabilities for the test embeddings ...
y_pred = model.predict(X_test)

In [None]:
# ... reduced to class indices.
y_pred = np.argmax(y_pred, axis = 1)

In [None]:
# Test-time feature frame: title indicators plus the network's predicted class.
# `df` is reused here, so the training frame of the same name is gone after this cell.
df = get_df_topn(test_fake['title'])
df['y_pred'] = y_pred

In [None]:
df

In [None]:
# Final labels come from the second-stage model.
y_test = model_v1.predict(df)
test_fake['is_fake'] = y_test

In [None]:
# Overwrites the 'bert.csv' written earlier in the notebook.
test_fake[ ['id', 'is_fake'] ].to_csv('bert.csv', index=False)

In [None]:
# NOTE(review): this counts the network's predictions (`y_pred`), not the submitted `y_test` — confirm.
np.unique(y_pred, return_counts=True)

In [None]:
test_fake[ ['id', 'is_fake'] ]

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Predicted classes of the network on the validation split.
y_pred_X_2 = model.predict(X_2)
y_pred_X_2 = np.argmax(y_pred_X_2, axis = 1)

In [None]:
# Despite the name, `y_2_pred` holds the TRUE validation classes (argmax of the one-hot y_2).
y_2_pred = np.argmax(y_2, axis = 1)

In [None]:
y_2_pred

In [None]:
# confusion_matrix expects (y_true, y_pred): `y_2_pred` holds the true validation classes
# (argmax of the one-hot y_2) and `y_pred_X_2` the network's predictions. With the arguments
# swapped the matrix was transposed (rows = predicted instead of true).
confusion_matrix(y_2_pred, y_pred_X_2)