# Fake News Detection: A social media approach

## Package set up

In [1]:
# Mount Google Drive so that the project folder under /content/drive is reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Run configuration: every switch below is read by a later cell.

# Selects which set of Google Drive paths is used ("jeff", or anything else for the default paths).
user = "pablo" #"jeff"

# set up: install the extra pip packages when True
pip_install_lib = True

# Preprocessing
preprocess = False # True: drop stop words, network names and words of <= 3 characters, then lemmatize
delete_networks = False # Only consulted when preprocess is False: strip the news-network names from the text

# Word2Vec embedding
load_word2vec_model = True # True: load the saved model from models/; False: train it and save it
word2vec_type = 'skipgram' # skipgram or cbow
load_tokenised_inputs = True # True: load the padded token sequences from data/*.pickle; False: rebuild and save them

# Transformers
load_roberta = True # True: load the cached tensors from data/transformers/; False: tokenize and cache them
load_distilbert = True # same switch for the DistilBERT tensors

# Doc2Vec embedding
# (no switch is defined here: the doc2vec cell retrains the model on every run)

In [None]:
# set paths
import os

# Root of the project on Google Drive; it depends on whose Drive is mounted.
if user == "jeff":
  path = r"/content/drive/MyDrive/Colab Notebooks/cse7643 group project/Final Project"
else:
  path = r"/content/drive/MyDrive/Gatech/Deep Learning/Final Project"

# The sub-folders are derived from the root so that every user gets the same set
# of variables (data_path used to be defined for "jeff" only).
module_path = path + "/utils/"
model_path = path + "/models/"
data_path = path + "/data/"

# Later cells rely on paths relative to the project root ("models/...", "data/...").
os.chdir(path)

In [None]:
%%capture
# Install the extra dependencies only when requested in the configuration cell.
if pip_install_lib:
  # NOTE(review): only fsspec carries a version bound; transformers is unpinned,
  # so the installed release may differ between runs.
  ! pip install 'fsspec>=0.3.3'
  ! pip install transformers

In [None]:
%%capture
import pandas as pd
from gensim.models import Word2Vec
from gensim.utils import tokenize
from gensim.matutils import cossim
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from tqdm.notebook import tqdm
from joblib import Parallel, delayed
import sys
from sklearn.feature_extraction.text import CountVectorizer
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
import pickle
from sklearn.metrics import accuracy_score
import nltk
import re
from nltk.corpus import stopwords
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification
nltk.download('stopwords')
nltk.download('wordnet')

from IPython.core.display import HTML # Let's center our outputs
HTML("""
<style>
.output_png {
    display: table-cell;
    text-align: center;
    vertical-align: middle;
}
.dataframe {
    margin-left: auto !important;
    margin-right: auto !important;
}

</style>
""")

# Make the project's utility and model folders importable, adding each one at most once.
for _extra_path in (module_path, model_path):
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)

import general
from loader import Loader
from models.BILSTM import BILSTM #BILSTM(input_dim, batch_size)
from models.MLP import MLP #MLP(input_dim)
from models.MLP2 import MLP2 #MLP2(input_dim)
from models.BILSTM2 import BILSTM2 #BILSTM2(input_dim, batch_size)
from models.EMB_BILSTM import EMB_BILSTM #EMB_BILSTM(batch_size, weights)
from models.EMB_CNN import EMB_CNN #EMB_CNN(weights)
from models.DistilBert import DistilBert
from models.Roberta import roberta

%matplotlib inline

In [None]:
%load_ext autoreload
%autoreload 2

## Exploratory Data Analysis

In [None]:
# Step 1: We first read the data, concatenate our fake and true news and tag them as such.
# Additionally, title and text (content) are joined in order to make use of all this information.
# Once we have generated the required dataframe, we display its first 5 rows to inspect the result.
df = general.read_data()
df.head()

When analyzing the data set, one of our first questions is whether our classification problem suffers from class imbalance. To answer it, we plot the proportion of fake and true news and conclude that we should not expect any imbalance-related problem during the project.

In [None]:
# The data is rather balanced, so we should not expect any imbalance-related problem.
# value_counts() orders the classes by frequency, so the wedges are named from the
# label values themselves instead of relying on a hard-coded legend order.
# Label 1 is treated as fake: the source table further down reads the mean of
# `label` as the proportion of fake news.
label_counts = df.label.value_counts().rename(index={0: 'Real News', 1: 'Fake News'})
label_counts.plot.pie(autopct="%.1f%%", ylabel = '', title = "Proportion of fake news",
                      textprops={'color':"w"})
# Without arguments the legend reuses the wedge names, so it always matches the wedges.
plt.legend()
plt.show()

Another important question is how our data is composed in terms of subjects. We notice that there is no clear differentiation between the subject classes of true and fake news. Thus, we conclude that it is better not to include this information as an independent variable.

In [None]:
# Number of articles per label, with one bar per subject.
count = sns.catplot(data=df, x="label", hue="subject", kind="count")

In [None]:
# A considerable number of articles share exactly the same content.
# The duplicate mask is computed once and counted with the vectorised Series.sum()
# rather than Python's built-in sum(), which iterates element by element.
is_duplicate = df.duplicated('text')
print(f"{is_duplicate.sum()} observations were duplicates")
# Repeated texts add no information, so only the first occurrence of each is kept.
df = df.drop_duplicates(subset='text', keep='first')

In [None]:
# The following plot shows the amount of fake and real news per day.
general.plot_count(df)

In [None]:
# While reading some of the news we noticed that several articles mention their source.
# On checking, the mere presence of the word Reuters could predict approximately 99%
# of the cases, which is of no use for a fake news detector that should eventually
# generalize to other datasets.
# https://today.yougov.com/ratings/media/popularity/news-websites/all
proportions = []
networks = ['cnn', 'reuters', 'times', 'bloomberg','bbc','forbes','abc','fox',
            'buzzfeed','cbs','huffington','msn','nbc','cnbc','yahoo','google']

# For each network: mean label (as a percentage) of the articles whose text contains its name.
source_dict = {
    name: 100*round(df.loc[df["text"].str.contains(name),"label"].mean(),2)
    for name in networks
}

table = pd.DataFrame(list(source_dict.items()),
                     columns=['source','proportion of fake news (%)'])
table

## Preprocessing

In [None]:
# As noted above, neither the date nor the subject generalizes to other datasets,
# so both columns are removed before modelling.
df = df.drop(columns=['subject', 'date'])

In [None]:
# apply preprocessing if specified
if preprocess:

  # Stop words plus the news-network names, kept in a set so that every
  # membership test is a hash lookup instead of a scan over a numpy array.
  stop = set(stopwords.words('english')) | set(networks)

  # One lemmatizer shared by all words instead of a new instance per word.
  lemmatizer = nltk.WordNetLemmatizer()

  def del_sw(word, stop):
      """Return "" for stop words and words of three characters or fewer, else the word."""
      return "" if word in stop or len(word) <= 3 else word

  def lem(word):
    """Lemmatize every element of the iterable `word` and return the results as a list."""
    return [lemmatizer.lemmatize(i) for i in word]

  def _clean(text):
    """Split on non-word characters, drop filtered words, lemmatize and re-join with spaces."""
    kept = [tok for tok in re.split(r'\W+', text) if del_sw(tok, stop)]
    return " ".join(lem(kept))

  df.text = df.text.apply(_clean)

elif delete_networks:
  network_names = set(networks)

  def del_net(word, stop):
    """Return "" when `word` is one of the names in `stop`, else the word."""
    return "" if word in stop else word

  # Removed words are dropped outright: joining their "" placeholders would leave
  # double spaces, which later become empty tokens under x.split(" ").
  df.text = df.text.apply(
      lambda x: " ".join(w for w in x.split() if del_net(w, network_names)))


In [None]:
# Split into train / validation / test: 10% is held out for testing, then 1/9 of
# the remaining 90% (i.e. 10% of the total) for validation. Both splits are stratified.
X, y = df["text"], df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, shuffle=True, random_state=42
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=1/9, stratify=y_train, shuffle=True, random_state=42
)

# Keep the labels as plain arrays
y_train, y_valid, y_test = (labels.values for labels in (y_train, y_valid, y_test))

## Create embeddings for news articles using Word2Vec

### Train word2vec model

In [None]:
# The two variants share every hyper-parameter except `sg`; only the file name differs.
use_skipgram = word2vec_type == 'skipgram'
word2vec_file = "models/word2vec.model" if use_skipgram else "models/word2vec_cbow.model"

if load_word2vec_model:
  # reuse the model saved by a previous run
  word2vec_model = Word2Vec.load(word2vec_file)
else:
  # generate list of tokenized articles
  sent = X_train.apply(lambda x: x.split(" "))
  # build word2vec model and save it for later runs
  word2vec_model = Word2Vec(
      sent,
      min_count=1, # 100
      size=100,
      workers=8,
      window=3,
      sg=1 if use_skipgram else 0, # 1 = skip-gram, 0 = CBOW
  )
  word2vec_model.save(word2vec_file)

In [None]:
# Vocabulary in the model's own index order, plus lookups in both directions.
words = word2vec_model.wv.index2word
idx2word = dict(enumerate(words))
word2idx = {token: position for position, token in idx2word.items()}

### Tokenize inputs

In [None]:
# tokenize inputs
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# NOTE(review): the ids produced here come from the Keras Tokenizer's own vocabulary,
# while the embedding weights built further down are taken from the word2vec model
# (word2idx is never used) - confirm that the two indexings are meant to line up.

def _dump_pickle(obj, file_path):
  """Pickle `obj` to `file_path`, closing the file handle afterwards."""
  with open(file_path, "wb") as handle:
    pickle.dump(obj, handle)

def _load_pickle(file_path):
  """Return the object pickled at `file_path`, closing the file handle afterwards.

  Only meant for files written by this notebook: unpickling untrusted data can
  execute arbitrary code.
  """
  with open(file_path, "rb") as handle:
    return pickle.load(handle)

if load_tokenised_inputs == False:

  # The vocabulary is fitted on the training texts only
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(X_train)

  X_train = tokenizer.texts_to_sequences(X_train)
  X_valid = tokenizer.texts_to_sequences(X_valid)
  X_test = tokenizer.texts_to_sequences(X_test)

  maxlen = 700

  # Making all news of size maxlen defined above
  X_train = pad_sequences(X_train, maxlen=maxlen)
  X_valid = pad_sequences(X_valid, maxlen=maxlen)
  X_test = pad_sequences(X_test, maxlen=maxlen)

  # Cache the sequences and labels so that later runs can skip the tokenisation
  _dump_pickle(X_train, "data/X_train.pickle")
  _dump_pickle(X_valid, "data/X_valid.pickle")
  _dump_pickle(X_test, "data/X_test.pickle")
  _dump_pickle(y_train, "data/y_train.pickle")
  _dump_pickle(y_valid, "data/y_valid.pickle")
  _dump_pickle(y_test, "data/y_test.pickle")

else:

  # load the cached padded token sequences and labels
  X_train = _load_pickle("data/X_train.pickle")
  X_valid = _load_pickle("data/X_valid.pickle")
  X_test = _load_pickle("data/X_test.pickle")
  y_train = _load_pickle("data/y_train.pickle")
  y_valid = _load_pickle("data/y_valid.pickle")
  y_test = _load_pickle("data/y_test.pickle")

### Create embedding

In [None]:
# Copy the trained word2vec vectors into a float tensor and wrap them in an embedding layer.
# nn.Embedding.from_pretrained keeps the weights frozen unless freeze=False is passed.
# NOTE(review): the rows follow the word2vec vocabulary, whereas the token ids fed to the
# models come from the Keras Tokenizer - confirm that both indexings agree.
weights = torch.FloatTensor(word2vec_model.wv.vectors)
embedding = nn.Embedding.from_pretrained(weights)

## Networks with Embeddings


### CNN

In [None]:
# Token-id matrices become integer tensors, labels become tensors as they are.
X_train_tensor, X_valid_tensor, X_test_tensor = (
    torch.tensor(split).to(int) for split in (X_train, X_valid, X_test)
)
y_train_tensor, y_valid_tensor, y_test_tensor = (
    torch.tensor(split) for split in (y_train, y_valid, y_test)
)

# Pair inputs and labels, one dataset per split
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
valid_dataset = torch.utils.data.TensorDataset(X_valid_tensor, y_valid_tensor)
test_dataset = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor)

In [None]:
# Hyper-parameters
batch_size = 128
input_dim = int(X_train_tensor.shape[1])  # number of columns of the input tensor

# Train the CNN that takes the word2vec weights
mloader = Loader(train_dataset, valid_dataset, batch_size)
cnn_model = EMB_CNN(weights)
mloader.train(model=cnn_model, verbose=True)

In [None]:
# Evaluate the trained CNN on the held-out test split (what is reported depends on Loader.eval - TODO confirm).
mloader.eval( test_dataset )

### LSTM model

In [None]:
# Same conversion as for the CNN: integer tensors for the token ids, tensors for the labels.
X_train_tensor, X_valid_tensor, X_test_tensor = (
    torch.tensor(split).to(int) for split in (X_train, X_valid, X_test)
)
y_train_tensor, y_valid_tensor, y_test_tensor = (
    torch.tensor(split) for split in (y_train, y_valid, y_test)
)

# Pair inputs and labels, one dataset per split
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
valid_dataset = torch.utils.data.TensorDataset(X_valid_tensor, y_valid_tensor)
test_dataset = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor)

In [None]:
# Hyper-parameters
batch_size = 128
input_dim = int(X_train_tensor.shape[1])  # number of columns of the input tensor

# Train the bidirectional LSTM that takes the word2vec weights
mloader = Loader(train_dataset, valid_dataset, batch_size)
bilstm_model = EMB_BILSTM(batch_size, weights)
mloader.train(model=bilstm_model, verbose=True)

In [None]:
# Evaluate the trained LSTM on the held-out test split (what is reported depends on Loader.eval - TODO confirm).
mloader.eval( test_dataset )

In [None]:
# class Model(nn.Module):
#     def __init__(self, embedding, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
#         super(Model, self).__init__()
#         self.output_size = output_size
#         self.n_layers = n_layers
#         self.hidden_dim = hidden_dim
        
#         self.embedding = embedding
#         self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
#         self.dropout = nn.Dropout(drop_prob)
#         self.fc = nn.Linear(hidden_dim, output_size)
#         self.sigmoid = nn.Sigmoid()
        
#     def forward(self, x, hidden):
#         batch_size = x.size(0)
#         x = x.long()
#         embeds = self.embedding(x)
#         lstm_out, hidden = self.lstm(embeds, hidden)
#         lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
#         out = self.dropout(lstm_out)
#         out = self.fc(out)
#         out = self.sigmoid(out)
        
#         out = out.view(batch_size, -1)
#         out = out[:,-1]
#         return out, hidden
    
#     def init_hidden(self, batch_size):
#         weight = next(self.parameters()).data
#         hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device),
#                       weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device))
#         return hidden

In [None]:
# vocab_size = len(word2idx) + 1
# output_size = 1
# embedding_dim = 100
# hidden_dim = 512
# n_layers = 2
# model = Model(embedding, vocab_size, output_size, embedding_dim, hidden_dim, n_layers).to(device)


# optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
# loss_func = nn.BCELoss()

# num_epoch = 10

# # start training
# # lists for storing loss at each epoch
# best_val_loss = 99999999 # store best validation loss
# epoch_list = [] # list of epochs (for plotting later)
# train_loss_list = [] # training loss at each epoch
# val_loss_list = [] # validation loss at each epoch

# for epoch in range(num_epoch): # loop through epoc
#     model.train() # set model to training mode
#     batch_losses = []

#     h = model.init_hidden(batch_size)

#     for batch_x, batch_y in train_loader: # for each training step
        
#         # get batch of data
#         train = torch.autograd.Variable(batch_x).float().to(device)
#         label = torch.autograd.Variable(batch_y).float().to(device)

#         # h = tuple([e.data for e in h])
#         prediction = model(train, h).to(device) # forward propogration
#         label = torch.nn.functional.one_hot(label.to(torch.int64), 2)
#         loss = loss_func(prediction, label.float().detach()) # calculate loss
#         loss.backward() # calculate gradients
#         optimizer.step() # update parameters based on caluclated gradients
#         optimizer.zero_grad() # clear gradients for next train

#         batch_losses.append(loss.item()) # record loss for this mini batch

#     # record average mini batch loss over epoch
#     training_loss = np.mean(batch_losses)
#     train_loss_list.append(training_loss)

#     with torch.no_grad(): # set all requires_grad to false
#         val_losses = []
#         for batch_x, batch_y in valid_loader:
#             model.eval() # ensure batchnorm and dropout work properly in evaluation mode

#             batch_x = batch_x.float().to(device)
#             batch_y = batch_y.float().to(device)

#             yhat = model(batch_x).to(device)

#             # record loss for this mini batch
#             batch_y = torch.nn.functional.one_hot(batch_y.to(torch.int64), 2)
#             val_loss = loss_func(yhat, batch_y.float().detach()).item()
#             val_losses.append(val_loss)
        
#         # record average mini batch loss for validation
#         validation_loss = np.mean(val_losses)
#         if validation_loss < best_val_loss: best_val_loss = validation_loss # record best validation loss
#         val_loss_list.append(validation_loss)

            
#     print(f"[{epoch+1}] Training loss: {training_loss:.5f}\t Validation loss: {validation_loss:.5f}\t Best Validation loss: {best_val_loss:.5f}")
#     epoch_list.append(epoch)

### Roberta

In [None]:
# X_train / X_valid / X_test were replaced by token-id arrays in the word2vec section,
# so the raw text is split again here with the same seed: 10% test, then 1/9 of the
# remaining 90% (10% of the total) for validation, both stratified.
X, y = df["text"], df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, shuffle=True, random_state=42
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=1/9, stratify=y_train, shuffle=True, random_state=42
)

# Keep the labels as plain arrays
y_train, y_valid, y_test = (labels.values for labels in (y_train, y_valid, y_test))

In [None]:
%%capture

if load_roberta:
  # Reuse the tensors cached by a previous run
  X_train_tensor = torch.load('data/transformers/Rob_X_train_tensor.pt')
  y_train_tensor = torch.load('data/transformers/Rob_y_train_tensor.pt')
  X_valid_tensor = torch.load('data/transformers/Rob_X_valid_tensor.pt')
  y_valid_tensor = torch.load('data/transformers/Rob_y_valid_tensor.pt')
  X_test_tensor = torch.load('data/transformers/Rob_X_test_tensor.pt')
  y_test_tensor = torch.load('data/transformers/Rob_y_test_tensor.pt')
else:
  # Tokenize every split with the pretrained RoBERTa tokenizer ...
  tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
  X_train_tensor = general.autok(X_train, tokenizer)
  X_valid_tensor = general.autok(X_valid, tokenizer)
  X_test_tensor = general.autok(X_test, tokenizer)
  y_train_tensor = torch.tensor(y_train)
  y_valid_tensor = torch.tensor(y_valid)
  y_test_tensor = torch.tensor(y_test)
  # ... and cache the tensors so that later runs can load them instead
  torch.save(X_train_tensor, 'data/transformers/Rob_X_train_tensor.pt')
  torch.save(y_train_tensor, 'data/transformers/Rob_y_train_tensor.pt')
  torch.save(X_valid_tensor, 'data/transformers/Rob_X_valid_tensor.pt')
  torch.save(y_valid_tensor, 'data/transformers/Rob_y_valid_tensor.pt')
  torch.save(X_test_tensor, 'data/transformers/Rob_X_test_tensor.pt')
  torch.save(y_test_tensor, 'data/transformers/Rob_y_test_tensor.pt')

# The datasets are assembled the same way whichever branch produced the tensors
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
valid_dataset = torch.utils.data.TensorDataset(X_valid_tensor, y_valid_tensor)
test_dataset = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor)


In [None]:
# Train the RoBERTa-based model with mini-batches of 128
mloader = Loader(train_dataset, valid_dataset, batch_size=128)
roberta_model = roberta()
mloader.train(model=roberta_model, verbose=True)

In [None]:
# Evaluate the trained RoBERTa model on the held-out test split (what is reported depends on Loader.eval - TODO confirm).
mloader.eval( test_dataset )

### Distilbert

In [None]:
# Split the raw text again with the same seed so that this section can be run on its
# own: 10% test, then 1/9 of the remaining 90% (10% of the total) for validation,
# both stratified.
X, y = df["text"], df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, shuffle=True, random_state=42
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=1/9, stratify=y_train, shuffle=True, random_state=42
)

# Keep the labels as plain arrays
y_train, y_valid, y_test = (labels.values for labels in (y_train, y_valid, y_test))

In [None]:
%%capture

if load_distilbert:
  # Reuse the tensors cached by a previous run
  X_train_tensor = torch.load('data/transformers/DB_X_train_tensor.pt')
  y_train_tensor = torch.load('data/transformers/DB_y_train_tensor.pt')
  X_valid_tensor = torch.load('data/transformers/DB_X_valid_tensor.pt')
  y_valid_tensor = torch.load('data/transformers/DB_y_valid_tensor.pt')
  X_test_tensor = torch.load('data/transformers/DB_X_test_tensor.pt')
  y_test_tensor = torch.load('data/transformers/DB_y_test_tensor.pt')
else:
  # Tokenize every split with the pretrained DistilBERT tokenizer ...
  tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
  X_train_tensor = general.autok(X_train, tokenizer)
  X_valid_tensor = general.autok(X_valid, tokenizer)
  X_test_tensor = general.autok(X_test, tokenizer)
  y_train_tensor = torch.tensor(y_train)
  y_valid_tensor = torch.tensor(y_valid)
  y_test_tensor = torch.tensor(y_test)
  # ... and cache the tensors so that later runs can load them instead
  torch.save(X_train_tensor, 'data/transformers/DB_X_train_tensor.pt')
  torch.save(y_train_tensor, 'data/transformers/DB_y_train_tensor.pt')
  torch.save(X_valid_tensor, 'data/transformers/DB_X_valid_tensor.pt')
  torch.save(y_valid_tensor, 'data/transformers/DB_y_valid_tensor.pt')
  torch.save(X_test_tensor, 'data/transformers/DB_X_test_tensor.pt')
  torch.save(y_test_tensor, 'data/transformers/DB_y_test_tensor.pt')

# The datasets are assembled the same way whichever branch produced the tensors
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
valid_dataset = torch.utils.data.TensorDataset(X_valid_tensor, y_valid_tensor)
test_dataset = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor)


In [None]:
# Train the DistilBERT-based model with mini-batches of 128
mloader = Loader(train_dataset, valid_dataset, batch_size=128)
distilbert_model = DistilBert()
mloader.train(model=distilbert_model, verbose=True)

In [None]:
# Evaluate the trained DistilBERT model on the held-out test split (what is reported depends on Loader.eval - TODO confirm).
mloader.eval( test_dataset )

## Create embeddings for news articles using doc2vec

In [None]:
# doc2vec needs a tag per document: expose the row position as an 'index' column
X_train_index = X_train.reset_index(drop=True).reset_index()

# tag and tokenise documents for doc2vec model input
X_train_tagged = general.get_tagged_documents(X_train_index, 'text', ['index'])

# generate doc2vec model
doc2vec_settings = dict(
    method="dbow",
    max_epochs=10,
    vec_size=50,
    alpha=0.025,
    min_count=50,
)
doc2vec_model = general.generate_doc2vec_model(X_train_tagged, **doc2vec_settings)

# infer vectors for every split using the trained model
X_train_vector, X_valid_vector, X_test_vector = (
    general.get_vector_representations(doc2vec_model, split)
    for split in (X_train, X_valid, X_test)
)

In [None]:
# # pickle the files so we can load it in later
# os.chdir(data_path)
# pickle.dump(X_train_vector, open("X_train_vector_stopwords_True_doc2vec_PVDM_dim_50.pickle", "wb"))
# pickle.dump(X_valid_vector, open("X_valid_vector_stopwords_True_doc2vec_PVDM_dim_50.pickle", "wb"))
# pickle.dump(X_test_vector, open("X_test_vector_stopwords_True_doc2vec_PVDM_dim_50.pickle", "wb"))

In [None]:
# # pickle the files so we can load it in later
# os.chdir(data_path)
# pickle.dump(X_train_vector, open("X_train_vector_stopwords_True_doc2vec_DBOW_dim_50.pickle", "wb"))
# pickle.dump(X_valid_vector, open("X_valid_vector_stopwords_True_doc2vec_DBOW_dim_50.pickle", "wb"))
# pickle.dump(X_test_vector, open("X_test_vector_stopwords_True_doc2vec_DBOW_dim_50.pickle", "wb"))

In [None]:
# # pickle the files so we can load it in later
# os.chdir(data_path)
# pickle.dump(X_train_vector, open("X_train_vector_stopwords_True_doc2vec_DBOW_dim_100.pickle", "wb"))
# pickle.dump(X_valid_vector, open("X_valid_vector_stopwords_True_doc2vec_DBOW_dim_100.pickle", "wb"))
# pickle.dump(X_test_vector, open("X_test_vector_stopwords_True_doc2vec_DBOW_dim_100.pickle", "wb"))

In [None]:
# # pickle the files so we can load it in later
# os.chdir(data_path)
# pickle.dump(X_train_vector, open("X_train_vector_stopwords_False_doc2vec_DBOW_dim_100.pickle", "wb"))
# pickle.dump(X_valid_vector, open("X_valid_vector_stopwords_False_doc2vec_DBOW_dim_100.pickle", "wb"))
# pickle.dump(X_test_vector, open("X_test_vector_stopwords_False_doc2vec_DBOW_dim_100.pickle", "wb"))

## MLP

In [None]:
# load in embedded document vectors
# refer to data folder for pickle names
X_train_vector_pickle = "data/X_train_vector_stopwords_False_doc2vec_DBOW_dim_100.pickle"
X_valid_vector_pickle = "data/X_valid_vector_stopwords_False_doc2vec_DBOW_dim_100.pickle"
X_test_vector_pickle = "data/X_test_vector_stopwords_False_doc2vec_DBOW_dim_100.pickle"

# Context managers close every file handle as soon as it has been read.
# These pickles are the project's own files; never unpickle untrusted data.
with open(X_train_vector_pickle, "rb") as handle:
  X_train_vector = pickle.load(handle)
with open(X_valid_vector_pickle, "rb") as handle:
  X_valid_vector = pickle.load(handle)
with open(X_test_vector_pickle, "rb") as handle:
  X_test_vector = pickle.load(handle)

In [None]:
# Document vectors become float tensors, labels become tensors as they are.
X_train_tensor, X_valid_tensor, X_test_tensor = (
    torch.tensor(frame.values).to(float)
    for frame in (X_train_vector, X_valid_vector, X_test_vector)
)
y_train_tensor, y_valid_tensor, y_test_tensor = (
    torch.tensor(split) for split in (y_train, y_valid, y_test)
)

# Pair inputs and labels, one dataset per split
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
valid_dataset = torch.utils.data.TensorDataset(X_valid_tensor, y_valid_tensor)
test_dataset = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor)

In [None]:
# Hyper-parameters
# (the same two assignments are repeated at the top of the next cell)
input_dim = int(X_train_tensor.shape[1])
batch_size = 128

In [None]:
# Hyper-parameters
batch_size = 128
input_dim = int(X_train_tensor.shape[1])  # number of columns of the document-vector tensor

# Train the MLP on the document vectors
mloader = Loader(train_dataset, valid_dataset, batch_size)
mlp_model = MLP2(input_dim)
mloader.train(model=mlp_model, verbose=True)

In [None]:
# Evaluate the trained MLP on the held-out test split (what is reported depends on Loader.eval - TODO confirm).
mloader.eval( test_dataset )

In [None]:
# The per-epoch loss histories are only filled in by the manual training loop, which
# is currently commented out. Skip the figure with an explicit message instead of
# failing with a NameError when they are absent (e.g. after Restart & Run All).
history_names = ("epoch_list", "train_loss_list", "val_loss_list")
missing_history = [name for name in history_names if name not in globals()]

if missing_history:
  print(f"Skipping loss curves, undefined: {', '.join(missing_history)}")
else:
  plt.plot(epoch_list, val_loss_list, label="validation")
  plt.plot(epoch_list, train_loss_list, label="train")
  plt.ylim(bottom=0)
  plt.xlabel("Number of epochs")
  # The y axis used to read "MSE" although the title refers to BCELoss; a neutral label avoids the contradiction.
  plt.ylabel("Loss")
  # The x axis counts epochs, so the title says epochs rather than iterations.
  plt.title("MLP: BCELoss vs Number of epochs")
  plt.legend()

***