## LSTM Models Training

In [4]:
import pandas as pd

from config import *

In [None]:
# Read the preprocessed WELFake corpus (tab-separated file) into a dataframe
# and preview its first two rows.
df = pd.read_csv(
    'WELFake_Dataset_processed.tsv',
    delimiter='\t',
)
df.head(2)

In [6]:
from sklearn.model_selection import train_test_split

# Stratified 60/20/20 split. Step 1 holds out 40% of the rows; step 2 cuts that
# hold-out in half to obtain the validation and test sets. Both steps stratify
# on the label and use the same fixed seed.
train_texts, temp_text, train_labels, temp_labels = train_test_split(
    df['full_text_processed'], df['label'],
    test_size=0.4,
    stratify=df['label'],
    random_state=2018,
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_text, temp_labels,
    test_size=0.5,
    stratify=temp_labels,
    random_state=2018,
)

### Gensim Word2Vec initialization

In [7]:
import gensim
from Training.utils import makeWords, train_evaluate_and_test_models, visualize_results

# Fit Word2Vec embeddings on the output of makeWords; the vector size is
# EMBEDDING_DIM (presumably provided by `from config import *` -- confirm).
# NOTE(review): the corpus passed here is the full dataframe column, so
# validation and test texts also contribute to the embeddings -- confirm this
# is intended.
mod = gensim.models.Word2Vec(
    sentences=makeWords(df['full_text_processed']),
    vector_size=EMBEDDING_DIM,
    window=5,
    min_count=1,  # keep every token, including those seen only once
    workers=5,
)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Whitespace-token count of every training text (each value is cast to str first).
text_lengths = train_texts.apply(lambda text: len(str(text).split()))

# Histogram of the token counts across the training split.
fig, ax = plt.subplots()
ax.hist(text_lengths, bins=50, color='blue', alpha=0.7)
ax.set_xlabel("Text Length")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Text Lengths")
plt.show()

# Summary statistics of the same distribution.
print(f"Mean Length: {np.mean(text_lengths):.2f}")
print(f"Median Length: {np.median(text_lengths):.2f}")
print(f"90th Percentile: {np.percentile(text_lengths, 90):.2f}")
print(f"Maximum Length: {np.max(text_lengths):.2f}")

In [9]:
from NewsDatasetLSTM import NewsDatasetLSTM
from torch.utils.data import DataLoader

# Sequence length handed to every NewsDatasetLSTM as `max_len`.
# NOTE(review): 623 presumably comes from the text-length statistics computed
# earlier in the notebook (e.g. the 90th percentile) -- confirm.
max_len = 623

# Wrap each split in a NewsDatasetLSTM backed by the Word2Vec model `mod`.
train_dataset = NewsDatasetLSTM(train_texts, train_labels, mod, max_len=max_len)
val_dataset = NewsDatasetLSTM(val_texts, val_labels, mod, max_len=max_len)
test_dataset = NewsDatasetLSTM(test_texts, test_labels, mod, max_len=max_len)

batch_size = 64

# Training loader: reshuffle every epoch and drop the trailing partial batch so
# each optimisation step sees a full batch.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

# Evaluation loaders keep every sample. Dropping the last partial batch here
# would silently exclude up to `batch_size - 1` validation examples from the
# metrics; the test loader never dropped them, so both now behave the same.
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
# Per-class row counts over the whole dataframe, ordered [label 0, label 1].
# NOTE(review): the counts come from `df`, not from the training split --
# confirm that this is what train_evaluate_and_test_models expects.
num_negatives = int((df['label'] == 0).sum())
num_positives = int((df['label'] == 1).sum())
class_counts = [num_negatives, num_positives]

# Run the project's train/validate/test routine for 10 epochs and keep its
# return value for later cells.
results = train_evaluate_and_test_models(
    class_counts,
    train_loader,
    val_loader,
    test_loader,
    epochs=10,
)