In [1]:
import os
parentdir = "/Users/sude_umac/PycharmProjects/NLP2/Propaganda_dataset "
train_file= "propaganda_train.tsv"
test_file= "propaganda_val.tsv"
train_path=os.path.join(parentdir,train_file)
test_path= os.path.join(parentdir,test_file) 

In [2]:
import pandas as pd

train_df = pd.read_csv(train_path,delimiter="\t",quotechar='|')
train_df.columns = ['label', 'sentence']

# Load the testing data
test_df = pd.read_csv(test_path,delimiter="\t",quotechar='|')
test_df.columns = ['label', 'sentence']

In [3]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Preprocess the data
le = LabelEncoder()
vectorizer = CountVectorizer()

X_train = vectorizer.fit_transform(train_df['sentence']).toarray()
y_train = le.fit_transform(train_df['label'])

X_test = vectorizer.transform(test_df['sentence']).toarray()
y_test = le.transform(test_df['label'])

# Create DataLoaders
class TextDataset(Dataset):
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)

train_data = TextDataset(torch.FloatTensor(X_train), torch.LongTensor(y_train))
test_data = TextDataset(torch.FloatTensor(X_test), torch.LongTensor(y_test))

train_loader = DataLoader(dataset=train_data, batch_size=1, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=1)

# Define the LSTM model
class TextLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(TextLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.rnn = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        rnn_out, _ = self.rnn(text.unsqueeze(1))
        rnn_out = rnn_out[:, -1, :]
        return self.fc(rnn_out)

# Initialize the model, loss function and optimizer
model = TextLSTM(X_train.shape[1], 50, len(le.classes_))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Train the model
for epoch in range(10):  # loop over the dataset multiple times
    for i, (texts, labels) in enumerate(train_loader):
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Evaluate the model
correct = 0
total = 0
with torch.no_grad():
    for texts, labels in test_loader:
        outputs = model(texts)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy of the network on the test data: %d %%' % (100 * correct / total))

Accuracy of the network on the test data: 47 %


In [9]:
len(vectorizer.vocabulary_)

8610

In [10]:
from sklearn.model_selection import GridSearchCV
from skorch import NeuralNetClassifier

# Define the LSTM model
class TextLSTM(nn.Module):
    def __init__(self, input_dim= len(vectorizer.vocabulary_), hidden_dim=50, output_dim=0):
        super(TextLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.rnn = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        rnn_out, _ = self.rnn(text.unsqueeze(1))
        rnn_out = rnn_out[:, -1, :]
        return self.fc(rnn_out)

model = TextLSTM(X_train.shape[1], 50, len(le.classes_))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(10):
    for i, (texts, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

correct = 0
total = 0
with torch.no_grad():
    for texts, labels in test_loader:
        outputs = model(texts)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy of the network on the test data: %d %%' % (100 * correct / total))

net = NeuralNetClassifier(
    module=TextLSTM,
    module__input_dim=X_train.shape[1],
    module__output_dim=len(le.classes_),
    max_epochs=10,
    lr=0.01,
    optimizer=torch.optim.Adam,
    criterion=nn.CrossEntropyLoss

)

params = {
    'lr': [0.01, 0.001, 0.0001],
    'max_epochs': [10, 20, 30],
    'module__hidden_dim': [50, 100, 150],
}

gs = GridSearchCV(net, params, refit=False, cv=3, scoring='accuracy')

gs.fit(X_train.astype('float32'), y_train)

print("Best score: ", gs.best_score_)
print("Best parameters: ", gs.best_params_)

Accuracy of the network on the test data: 45 %
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m1.9318[0m       [32m0.4942[0m        [35m1.7710[0m  0.1324
      2        [36m1.4360[0m       [32m0.4971[0m        [35m1.6389[0m  0.1099
      3        [36m0.8765[0m       [32m0.5088[0m        1.6608  0.0983
      4        [36m0.2870[0m       0.5058        1.9319  0.0893
      5        [36m0.0980[0m       0.4766        2.1956  0.0834
      6        [36m0.0553[0m       0.4766        2.4102  0.0832
      7        [36m0.0403[0m       0.4708        2.4869  0.0815
      8        [36m0.0342[0m       0.4649        2.5498  0.0771
      9        [36m0.0312[0m       0.4678        2.6087  0.0872
     10        [36m0.0293[0m       0.4708        2.6530  0.0809
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m