# Classic Neural Network for Binary Classification

In [6]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Load data
train_transcriptions_df = pd.read_csv('train_transcriptions_df.csv')
train_labels_df = pd.read_csv('training_labels_df.csv')
test_transcriptions_df = pd.read_csv('test_transcriptions_df.csv')

import re

def preprocess_sentence(text):
    text = re.sub(r'<[^>]+>', ' ', text) # Remove content within <>
    text = re.sub(r'\s+', ' ', text)     # Replace multiple spaces with a single space
    text = text.strip()                  # Remove leading and trailing spaces
    test = text.lower()                  # Convert to lowercase to maintain consistency
    return text

print("Preprocessing sentences...")

train_transcriptions_df['clean_text'] = train_transcriptions_df['text'].apply(preprocess_sentence)
test_transcriptions_df['clean_text'] = test_transcriptions_df['text'].apply(preprocess_sentence)

Preprocessing sentences...


## Embeeding of sentences

In [7]:
from sentence_transformers import SentenceTransformer
bert = SentenceTransformer('all-MiniLM-L6-v2')

sentences_embeeded = bert.encode(train_transcriptions_df['clean_text'].tolist())

## Creation of test and train datasets

In [8]:
X_train, X_test, y_train, y_test = train_test_split(sentences_embeeded, train_labels_df['label'].to_numpy(), test_size=0.2, random_state=42)

In [9]:
## neural network

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

class NeuralNetwork(nn.Module):
    def __init__(self, input_size):
        super(NeuralNetwork, self).__init__()
        # Input to first hidden layer
        self.layer1 = nn.Linear(input_size, 1000) 
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.2)
        
        # Second hidden layer
        self.layer2 = nn.Linear(1000, 1000)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.2)
        
        # Output layer
        self.output_layer = nn.Linear(1000, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Forward pass through the network
        x = self.layer1(x)
        x = self.relu1(x)
        x = self.dropout1(x)

        x = self.layer2(x)
        x = self.relu2(x)
        x = self.dropout2(x)

        x = self.output_layer(x)
        x = self.sigmoid(x)
        return x
    

# Create the model

model = NeuralNetwork(X_train.shape[1])

# Define the loss function and the optimizer

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# Convert the data into PyTorch tensors

X_train_tensor = torch.from_numpy(X_train).float()
y_train_tensor = torch.from_numpy(y_train).float().unsqueeze(1)

X_test_tensor = torch.from_numpy(X_test).float()
y_test_tensor = torch.from_numpy(y_test).float().unsqueeze(1)

# Create the dataloaders

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32)

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=32)

# Train the model

epochs = 3

for epoch in range(epochs):
    running_loss = 0.0
    for i, data in enumerate(train_loader):
        # Get the inputs
        inputs, labels = data

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()
        optimizer.step()

        # Print statistics
        running_loss += loss.item()
        if i % 100 == 99:
            print(f'Epoch: {epoch + 1}, Batch: {i + 1}, Loss: {running_loss / 100}')
            running_loss = 0.0
    scheduler.step()

# Evaluate the model

y_pred = []
y_true = []

with torch.no_grad():
    for data in test_loader:
        inputs, labels = data
        outputs = model(inputs)
        y_pred.extend(outputs.squeeze(1).tolist())
        y_true.extend(labels.squeeze(1).tolist())


Epoch: 1, Batch: 100, Loss: 0.4692472831904888
Epoch: 1, Batch: 200, Loss: 0.3612713104486465
Epoch: 1, Batch: 300, Loss: 0.35524294659495353
Epoch: 1, Batch: 400, Loss: 0.3487639807164669
Epoch: 1, Batch: 500, Loss: 0.3465275876224041
Epoch: 1, Batch: 600, Loss: 0.33797567665576933
Epoch: 1, Batch: 700, Loss: 0.3400791721045971
Epoch: 1, Batch: 800, Loss: 0.3481330679357052
Epoch: 1, Batch: 900, Loss: 0.3158370053768158
Epoch: 1, Batch: 1000, Loss: 0.31555160999298093
Epoch: 1, Batch: 1100, Loss: 0.3148860946297646
Epoch: 1, Batch: 1200, Loss: 0.3270181677490473
Epoch: 1, Batch: 1300, Loss: 0.32705342963337897
Epoch: 1, Batch: 1400, Loss: 0.33480315923690795
Epoch: 1, Batch: 1500, Loss: 0.3198452538251877
Epoch: 1, Batch: 1600, Loss: 0.31667259708046913
Epoch: 1, Batch: 1700, Loss: 0.326815539598465
Epoch: 1, Batch: 1800, Loss: 0.33223134562373163
Epoch: 2, Batch: 100, Loss: 0.3100523456931114
Epoch: 2, Batch: 200, Loss: 0.3091162486374378
Epoch: 2, Batch: 300, Loss: 0.333646491914987

## Evaluation of the prediction

In [10]:
y_pred_nn = np.array(y_pred) >= 0.29
y_true = np.array(y_true)

print(classification_report(y_true, y_pred_nn))

              precision    recall  f1-score   support

         0.0       0.94      0.82      0.87     11887
         1.0       0.47      0.75      0.58      2638

    accuracy                           0.80     14525
   macro avg       0.70      0.78      0.73     14525
weighted avg       0.85      0.80      0.82     14525

