# Long Short Term Memory Neural Network for binary classification of messages

We have selectied LSTM Neural network as our best model for our message classification model because it is well suited for memorising previous messages and their links between them 

In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import re
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset


def preprocess_sentence(text):
    text = re.sub(r'<[^>]+>', ' ', text) # Remove content within <>
    text = re.sub(r'\s+', ' ', text)     # Replace multiple spaces with a single space
    text = text.strip()                  # Remove leading and trailing spaces
    text = text.lower()                  # Convert to lowercase to maintain consistency
    return text


# Load data
train_transcriptions_df = pd.read_csv('train_transcriptions_df.csv')
train_labels_df = pd.read_csv('training_labels_df.csv')
test_transcriptions_df = pd.read_csv('test_transcriptions_df.csv')
train_correspondances_df = pd.read_csv('train_correspondances_df.csv')




print("Preprocessing sentences...")

train_transcriptions_df['clean_text'] = train_transcriptions_df['text'].apply(preprocess_sentence)
test_transcriptions_df['clean_text'] = test_transcriptions_df['text'].apply(preprocess_sentence)

Preprocessing sentences...


## Embeedings using BERT

In [2]:
from sentence_transformers import SentenceTransformer
bert = SentenceTransformer('all-MiniLM-L6-v2')

sentences_embeeded = bert.encode(train_transcriptions_df['clean_text'].tolist())

  from .autonotebook import tqdm as notebook_tqdm


## Add previous links using indicative vectors

In [3]:
links = train_correspondances_df['type'].unique()

vector_dict = {}
base_vector = np.zeros(len(links))
for i,link in enumerate(links):
    link_vector = base_vector.copy()
    link_vector[i] = 1
    vector_dict[link] = link_vector

In [4]:
lst_links_vectors = []
# iter over all rows of train_transcriptions_df
from tqdm import tqdm
for index, row in train_transcriptions_df.iterrows():
    correspondances = train_correspondances_df[(train_correspondances_df['transcription_id'] == row['transcription_id'])&(train_correspondances_df['2'] == row['index'])]
    link_vector = np.zeros(len(links))
    for type_of_link in correspondances['type']:
        link_vector += vector_dict[type_of_link]
    lst_links_vectors.append(link_vector)
lst_links_vectors = np.array(lst_links_vectors)

## Sentiment score

In [5]:
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')


# Initialize the sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to calculate sentiment scores for each text
def get_sentiment_scores(text):
    sentiment_scores = sia.polarity_scores(text)
    return sentiment_scores

# Get the sentiment scores for each text
sentiment_scores = train_transcriptions_df['clean_text'].apply(get_sentiment_scores)
# transform to array
sentiment_scores.to_numpy()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mathiasgrau/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


array([{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.2263},
       {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
       {'neg': 0.0, 'neu': 0.826, 'pos': 0.174, 'compound': 0.2732}, ...,
       {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.4019},
       {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
       {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.296}],
      dtype=object)

In [6]:
sentiment_vectors = []
for sentiment_score in sentiment_scores:
    sentiment_vector = np.array([sentiment_score['neg'], sentiment_score['neu'], sentiment_score['pos'], sentiment_score['compound']])
    sentiment_vectors.append(sentiment_vector)
sentiment_vectors = np.array(sentiment_vectors)

## Concatenation of all features

In [7]:
combined_features = [np.concatenate([embedding, sentiment, link])
                     for embedding, sentiment, link in zip(sentences_embeeded, sentiment_vectors, lst_links_vectors)]

combined_features = np.array(combined_features)
combined_features.shape

(72623, 404)

## Data Preparation 

In [8]:
X_train = []
y_train = []

for i in range(4, len(combined_features) - 2):
    X_train.append(combined_features[i-4:i+3])
    y_train.append(train_labels_df['label'].to_numpy()[i])

X_train = np.array(X_train)
y_train = np.array(y_train)

print(X_train.shape)
print(y_train.shape)

(72617, 7, 404)
(72617,)


### Conversion to Tensors

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


X_train_tensor = torch.from_numpy(X_train).float()
y_train_tensor = torch.from_numpy(y_train).float().unsqueeze(1)

X_test_tensor = torch.from_numpy(X_test).float()
y_test_tensor = torch.from_numpy(y_test).float().unsqueeze(1)

# Create the dataloaders
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32)

test_dataset = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32)

### Long Short Term Memory neural Network

In [10]:


class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Initialize hidden and cell states
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)

        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))

        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])

        # Apply sigmoid activation
        return torch.sigmoid(out)



### Train and Evaluate the Model

In [11]:

from sklearn.metrics import f1_score

def get_best_threshold(y_1,y_2):
    best_score = 0.
    best_thresold = 0.
    for threshold in np.arange(0.1, 0.5, 0.01):
        score = f1_score(y_1, y_2 >= threshold)
        if score > best_score:
            best_score = score
            best_thresold = threshold
    return best_thresold, best_score
        
from sklearn.metrics import f1_score
# Create the model instance
model = LSTMClassifier(input_size=404, hidden_size=1000, num_layers=1, output_size=1)

# Define the loss function and the optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)

# Train the model
epochs = 2


for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for i, data in enumerate(train_loader):
        inputs, labels = data

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % 100 == 99:
            running_loss = 0.0
    scheduler.step()
    model.eval()
    y_pred = []
    y_true = []
    with torch.no_grad():
        for data in test_loader:
            inputs, labels = data
            outputs = model(inputs)
            y_pred.extend(outputs.squeeze(1).tolist())
            y_true.extend(labels.squeeze(1).tolist())
    
    threshold, score = get_best_threshold(y_true, y_pred)
    print("f1 score: ", score, "for epoch: ", epoch + 1, "with threshold: ", threshold)


f1 score:  0.5845970859351769 for epoch:  1 with threshold:  0.2799999999999999
f1 score:  0.5922509225092252 for epoch:  2 with threshold:  0.2799999999999999


In [12]:
y_pred = []
y_true = []

with torch.no_grad():
    for data in test_loader:
        inputs, labels = data
        outputs = model(inputs)
        y_pred.extend(outputs.squeeze(1).tolist())
        y_true.extend(labels.squeeze(1).tolist())

In [13]:
threshold, score = get_best_threshold(y_true, y_pred)
y_pred_nn = np.array(y_pred) >= threshold
y_true = np.array(y_true)

print(classification_report(y_true, y_pred_nn))

              precision    recall  f1-score   support

         0.0       0.93      0.84      0.88     11835
         1.0       0.50      0.72      0.59      2689

    accuracy                           0.82     14524
   macro avg       0.72      0.78      0.74     14524
weighted avg       0.85      0.82      0.83     14524

