In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pandas as pd

# Plotting the confusion matrix
def plot_confusion_matrix(cm, classes):
    """Draw a confusion matrix as an annotated heatmap and display it.

    Args:
        cm: Two-dimensional matrix handed to ``sns.heatmap``. Cell annotations
            use the integer format ``'d'``.
        classes: Labels used for the ticks on both axes.

    The x axis is labelled as the predicted label and the y axis as the true
    label. The function shows the figure and returns nothing.
    """
    fig, ax = plt.subplots(figsize=(10, 7))
    heatmap_options = dict(
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=classes,
        yticklabels=classes,
    )
    sns.heatmap(cm, ax=ax, **heatmap_options)
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    ax.set_title('Confusion Matrix')
    plt.show()

In [5]:
# Excel workbook holding both the results sheet and the variable-selection sheet.
file_path = os.path.join(os.curdir, 'data', 'Kopie van BastianLewisData.xlsx')

df_vars = pd.read_excel(file_path, sheet_name='VariableSelection')
df = pd.read_excel(file_path, sheet_name='Result 1')

# Names of the variables whose Status is 'Drop' in the selection sheet.
var_drop = df_vars.loc[df_vars['Status'] == 'Drop', 'Name'].tolist()

# Reassign rather than mutate in place.
df = df.drop(columns=var_drop)

# Assign the filled columns back explicitly: calling fillna(..., inplace=True)
# on a column selection is chained assignment, which warns on pandas 2.x and
# leaves `df` untouched once Copy-on-Write is enabled.
df["Huidig/Laatste functie"] = df["Huidig/Laatste functie"].fillna("")
df['Meest trotste project'] = df['Meest trotste project'].fillna("")

# Single free-text feature: "<function> : <project>".
df['text'] = df["Huidig/Laatste functie"] + " : " + df['Meest trotste project']

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy import data, datasets
from sklearn.model_selection import train_test_split

# Step 1: Text Preprocessing
# Lower-case the text and strip every character that is neither a word
# character nor whitespace. regex=True must be explicit: since pandas 2.0,
# str.replace treats the pattern as a literal string by default, which would
# leave the text unchanged. The raw string avoids invalid-escape warnings.
df['processed_text'] = df['text'].str.lower().str.replace(r'[^\w\s]', '', regex=True)

# Step 2: Split Data
train_data, valid_data = train_test_split(df, test_size=0.2, random_state=42)

# Define Fields
# batch_first=True makes batches come out as (batch, seq_len) index tensors,
# which is the layout SimpleNN.forward relies on when it averages over dim=1.
# NOTE(review): the tokenizer uses the English spaCy model; the column names
# suggest the text may be Dutch — confirm the right language model.
TEXT = data.Field(
    tokenize='spacy',
    tokenizer_language='en_core_web_sm',
    lower=True,
    batch_first=True,
)
LABEL = data.LabelField(dtype=torch.float)

# Create Dataset
# NOTE(review): Example.fromlist pairs `fields` with the row values by
# position, so the first two columns of the frame are used as text and label
# and 'processed_text' is never consumed. Confirm the column order, or select
# the intended text and label columns before calling .values.tolist().
fields = [('text', TEXT), ('label', LABEL)]
train_examples = [data.Example.fromlist(i, fields) for i in train_data.values.tolist()]
valid_examples = [data.Example.fromlist(i, fields) for i in valid_data.values.tolist()]

train_dataset = data.Dataset(train_examples, fields)
valid_dataset = data.Dataset(valid_examples, fields)

# Build Vocabulary (from the training split only)
TEXT.build_vocab(train_dataset, max_size=10000)
LABEL.build_vocab(train_dataset)

# Create Iterators
# A plain data.Dataset carries no sort_key, and the validation iterator sorts
# its examples by default; without a key it would try to compare Example
# objects directly and fail. Sort by token count instead.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_dataset, valid_dataset),
    batch_size=64,
    sort_key=lambda example: len(example.text),
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
)

# Step 3: Define the Model
class SimpleNN(nn.Module):
    """Embedding lookup, mean pooling over dimension 1, then a linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        output_dim: Number of output features of the final linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.fc = nn.Linear(in_features=embedding_dim, out_features=output_dim)

    def forward(self, text):
        """Return the linear layer applied to the dim-1 average of the embeddings.

        Args:
            text: Integer index tensor. Assumed to be laid out as
                (batch, seq_len), since the pooling averages over dimension 1
                — confirm the caller supplies batch-first tensors.
        """
        token_vectors = self.embedding(text)
        averaged = token_vectors.mean(dim=1)
        return self.fc(averaged)

# Initialize Model
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
OUTPUT_DIM = 1  # one logit per example, consumed by BCEWithLogitsLoss

# Seed torch before building the model so the weight initialisation is
# repeatable, in line with the fixed random_state of the train/validation split.
torch.manual_seed(42)
model = SimpleNN(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM)

# Step 4: Training the Model
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)

# Training Loop
N_EPOCHS = 5

for epoch in range(N_EPOCHS):
    model.train()
    epoch_loss = 0

    for batch in train_iterator:
        optimizer.zero_grad()
        # squeeze(1) drops the size-1 output dimension so the logits line up
        # with batch.label (assumes the model returns (batch, 1) — confirm).
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    print(f'Epoch {epoch+1}, Train Loss: {epoch_loss / len(train_iterator)}')

# Step 5: Evaluation
model.eval()
with torch.no_grad():  # no gradients are needed for validation
    valid_loss = 0
    for batch in valid_iterator:
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)
        valid_loss += loss.item()
    print(f'Validation Loss: {valid_loss / len(valid_iterator)}')
