In [11]:
import numpy as np
import os
import wandb
from datasets import load_dataset
from transformers import AutoTokenizer
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [12]:
# SWDA dataset
import pandas as pd
# Chemin du répertoire contenant les dossiers de données
chemin_dossier = "Data/swda"

fichier_train = os.path.join(chemin_dossier, "train.csv")
fichier_test = os.path.join(chemin_dossier, "test.csv")
train_dataset = pd.read_csv(fichier_train, delimiter='\t')
test_dataset = pd.read_csv(fichier_test, delimiter='\t')

# Extract the utterances and labels
train_X = train_dataset["Utterance"].tolist()
train_y = train_dataset["Dialogue_Act"]

test_X = test_dataset["Utterance"].tolist()
test_y = test_dataset["Dialogue_Act"]

# Map the labels to integers using a dictionary

# Create a dictionary to map the labels to integers
label2int = {}
for i, label in enumerate(set(train_y)):
    label2int[label] = i

# Map the labels to integers
train_y = [label2int[label] for label in train_y]
test_y = [label2int[label] for label in test_y]

In [13]:
train_X[:5], train_y[:5]

(["so i 've been concerned about crime lately .",
  'uh-huh .',
  "uh , it 's really scary to listen to the news every night and --",
  'uh-huh .',
  '-- to hear about all the problems .'],
 [18, 17, 0, 17, 19])

In [14]:
# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_X)

# Convert the text to sequences

# Convert text data to sequences of integers
train_sequences = tokenizer.texts_to_sequences(train_X)
test_sequences = tokenizer.texts_to_sequences(test_X)

In [15]:
# Pad sequences to have the same length
max_sequence_length = max(len(seq) for seq in train_sequences + test_sequences)
train_data_ = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post')
test_data_ = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post')

# Convert array to list
train_data = train_data_.tolist()
test_data = test_data_.tolist()

In [16]:
"""
Random Forest Classifier
"""
# Train a random forest classifier
rfc = RandomForestClassifier()
rfc.fit(train_data, train_y)

# Evaluate on test set
test_preds = rfc.predict(test_data)
test_acc = accuracy_score(test_y, test_preds)
print(f"Test accuracy: {test_acc:.3f}")

Test accuracy: 0.615


In [17]:
# Plot the confusion matrix of the 12 labels
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# y_pred = model.predict(test_dataset.batch(batch_size))
y_pred = test_preds
y_pred = np.argmax(y_pred, axis=1)
y_test_plot = np.argmax(test_y, axis=1)

cm = confusion_matrix(y_test_plot, y_pred)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

plt.figure(figsize=(8, 8))
plt.title('Confusion Matrix of RF Model on MAPTask Dataset')
sns.heatmap(cm, annot=True, fmt='.2f', xticklabels=train_dataset.features["Label"].names, yticklabels=train_dataset.features["Label"].names)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

AxisError: axis 1 is out of bounds for array of dimension 1