# Getting Started

## Imports

In [None]:
import numpy as np
import pandas as pd
import seaborn as sn

from scipy.io import wavfile
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GroupShuffleSplit, train_test_split

## Load Data

### Labels

In [None]:
# Load the label table (one row per audio file) and preview its first rows.
labels = pd.read_csv(filepath_or_buffer='data/labels.csv')
labels.head(n=5)

Here, `id` is both the identifier and the file name of the audio clip (the recording is stored as `data/<id>.wav`). The `call_id` identifies the person who is calling, so one caller can have multiple audio files. The `text` column holds the Dutch transcription of the audio. The `intent` column indicates whether the caller's intention was positive or negative.

### Audio 

In [None]:
# Read every labelled recording from disk, keeping the sample rate and the
# raw samples of each file in two parallel lists (same order as `labels`).
sample_rates, data = [], []
for audio_id in labels['id']:
    rate, samples = wavfile.read(f"data/{audio_id}.wav")
    sample_rates.append(rate)
    data.append(samples)

In [None]:
# Sanity check: show the sample rate and array shape of the first five clips.
preview = slice(5)
print(sample_rates[preview])
for clip in data[preview]:
    print(clip.shape)

## Preprocessing

### Zero-padding

In [None]:
# Append zeros to every clip until it is as long as the longest one, so the
# clips can be stacked into a single array with one row per recording.
# NOTE(review): a (before, after) pad width is applied to every axis, so this
# assumes each clip is 1-D (mono) — confirm against the shapes printed above.
max_length = max(map(len, data))
data = np.array([
    np.pad(clip, (0, max_length - len(clip)), mode='constant')
    for clip in data
])
print(data.shape)

### Train/Test Split

In [None]:
# Split by caller rather than by file. One caller (`call_id`) can have several
# recordings; a purely random split may place the same caller in both sets,
# which leaks speaker information and inflates the test scores.
# GroupShuffleSplit keeps all recordings of a caller on the same side.
# Note: `test_size=0.2` is therefore the fraction of callers, not of files.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(
    splitter.split(data, labels['intent'], groups=labels['call_id'])
)

# Same variable names and container types as before: NumPy arrays for the
# features, pandas Series for the targets.
X_train, X_test = data[train_idx], data[test_idx]
y_train = labels['intent'].iloc[train_idx]
y_test = labels['intent'].iloc[test_idx]

## Training

In [None]:
# Baseline model: logistic regression with default hyperparameters, fitted
# directly on the padded training samples.
logistic_regression = LogisticRegression()
logistic_regression.fit(X=X_train, y=y_train)

## Testing

In [None]:
# Predict the intent label of every held-out recording.
predictions = logistic_regression.predict(X=X_test)

## Evaluation

In [None]:
# Compare the predictions with the true test labels: a per-class text report
# plus a confusion matrix drawn as an annotated heatmap of integer counts.
report = classification_report(y_test, predictions)
cm = confusion_matrix(y_test, predictions)
sn.heatmap(cm, annot=True, fmt='d')
print(report)