# ML4NLP1
## Starting Point for Exercise 1, part II

This notebook is supposed to serve as a starting point and/or inspiration when starting exercise 1, part II.

One of the goals of this exercise is o make you acquainted with **skorch**. You will probably need to consult the [documentation](https://skorch.readthedocs.io/en/stable/).

# Installing skorch and loading libraries

In [3]:
import subprocess

# Installation on Google Colab
try:
    import google.colab
    subprocess.run(['python', '-m', 'pip', 'install', 'skorch'])
except ImportError:
    pass

In [4]:
import torch
from torch import nn
import torch.nn.functional as F
from skorch import NeuralNetClassifier

In [5]:
torch.manual_seed(0)
torch.cuda.manual_seed(0)

In [6]:
import pandas as pd
import numpy as np
import csv
import re
import string
from collections import defaultdict

## Training a classifier and making predictions

In [8]:
# download dataset
!gdown 1QP6YuwdKFNUPpvhOaAcvv2Pcp4JMbIRs # x_train
!gdown 1QVo7PZAdiZKzifK8kwhEr_umosiDCUx6 # x_test
!gdown 1QbBeKcmG2ZyAEFB3AKGTgSWQ1YEMn2jl # y_train
!gdown 1QaZj6bI7_78ymnN8IpSk4gVvg-C9fA6X # y_test

Downloading...
From: https://drive.google.com/uc?id=1QP6YuwdKFNUPpvhOaAcvv2Pcp4JMbIRs
To: /content/x_train.txt
100% 64.1M/64.1M [00:01<00:00, 39.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QVo7PZAdiZKzifK8kwhEr_umosiDCUx6
To: /content/x_test.txt
100% 65.2M/65.2M [00:00<00:00, 186MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QbBeKcmG2ZyAEFB3AKGTgSWQ1YEMn2jl
To: /content/y_train.txt
100% 480k/480k [00:00<00:00, 18.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QaZj6bI7_78ymnN8IpSk4gVvg-C9fA6X
To: /content/y_test.txt
100% 480k/480k [00:00<00:00, 18.0MB/s]


In [9]:
with open(f'x_train.txt') as f:
    x_train = f.read().splitlines()
with open(f'y_train.txt') as f:
    y_train = f.read().splitlines()
with open(f'x_test.txt') as f:
    x_test = f.read().splitlines()
with open(f'y_test.txt') as f:
    y_test = f.read().splitlines()

In [10]:
import pandas as pd
# combine x_train and y_train into one dataframe
train_df = pd.DataFrame({'text': x_train, 'label': y_train})

#combine x_test and y_test into one dataframe
test_df = pd.DataFrame({'text': x_test, 'label': y_test})

In [11]:
# T: Please use again the train/test data that includes English, German, Dutch, Danish, Swedish and Norwegian, plus 20 additional languages of your choice (the labels can be found in the file labels.csv)
# and adjust the train/test split if needed
from sklearn.model_selection import train_test_split
all_df = pd.DataFrame({'text': x_train + x_test, 'label': y_train + y_test})
train_df_re, test_df_re = train_test_split(all_df, test_size=0.2, stratify=all_df['label'])
train_df_re, valid_df = train_test_split(train_df_re, test_size=0.2, stratify=train_df_re['label'])

In [12]:
#lables
decided_labels = ['eng', 'deu', 'nld', 'dan', 'swe', 'nno', 'jpn']

additional_labels = ['lzh', 'kor', 'ell', 'fra', 'isl',
                     'ita', 'pol', 'spa', 'rus', 'ara',
                     'ang', 'fas', 'lat', 'por', 'hye',
                     'tur', 'chr', 'ind',  'zea', 'hat']

selected_labels = decided_labels + additional_labels

train_subset = train_df_re[train_df_re['label'].isin(selected_labels)]
valid_subset = valid_df[valid_df['label'].isin(selected_labels)]
test_subset = test_df_re[test_df_re['label'].isin(selected_labels)]

In [13]:
train_subset = train_subset.copy()
valid_subset = valid_subset.copy()
test_subset = test_subset.copy()

In [14]:
#Clean text
train_subset['text'] = train_subset['text'].str.lower()
valid_subset['text'] = valid_subset['text'].str.lower()
test_subset['text'] = test_subset['text'].str.lower()

In [15]:
#Clean text
def remove_punctuation_and_numbers(text):
    return re.sub(r'[^\w\s]|_', '', text)

train_subset['text'] = train_subset['text'].apply(remove_punctuation_and_numbers)
valid_subset['text'] = valid_subset['text'].apply(remove_punctuation_and_numbers)
test_subset['text'] = test_subset['text'].apply(remove_punctuation_and_numbers)

In [16]:
#Label encode
from sklearn.preprocessing import LabelEncoder
le_fitted = LabelEncoder().fit(train_subset['label'])

y_train_subset = le_fitted.transform(train_subset['label'])
y_valid_subset = le_fitted.transform(valid_subset['label'])
y_test_subset = le_fitted.transform(test_subset['label'])

Original version

In [1]:
# T: In the following, you can find a small (almost) working example of a neural network. Unfortunately, again, the cat messed up some of the code. Please fix the code such that it is executable.

In [17]:
# First, we extract some simple features as input for the neural network

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer='char', ngram_range=(2,2), max_features=100, binary=True)
X = vectorizer.fit_transform(train_subset['text'].to_numpy())

In [18]:
# Validation dataset
import skorch
from skorch.helper import predefined_split
from skorch.dataset import Dataset
X_valid = vectorizer.transform(valid_subset['text'].to_numpy())
X_valid = X_valid.astype(np.float32)
y_valid = y_valid_subset.astype(np.int64)
valid_ds = Dataset(X_valid, y_valid)

In [19]:
X = X.astype(np.float32)
y = y_train_subset.astype(np.int64)

In the following, we define a vanilla neural network with two hidden layers. The output layer should have as many outputs as there are classes. In addition, it should have a nonlinearity function.

In [20]:
class ClassifierModule(nn.Module):
    def __init__(
            self,
            num_units=200,
            nonlin=F.relu,
            num_classes=27
    ):
        super(ClassifierModule, self).__init__()
        self.num_units = num_units

        self.dense0 = nn.Linear(100, num_units)
        self.nonlin = nonlin
        self.dense1 = nn.Linear(num_units, 50)
        self.output = nn.Linear(50, num_classes)

    def forward(self, X, **kwargs):
        X = self.nonlin(self.dense0(X))
        X = F.relu(self.dense1(X))
        X = self.output(X)
        return X

In [21]:
X = X.astype(np.float32)
y = y_train_subset.astype(np.int64)

In [22]:
# Use CPU
net = NeuralNetClassifier(
    ClassifierModule,
    max_epochs=20,
    criterion=nn.CrossEntropyLoss(),
    lr=0.1,
    train_split=predefined_split(valid_ds),
    device = 'cuda' if torch.cuda.is_available() else 'cpu' # comment this to train with CPU
)

In [23]:
net.fit(X, y)

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m3.0073[0m       [32m0.2785[0m        [35m2.6321[0m  1.9663
      2        [36m2.1372[0m       [32m0.4315[0m        [35m1.7380[0m  1.7973
      3        [36m1.4961[0m       [32m0.5338[0m        [35m1.3481[0m  2.2392
      4        [36m1.2529[0m       [32m0.5745[0m        [35m1.1875[0m  2.9329
      5        [36m1.1380[0m       [32m0.6007[0m        [35m1.1044[0m  3.3591
      6        [36m1.0712[0m       [32m0.6322[0m        [35m1.0563[0m  1.8145
      7        [36m1.0253[0m       [32m0.6410[0m        [35m1.0235[0m  1.7978
      8        [36m0.9895[0m       [32m0.6486[0m        [35m0.9984[0m  1.7899
      9        [36m0.9608[0m       [32m0.6528[0m        [35m0.9810[0m  1.7985
     10        [36m0.9387[0m       [32m0.6551[0m        [35m0.9694[0m  1.7577
     11        [36m0.9220[0m       [32m0.65

<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=ClassifierModule(
    (dense0): Linear(in_features=100, out_features=200, bias=True)
    (dense1): Linear(in_features=200, out_features=50, bias=True)
    (output): Linear(in_features=50, out_features=27, bias=True)
  ),
)

Max_features_changed

In [None]:
# Changed the max_features to 1000
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer='char', ngram_range=(2,2), max_features=1000, binary=True)
X = vectorizer.fit_transform(train_subset['text'].to_numpy())

In [None]:
# Validation dataset
import skorch
from skorch.helper import predefined_split
from skorch.dataset import Dataset
X_valid = vectorizer.transform(valid_subset['text'].to_numpy())
X_valid = X_valid.astype(np.float32)
y_valid = y_valid_subset.astype(np.int64)
valid_ds = Dataset(X_valid, y_valid)

In [None]:
X = X.astype(np.float32)
y = y_train_subset.astype(np.int64)

In [None]:
#Check shape
print(X.shape)

(21600, 100)


In [None]:
#Check num_classes
print(np.unique(y))
n_classes = len(np.unique(y))

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26]


In [None]:
#Changed the num_classes, and self.dense0 == nn.Linear(100, num_units) based on the changing of max_features
class ClassifierModule(nn.Module):
    def __init__(
            self,
            num_units=200,
            nonlin=F.relu,
            num_classes=27
    ):
        super(ClassifierModule, self).__init__()
        self.num_units = num_units

        self.dense0 = nn.Linear(1000, num_units)
        self.nonlin = nonlin
        self.dense1 = nn.Linear(num_units, 50)
        self.output = nn.Linear(50, num_classes)

    def forward(self, X, **kwargs):
        X = self.nonlin(self.dense0(X))
        X = F.relu(self.dense1(X))
        X = self.output(X)
        return X

In [None]:

net = NeuralNetClassifier(
    ClassifierModule,
    max_epochs=20,
    criterion=nn.CrossEntropyLoss(),
    lr=0.1,
    train_split=predefined_split(valid_ds),
    device = 'cuda' if torch.cuda.is_available() else 'cpu' # comment this to train with CPU
)

In [None]:
net.fit(X, y)

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m2.6837[0m       [32m0.6813[0m        [35m1.4990[0m  3.6681
      2        [36m0.8702[0m       [32m0.8977[0m        [35m0.4790[0m  5.0055
      3        [36m0.3489[0m       [32m0.9551[0m        [35m0.2761[0m  4.4568
      4        [36m0.2234[0m       [32m0.9616[0m        [35m0.2184[0m  3.4463
      5        [36m0.1766[0m       [32m0.9641[0m        [35m0.1904[0m  3.2655
      6        [36m0.1484[0m       0.9630        [35m0.1723[0m  5.6223
      7        [36m0.1274[0m       [32m0.9644[0m        [35m0.1599[0m  3.6078
      8        [36m0.1116[0m       0.9644        [35m0.1517[0m  3.4585
      9        [36m0.0995[0m       [32m0.9650[0m        [35m0.1464[0m  3.4328
     10        [36m0.0901[0m       [32m0.9667[0m        [35m0.1428[0m  4.4043
     11        [36m0.0824[0m       [32m0.9669[0m        [35

<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=ClassifierModule(
    (dense0): Linear(in_features=1000, out_features=200, bias=True)
    (dense1): Linear(in_features=200, out_features=50, bias=True)
    (output): Linear(in_features=50, out_features=27, bias=True)
  ),
)

In [None]:
#Weights Initialization
def weights_init(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            nn.init.zeros_(m.bias)

In [None]:
net = skorch.NeuralNetClassifier(
    ClassifierModule,
    max_epochs=20,
    criterion=nn.CrossEntropyLoss(),
    lr=0.1,
    train_split=predefined_split(valid_ds),
    device='cuda' if torch.cuda.is_available() else 'cpu',
    callbacks=[('weights_init', skorch.callbacks.Initializer('*.*', fn=weights_init))]
    )

In [None]:
net.fit(X,y)

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m2.7115[0m       [32m0.7461[0m        [35m1.4827[0m  4.2845
      2        [36m0.8410[0m       [32m0.8831[0m        [35m0.4673[0m  3.3845
      3        [36m0.3558[0m       [32m0.9556[0m        [35m0.2798[0m  3.5473
      4        [36m0.2285[0m       [32m0.9576[0m        [35m0.2188[0m  4.5250
      5        [36m0.1777[0m       [32m0.9600[0m        [35m0.1905[0m  3.3483
      6        [36m0.1486[0m       [32m0.9616[0m        [35m0.1725[0m  3.3457
      7        [36m0.1280[0m       [32m0.9625[0m        [35m0.1599[0m  4.2898
      8        [36m0.1127[0m       [32m0.9632[0m        [35m0.1509[0m  3.5599
      9        [36m0.1009[0m       [32m0.9634[0m        [35m0.1445[0m  3.4065
     10        [36m0.0915[0m       [32m0.9641[0m        [35m0.1396[0m  3.3097
     11        [36m0.0838[0m       [32m0.96

<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=ClassifierModule(
    (dense0): Linear(in_features=1000, out_features=200, bias=True)
    (dense1): Linear(in_features=200, out_features=50, bias=True)
    (output): Linear(in_features=50, out_features=27, bias=True)
  ),
)

In [None]:
# Add earlystop
from skorch.callbacks import EarlyStopping
net = skorch.NeuralNetClassifier(
    ClassifierModule,
    max_epochs=20,
    criterion=nn.CrossEntropyLoss(),
    lr=0.1,
    train_split=predefined_split(valid_ds),
    device='cuda' if torch.cuda.is_available() else 'cpu',
    callbacks=[
        ('weights_init', skorch.callbacks.Initializer('*.*', fn=weights_init)),
        ('early_stopping', EarlyStopping(patience=5))
    ],
    verbose=0
)

In [None]:
#Gridsearch
from sklearn.model_selection import GridSearchCV
param_grid = {
    'module__num_units': [100, 200, 300],  # Layer Sizes
    'module__nonlin': [F.relu, torch.sigmoid],  # Activation Functions
    'lr': [1, 0.1],  # Learning Rate
    'max_epochs': [10, 20],  # Epochs
    'optimizer': [torch.optim.SGD, torch.optim.Adam],# Optimizers

}

gs = GridSearchCV(net, param_grid, refit=True, cv=3, scoring='accuracy')
gs.fit(X, y)

print("Best parameters set found on development set:")
print(gs.best_params_)


Best parameters set found on development set:
{'lr': 0.1, 'max_epochs': 20, 'module__nonlin': <function relu at 0x7a6a85279b40>, 'module__num_units': 200, 'optimizer': <class 'torch.optim.sgd.SGD'>}


In [None]:
# Update parameters
net = skorch.NeuralNetClassifier(
    ClassifierModule,

    criterion=nn.CrossEntropyLoss(),

    module__num_units=200,
    module__nonlin=F.relu,
    lr=0.1,
    max_epochs=20,
    optimizer=torch.optim.SGD,

    train_split=predefined_split(valid_ds),
    device='cuda' if torch.cuda.is_available() else 'cpu',
    callbacks=[
        ('weights_init', skorch.callbacks.Initializer('*.*', fn=weights_init)),
        ('early_stopping', EarlyStopping(patience=5))
    ]
)


In [None]:
net.fit(X,y)

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m2.7510[0m       [32m0.6664[0m        [35m1.6341[0m  3.5182
      2        [36m0.9239[0m       [32m0.8609[0m        [35m0.5146[0m  3.7639
      3        [36m0.3751[0m       [32m0.9512[0m        [35m0.2975[0m  3.9592
      4        [36m0.2393[0m       [32m0.9535[0m        [35m0.2326[0m  3.4345
      5        [36m0.1848[0m       [32m0.9574[0m        [35m0.2008[0m  3.4058
      6        [36m0.1539[0m       [32m0.9588[0m        [35m0.1806[0m  4.3953
      7        [36m0.1320[0m       [32m0.9597[0m        [35m0.1666[0m  3.4007
      8        [36m0.1156[0m       [32m0.9618[0m        [35m0.1571[0m  3.3969
      9        [36m0.1031[0m       [32m0.9632[0m        [35m0.1505[0m  3.8815
     10        [36m0.0933[0m       [32m0.9639[0m        [35m0.1457[0m  4.0868
     11        [36m0.0853[0m       [32m0.96

<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=ClassifierModule(
    (dense0): Linear(in_features=1000, out_features=200, bias=True)
    (dense1): Linear(in_features=200, out_features=50, bias=True)
    (output): Linear(in_features=50, out_features=27, bias=True)
  ),
)

In [None]:
# Evaluate the result
X_train = vectorizer.transform(train_subset['text'].to_numpy()).astype(np.float32)
X_test = vectorizer.transform(test_subset['text'].to_numpy()).astype(np.float32)

In [None]:
train_accuracy = net.score(X_train, y_train_subset)
valid_accuracy = net.score(X_valid, y_valid_subset)
test_accuracy = net.score(X_test, y_test_subset)

print(f"Training Accuracy: {train_accuracy*100:.2f}%")
print(f"Validation Accuracy: {valid_accuracy*100:.2f}%")
print(f"Test Accuracy: {test_accuracy*100:.2f}%")

Training Accuracy: 98.99%
Validation Accuracy: 96.50%
Test Accuracy: 96.41%
