# ML4NLP1
## Starting Point for Exercise 1, part II

This notebook is supposed to serve as a starting point and/or inspiration when starting exercise 1, part II.

One of the goals of this exercise is to make you acquainted with **skorch**. You will probably need to consult the [documentation](https://skorch.readthedocs.io/en/stable/).

# Installing skorch and loading libraries

In [None]:
import subprocess
import sys

# Installation on Google Colab
try:
    # Imported only to detect whether the notebook runs inside Google Colab.
    import google.colab  # noqa: F401
except ImportError:
    # Not on Colab: nothing is installed from this cell.
    pass
else:
    # Install with the interpreter that runs this kernel (a bare 'python' on PATH
    # may be a different environment). check=True raises if pip fails instead of
    # letting the later `import skorch` fail with a less helpful error.
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'skorch'], check=True)

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
from skorch import NeuralNetClassifier

In [None]:
# Fix the PyTorch seeds (generic and CUDA-specific) to 0 for repeatable runs.
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(0)

In [None]:
import pandas as pd
import numpy as np
import csv
import re
import string
from collections import defaultdict

## Training a classifier and making predictions

In [None]:
# Download the four dataset files from Google Drive, one `gdown` call per file id.
# Per the recorded cell output they are stored as x_train.txt, x_test.txt,
# y_train.txt and y_test.txt in the working directory (/content in that run).
!gdown 1QP6YuwdKFNUPpvhOaAcvv2Pcp4JMbIRs # x_train
!gdown 1QVo7PZAdiZKzifK8kwhEr_umosiDCUx6 # x_test
!gdown 1QbBeKcmG2ZyAEFB3AKGTgSWQ1YEMn2jl # y_train
!gdown 1QaZj6bI7_78ymnN8IpSk4gVvg-C9fA6X # y_test

Downloading...
From: https://drive.google.com/uc?id=1QP6YuwdKFNUPpvhOaAcvv2Pcp4JMbIRs
To: /content/x_train.txt
100% 64.1M/64.1M [00:01<00:00, 41.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QVo7PZAdiZKzifK8kwhEr_umosiDCUx6
To: /content/x_test.txt
100% 65.2M/65.2M [00:00<00:00, 70.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QbBeKcmG2ZyAEFB3AKGTgSWQ1YEMn2jl
To: /content/y_train.txt
100% 480k/480k [00:00<00:00, 103MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QaZj6bI7_78ymnN8IpSk4gVvg-C9fA6X
To: /content/y_test.txt
100% 480k/480k [00:00<00:00, 128MB/s]


In [None]:
def _read_lines(path):
    """Return the lines of the text file at ``path`` without line terminators."""
    # The encoding is given explicitly because open() otherwise uses a
    # platform-dependent default, which can fail or mis-decode multilingual text.
    # NOTE(review): assumes the downloaded files are UTF-8 -- confirm.
    with open(path, encoding='utf-8') as f:
        return f.read().splitlines()


# One list entry per line: texts in x_*, the matching label strings in y_*.
x_train = _read_lines('x_train.txt')
y_train = _read_lines('y_train.txt')
x_test = _read_lines('x_test.txt')
y_test = _read_lines('y_test.txt')

In [None]:
import pandas as pd

# Pair every text with its label: one frame per split, columns 'text' and 'label'.
train_df = pd.DataFrame(dict(text=x_train, label=y_train))
test_df = pd.DataFrame(dict(text=x_test, label=y_test))

In [None]:
# T: Please use again the train/test data that includes English, German, Dutch, Danish, Swedish and Norwegian, plus 20 additional languages of your choice (the labels can be found in the file labels.csv)
# and adjust the train/test split if needed

import warnings

import pandas as pd
from sklearn.model_selection import train_test_split

# Move a stratified half of the test frame into the training data; the other
# half stays as the held-out test set.
# NOTE: this cell rebinds train_df / test_df, so re-running it without first
# re-running the cell that builds them repeats the split on already-split data.
test_df_to_train, test_df_remaining = train_test_split(
    test_df,
    test_size=0.5,
    random_state=42,
    stratify=test_df['label'],
)

train_df = pd.concat([train_df, test_df_to_train], ignore_index=True)

test_df = test_df_remaining

fixed_languages = ['eng', 'deu', 'nld', 'dan', 'swe', 'nor']
self_selected_languages = ['est', 'tha', 'guj', 'tam', 'vie', 'lat', 'urd', 'por', 'fra', 'rus', 'ara', 'heb', 'hin', 'jpn', 'kor', 'zho', 'spa', 'ita', 'tur', 'ell']
final_language_list = fixed_languages + self_selected_languages

# `isin` silently ignores codes that never occur in the label column, which
# would leave fewer classes than requested. Report such codes explicitly.
available_labels = set(train_df['label'])
missing_labels = [code for code in final_language_list if code not in available_labels]
if missing_labels:
    warnings.warn(
        f"Requested language codes not found in the training labels: {missing_labels}"
    )

# Keep only the rows whose label is one of the selected languages.
sub_traindf = train_df[train_df['label'].isin(final_language_list)]
sub_testdf = test_df[test_df['label'].isin(final_language_list)]


In [None]:
# T: use your adjusted code to encode the labels here
# T: With the following code, we wanted to encode the labels, however, our cat was walking on the keyboard and some of it got changed. Can you fix it?
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to both splits.
le_fitted = LabelEncoder()
le_fitted.fit(sub_traindf['label'])
y_train_dev = le_fitted.transform(sub_traindf['label'])
y_test = le_fitted.transform(sub_testdf['label'])


In [None]:
# T: In the following, you can find a small (almost) working example of a neural network. Unfortunately, again, the cat messed up some of the code. Please fix the code such that it is executable.

In [None]:
# Simple input features for the neural network: binary indicators for
# character bigrams, capped at 1000 features.
from sklearn.feature_extraction.text import CountVectorizer

train_texts = sub_traindf['text'].to_numpy()
vectorizer = CountVectorizer(
    analyzer='char',
    ngram_range=(2, 2),
    max_features=1000,
    binary=True,
)
X = vectorizer.fit_transform(train_texts)

In [None]:
# Cast features to float32 and the encoded labels to int64 in one step.
X, y = X.astype(np.float32), y_train_dev.astype(np.int64)

In the following, we define a vanilla neural network with two hidden layers. The output layer should have as many outputs as there are classes. In addition, it should have a nonlinearity function.

In [None]:
class ClassifierModule(nn.Module):
    """Feed-forward classifier with two hidden layers that returns raw class scores.

    Args:
        num_units: Width of the first hidden layer.
        nonlin: Activation applied after the first hidden layer.
        input_dim: Number of input features (default 1000, the original size).
        hidden_dim: Width of the second hidden layer (default 50).
        num_classes: Number of output scores, one per class (default 25).
    """

    def __init__(
            self,
            num_units=200,
            nonlin=F.relu,
            input_dim=1000,
            hidden_dim=50,
            num_classes=25,
    ):
        super().__init__()
        self.num_units = num_units
        self.nonlin = nonlin

        self.dense0 = nn.Linear(input_dim, num_units)
        self.dense1 = nn.Linear(num_units, hidden_dim)
        self.output = nn.Linear(hidden_dim, num_classes)

    def forward(self, X, **kwargs):
        """Map a (batch, input_dim) tensor to (batch, num_classes) scores."""
        X = self.nonlin(self.dense0(X))
        X = F.relu(self.dense1(X))
        # No squeeze on the class dimension: it is a no-op for more than one
        # class and would drop the class axis for a single-output head.
        return self.output(X)

In [None]:

from sklearn.model_selection import GridSearchCV
from skorch.callbacks import EarlyStopping

# skorch wrapper around ClassifierModule. The criterion is passed as a class so
# that skorch instantiates it itself.
net = NeuralNetClassifier(
    ClassifierModule,
    max_epochs=20,
    criterion=nn.CrossEntropyLoss,
    callbacks=[('earlystopping', EarlyStopping(patience=5))],
    lr=0.1,
    #device='cuda',  # uncomment this line to train on a GPU
)

# Search space: 2 optimizers x 2 activations x 2 layer widths x 2 patience values.
params = {
    'optimizer': [torch.optim.SGD, torch.optim.Adam],
    'module__nonlin':[F.relu, F.tanh],
    'module__num_units': [800,200],
    'callbacks__earlystopping__patience':[5,10]
}

# Note: with an integer `cv` and a classifier, scikit-learn documents that
# GridSearchCV already uses stratified folds for binary/multiclass targets.
gs = GridSearchCV(net, params, refit=True, cv=3, scoring='accuracy', verbose=3)

In [None]:
# Print the feature-matrix shape, then the label-vector shape.
for checked in (X, y_train_dev):
    print(checked.shape)

(18750, 1000)
(18750,)


In [None]:
# Fit on `y`, the explicit int64 copy of the encoded labels prepared together
# with the float32 features, rather than on the raw encoder output whose
# integer dtype is not fixed by this notebook.
gs.fit(X, y)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m2.8769[0m       [32m0.4356[0m        [35m2.4801[0m  2.2969
      2        [36m1.4343[0m       [32m0.5916[0m        [35m1.3274[0m  2.2242
      3        [36m0.5960[0m       [32m0.8956[0m        [35m0.4880[0m  2.3504
      4        [36m0.3404[0m       [32m0.9496[0m        [35m0.3138[0m  2.9736
      5        [36m0.2463[0m       [32m0.9556[0m        [35m0.2495[0m  2.1917
      6        [36m0.1972[0m       [32m0.9592[0m        [35m0.2166[0m  2.1699
      7        [36m0.1673[0m       [32m0.9604[0m        [35m0.1973[0m  2.2055
      8        [36m0.1465[0m       [32m0.9616[0m        [35m0.1844[0m  2.1756
      9        [36m0.1302[0m       [32m0.9624[0m        [35m0.1747[0m  3.2393
     10        [36m0.1164[0m       [32m0.9640[0m        [35m0.16

In [None]:
from sklearn.metrics import classification_report

# Encode the test texts with the vectorizer that was fitted on the training
# texts (transform only, no re-fitting).
X_test = vectorizer.transform(sub_testdf['text'].to_numpy())

X_test = X_test.astype(np.float32)
y_test = y_test.astype(np.int64)
y_pred = gs.predict(X_test)

# Label the report rows with the original language codes instead of the
# encoder's integer indices. `labels` pins the row order to the encoder's
# classes so names and rows stay aligned even if a class is never predicted.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le_fitted.classes_)),
    target_names=le_fitted.classes_,
))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       250
           1       0.97      0.97      0.97       250
           2       0.94      0.96      0.95       250
           3       1.00      0.98      0.99       250
           4       0.86      0.97      0.91       250
           5       0.98      0.97      0.97       250
           6       0.97      0.98      0.98       250
           7       0.99      0.97      0.98       250
           8       1.00      1.00      1.00       250
           9       1.00      0.98      0.99       250
          10       0.96      0.98      0.97       250
          11       0.98      0.76      0.86       250
          12       1.00      0.98      0.99       250
          13       0.95      0.96      0.96       250
          14       0.98      0.99      0.99       250
          15       0.99      0.94      0.97       250
          16       0.98      1.00      0.99       250
          17       0.96    

In [None]:
# Ablation: same model and grid search, but with max_features=100.
class ClassifierModule(nn.Module):
    """Feed-forward classifier with two hidden layers that returns raw class scores.

    This definition rebinds the name ``ClassifierModule``; its default input
    size is 100 to match the ablation feature matrix.

    Args:
        num_units: Width of the first hidden layer.
        nonlin: Activation applied after the first hidden layer.
        input_dim: Number of input features (default 100).
        hidden_dim: Width of the second hidden layer (default 50).
        num_classes: Number of output scores, one per class (default 25).
    """

    def __init__(
            self,
            num_units=200,
            nonlin=F.relu,
            input_dim=100,
            hidden_dim=50,
            num_classes=25,
    ):
        super().__init__()
        self.num_units = num_units
        self.nonlin = nonlin

        self.dense0 = nn.Linear(input_dim, num_units)
        self.dense1 = nn.Linear(num_units, hidden_dim)
        self.output = nn.Linear(hidden_dim, num_classes)

    def forward(self, X, **kwargs):
        """Map a (batch, input_dim) tensor to (batch, num_classes) scores."""
        X = self.nonlin(self.dense0(X))
        X = F.relu(self.dense1(X))
        # No squeeze on the class dimension: it is a no-op for more than one
        # class and would drop the class axis for a single-output head.
        return self.output(X)

# Features are built first so the network's input size can be read from them.
vectorizer2 = CountVectorizer(analyzer='char', ngram_range=(2, 2), max_features=100, binary=True)
X_100 = vectorizer2.fit_transform(sub_traindf['text'].to_numpy())
X_100 = X_100.astype(np.float32)
y = y_train_dev.astype(np.int64)

net2 = NeuralNetClassifier(
    ClassifierModule,
    # Tie the first layer to the actual number of extracted features.
    module__input_dim=X_100.shape[1],
    max_epochs=20,
    # Passed as a class so that skorch instantiates the criterion itself.
    criterion=nn.CrossEntropyLoss,
    callbacks=[('earlystopping', EarlyStopping(patience=5))],
    lr=0.1,
    #device='cuda',  # uncomment this line to train on a GPU
)
params2 = {
    'optimizer': [torch.optim.SGD, torch.optim.Adam],
    'module__nonlin':[F.relu, F.tanh],
    'module__num_units': [800,200],
    'callbacks__earlystopping__patience':[5,10]
}
# Note: with an integer `cv` and a classifier, scikit-learn documents that
# GridSearchCV already uses stratified folds for binary/multiclass targets.
gs2 = GridSearchCV(net2, params2, refit=True, cv=3, scoring='accuracy', verbose=3)
# Fit on the explicit int64 labels prepared above.
gs2.fit(X_100, y)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m2.9187[0m       [32m0.1332[0m        [35m2.8896[0m  2.0389
      2        [36m2.3096[0m       [32m0.2028[0m        [35m2.3778[0m  1.5750
      3        [36m1.8023[0m       [32m0.2696[0m        [35m1.9711[0m  1.6166
      4        [36m1.5523[0m       [32m0.3812[0m        [35m1.7371[0m  1.5557
      5        [36m1.3838[0m       [32m0.4268[0m        [35m1.6145[0m  1.5697
      6        [36m1.2665[0m       [32m0.4736[0m        [35m1.5108[0m  1.5597
      7        [36m1.1934[0m       [32m0.4984[0m        [35m1.4275[0m  1.8590
      8        [36m1.1447[0m       [32m0.5152[0m        [35m1.3647[0m  2.3187
      9        [36m1.1098[0m       [32m0.5428[0m        [35m1.3164[0m  1.5771
     10        [36m1.0829[0m       [32m0.5520[0m        [35m1.27

In [None]:
from sklearn.metrics import classification_report

# Encode the test texts with the 100-feature vectorizer that was fitted on the
# training texts (transform only, no re-fitting).
X_test_2 = vectorizer2.transform(sub_testdf['text'].to_numpy())

X_test_2 = X_test_2.astype(np.float32)
y_test = y_test.astype(np.int64)
y_pred = gs2.predict(X_test_2)

# Label the report rows with the original language codes instead of the
# encoder's integer indices. `labels` pins the row order to the encoder's
# classes so names and rows stay aligned even if a class is never predicted.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le_fitted.classes_)),
    target_names=le_fitted.classes_,
))

              precision    recall  f1-score   support

           0       0.30      0.46      0.36       250
           1       0.90      0.80      0.84       250
           2       0.92      0.78      0.85       250
           3       0.32      0.05      0.09       250
           4       0.82      0.83      0.82       250
           5       0.77      0.96      0.86       250
           6       0.90      0.91      0.90       250
           7       0.45      0.29      0.35       250
           8       0.26      0.06      0.10       250
           9       0.43      0.49      0.46       250
          10       0.92      0.91      0.91       250
          11       0.45      0.04      0.07       250
          12       0.29      0.62      0.39       250
          13       0.83      0.86      0.85       250
          14       0.82      0.91      0.86       250
          15       0.91      0.85      0.88       250
          16       0.24      0.35      0.29       250
          17       0.88    

In [None]:
# Best hyper-parameter combination found by the first grid search (`gs`, fitted
# on the 1000-feature matrix); the 100-feature search `gs2` is not consulted here.
best_parameters = gs.best_params_


In [None]:
# Show the selected hyper-parameter combination.
print(best_parameters)

{'callbacks__earlystopping__patience': 10, 'module__nonlin': <function tanh at 0x7ffa9858af80>, 'module__num_units': 800, 'optimizer': <class 'torch.optim.sgd.SGD'>}
