In [2]:
from sklearn.model_selection import train_test_split
from google.colab import files
from googleapiclient.discovery import build
import io, os
from googleapiclient.http import MediaIoBaseDownload
from google.colab import auth
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

###Loading Kaggle dataset

In [3]:
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"gogagor","key":"3d858e77c55c31ef67bc65b261ced2b2"}'}

In [4]:
!pip install kaggle



In [5]:
auth.authenticate_user()

drive_service = build('drive', 'v3')
results = drive_service.files().list(
        q="name = 'kaggle.json'", fields="files(id)").execute()
kaggle_api_key = results.get('files', [])

filename = "/content/.kaggle/kaggle.json"
os.makedirs(os.path.dirname(filename), exist_ok=True)

request = drive_service.files().get_media(fileId=kaggle_api_key[0]['id'])
fh = io.FileIO(filename, 'wb')
downloader = MediaIoBaseDownload(fh, request)
done = False
while done is False:
    status, done = downloader.next_chunk()
    print("Download %d%%." % int(status.progress() * 100))
os.chmod(filename, 600)

Download 100%.


In [6]:
!mv .kaggle /root/

In [7]:
!kaggle datasets download -d shoumikgoswami/annotated-gmb-corpus

Downloading annotated-gmb-corpus.zip to /content
  0% 0.00/462k [00:00<?, ?B/s]
100% 462k/462k [00:00<00:00, 68.4MB/s]


In [8]:
!ls

adc.json  annotated-gmb-corpus.zip  kaggle.json  sample_data


In [9]:
!unzip annotated-gmb-corpus.zip

Archive:  annotated-gmb-corpus.zip
  inflating: GMB_dataset.txt         


In [10]:
!ls

adc.json  annotated-gmb-corpus.zip  GMB_dataset.txt  kaggle.json  sample_data


In [11]:
my_data = pd.read_csv('GMB_dataset.txt', sep = '\t',encoding='latin1')
my_data = my_data.set_index('Unnamed: 0')

In [12]:
n_s = 1.0
data = [] 
targets = []
words = []
for index, row in my_data.iterrows():
  if n_s == row['Sentence #']:
    words.append(row['Word'])
    targets.append(row['POS']) 
  else:
    n_s = row['Sentence #']
    data.append((words, targets))
    targets = []
    words = []

###Preparing training data

In [13]:
word_to_ix = {}
tag_to_ix = {}
for sent, tags in data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
    for tag in tags:
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)

In [14]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [15]:
train_data, test_data = train_test_split(data, test_size = 0.3)

###Create LSTM model

In [16]:
torch.manual_seed(1)
EMBEDDING_DIM = 64
HIDDEN_DIM = 64

In [17]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        self.hidden = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        char_space = self.hidden(lstm_out.view(len(sentence), -1))
        target_scores = F.log_softmax(char_space, dim=1)
        return target_scores

In [18]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix)).cuda()
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

for epoch in range(30): 
    print('Epoch:' , epoch)
    losses = []
    for sentence, tags in train_data:

        model.zero_grad()

        # Tensors of word indices
        sent_in = prepare_sequence(sentence, word_to_ix).cuda()
        targets = prepare_sequence(tags, tag_to_ix).cuda()

        # Run the forward pass
        target_scores = model(sent_in)

        # Compute the loss, gradients, and update the parameters
        loss = loss_function(target_scores, targets)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print("Loss: {:.4f}".format(loss.item()))

Epoch: 0
Loss: 1.4819
Epoch: 1
Loss: 0.9427
Epoch: 2
Loss: 0.6762
Epoch: 3
Loss: 0.5767
Epoch: 4
Loss: 0.5194
Epoch: 5
Loss: 0.4787
Epoch: 6
Loss: 0.4580
Epoch: 7
Loss: 0.4468
Epoch: 8
Loss: 0.4238
Epoch: 9
Loss: 0.3915
Epoch: 10
Loss: 0.3587
Epoch: 11
Loss: 0.3257
Epoch: 12
Loss: 0.2900
Epoch: 13
Loss: 0.2550
Epoch: 14
Loss: 0.2237
Epoch: 15
Loss: 0.1969
Epoch: 16
Loss: 0.1728
Epoch: 17
Loss: 0.1502
Epoch: 18
Loss: 0.1299
Epoch: 19
Loss: 0.1116
Epoch: 20
Loss: 0.0954
Epoch: 21
Loss: 0.0814
Epoch: 22
Loss: 0.0694
Epoch: 23
Loss: 0.0592
Epoch: 24
Loss: 0.0507
Epoch: 25
Loss: 0.0437
Epoch: 26
Loss: 0.0380
Epoch: 27
Loss: 0.0333
Epoch: 28
Loss: 0.0293
Epoch: 29
Loss: 0.0259


###Evaluating the model

In [21]:
with torch.no_grad():
    predicted_tags = []
    total_tags = 0
    correct_tags = 0
    for sentence, targets in test_data:
        inputs = prepare_sequence(sentence, word_to_ix).cuda()
        target_scores = model(inputs)
        targets = prepare_sequence(targets, tag_to_ix).cuda()
        predicted_tags = [ts.index(max(ts)) for ts in target_scores.tolist()]
        total_tags += len(targets)
        for i,tag in enumerate(predicted_tags):
            if tag == targets[i]:
                correct_tags += 1
    print('Accuracy: ', correct_tags / total_tags)

Accuracy:  0.8510883818515604


#####10 epochs - 83%
#####15 epochs - 84.53%
#####20 epochs - 84.87%
###25 epochs - 85.65%
#####35 epochs - 85.24%
