In [None]:
!pip install torch==1.4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch==1.4
  Downloading torch-1.4.0-cp37-cp37m-manylinux1_x86_64.whl (753.4 MB)
[K     |████████████████████████████████| 753.4 MB 7.7 kB/s 
[?25hInstalling collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 1.11.0+cu113
    Uninstalling torch-1.11.0+cu113:
      Successfully uninstalled torch-1.11.0+cu113
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.12.0+cu113 requires torch==1.11.0, but you have torch 1.4.0 which is incompatible.
torchtext 0.12.0 requires torch==1.11.0, but you have torch 1.4.0 which is incompatible.
torchaudio 0.11.0+cu113 requires torch==1.11.0, but you have torch 1.4.0 which is incompatible.
fastai 2.6.3 requires torch<1.12,>=1.7.0, but you have torch 1.4.0 which 

In [None]:
from collections import defaultdict
from argparse import ArgumentParser
from gensim.models import KeyedVectors

In [None]:
import json
import os
from itertools import groupby

import en_core_web_sm
import numpy as np

In [None]:
import torch
import torch.nn as nn
import torch.nn.utils.rnn as U
import torch.nn.functional as F

In [None]:
import math
from tqdm import tqdm
from collections import defaultdict

In [None]:
from train import *
from models import *
from utils.predict_util import *
from utils.data_util import *
from utils.train_util import prepare_offence

In [None]:
# Run configuration: every value a reader might want to tune lives in this cell.

# --- Input / output locations ---
data_path = "data/"                   # Folder to store dataset
train_file = "Train-Sent.jsonl"       # Train dataset
test_file = "Test-Doc.jsonl"          # Test dataset
label_file = "Labels.jsonl"           # Charge descriptions
save_path = "saved/"                  # Folder to store trained model and metrics
pretrained = "ptembs/word2vec.kv"     # Pretrained word2vec embeddings file
                                      # [embedding dimensions must match!]
                                      # use 'None' for no pretrained initialization
                                      # NOTE(review): not referenced by the cells shown
                                      # in this notebook (ptemb_matrix is set to None).

# --- Vocabulary / model ---
label_wts = True                      # Use weighted loss function
vocab_thresh = 2                      # Min frequency for a word to be included in vocabulary
                                      # NOTE(review): not passed to get_vocab in the cells shown.
embed_dim = 128                       # Embedding dimension

# --- Optimisation ---
epochs = 200                          # Number of training epochs
batch_size = 5                        # Batch size
# Fall back to CPU when no GPU is visible, so later `.to(device)` calls do not
# fail on CPU-only machines.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
lr = 1e-3                             # Learning rate
l2reg = 5e-4                          # L2 Regularization penalty
lr_patience = 5                       # Number of epochs of non-increasing performance 
                                      # to wait before reducing learning rate
                                      # use -1 for fixed learning rate
lr_factor = 0.5                       # Factor to reduce learning rate by
print_every = 1                       # Epoch interval after which metrics will be printed
                                      # NOTE(review): not forwarded to train() in the cells shown.

In [None]:
# Read the three JSONL inputs from `data_path`, in the order
# train -> test -> charge descriptions.
# NOTE(review): the original note says this step also tokenizes the text; that
# would happen inside jsonl_to_list, which is not visible here — confirm.
traindev_data, test_data, label_data = (
    jsonl_to_list(data_path + filename)
    for filename in (train_file, test_file, label_file)
)

120it [00:13,  9.10it/s]
70it [00:08,  8.19it/s]
20it [00:00, 29.16it/s]


In [None]:
# Corpus sizes; both are passed to get_label_weights further down.
num_docs = len(traindev_data)
# Sum the per-document lengths of doc['text'] directly. This gives the same
# count as concatenating every list with sum(lists, []) but without building a
# throwaway list in quadratic time.
num_sents = sum(len(doc['text']) for doc in traindev_data)

In [None]:
print("Creating vocab...")
# Three independent counters (words, sentence-level labels, document-level
# labels); a missing key reads as 0.
word_freq, sent_label_freq, doc_label_freq = (defaultdict(int) for _ in range(3))

Creating vocab...


In [None]:
# Count frequencies over the training facts (all three tables) and over the
# charge descriptions (word table only).
# The return value is discarded, so calc_freq presumably updates the passed
# dicts in place — TODO confirm.
# NOTE(review): if so, re-running this cell without re-creating the tables
# would double-count.
calc_freq(traindev_data, word_freq, sent_label_freq, doc_label_freq)
calc_freq(label_data, word_freq)

In [None]:
# Build the label vocabulary from the charge descriptions and the word
# vocabulary from the accumulated word frequencies.
label_vocab = get_label_vocab(label_data)
# NOTE(review): `vocab_thresh` from the config cell is not passed here —
# confirm whether get_vocab applies its own minimum-frequency default.
vocab = get_vocab(word_freq)
# Passed as the last COClassifier argument. It stays None, so the `pretrained`
# path from the config cell is not loaded in the cells shown — TODO confirm
# that this is intended.
ptemb_matrix = None

In [None]:
# Apply tokenize_dataset to every split (train, test, charge descriptions)
# using the shared word and label vocabularies.
# The call's result is not used, so each dataset is presumably converted in
# place — TODO confirm.
for _split in (traindev_data, test_data, label_data):
    tokenize_dataset(_split, vocab, label_vocab)
del _split  # keep the loop variable out of the notebook namespace

In [None]:
# Label weights for the weighted loss: one tensor from the sentence-level label
# frequencies and one from the document-level ones, both moved to `device`.
# With `label_wts` disabled both are None.
if not label_wts:
    sent_label_wts = doc_label_wts = None
else:
    sent_label_wts, doc_label_wts = (
        torch.from_numpy(get_label_weights(label_vocab, freq_table, total)).to(device)
        for freq_table, total in (
            (sent_label_freq, num_sents),
            (doc_label_freq, num_docs),
        )
    )

In [None]:
# Preparing label data and model
# The result is indexed with the keys 'offence', 'sent_lens' and 'doc_lens'
# when the classifier is constructed.
charges = prepare_offence(label_data)

In [None]:
# Build the classifier from the vocabulary sizes, the prepared charge data,
# the optional label weights and the (optional) pretrained matrix, then move
# it to `device`. Arguments are positional; the order must be kept.
model = COClassifier(
    len(vocab),
    embed_dim,
    len(label_vocab),
    charges['offence'],
    charges['sent_lens'],
    charges['doc_lens'],
    device,
    sent_label_wts,
    doc_label_wts,
    ptemb_matrix,
).to(device)

# AdamW over all model parameters; `l2reg` is applied as the weight decay.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=lr,
    weight_decay=l2reg,
)

In [None]:
# Reduce-on-plateau schedule in 'max' mode (the monitored value is expected to
# increase). Setting lr_patience to -1 keeps the learning rate fixed by
# leaving the scheduler as None.
scheduler = (
    torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='max',
        factor=lr_factor,
        patience=lr_patience,
        verbose=True,
    )
    if lr_patience != -1
    else None
)

In [None]:
# Run training. `metrics` is dumped to JSON and `model1` is written with
# torch.save in the following cell.
# NOTE(review): `test_data` is passed as the third argument and the training
# log labels those columns "Dv-*", so the test split appears to double as the
# dev set — confirm this is intended.
# NOTE(review): `print_every` from the config cell is not forwarded here.
metrics, model1 = train(model, traindev_data, test_data, optimizer, 
    lr_scheduler=scheduler, epochs=epochs, batch_size=batch_size, device=device)

Training...

EPOCH ||  Tr-LOSS |    Tr-F1 ||  Dv-LOSS |     Dv-P     Dv-R    Dv-F1
    1 ||   2.8759 |   0.1262 ||   1.2815 |   0.1015   0.6146   0.1231
    2 ||   2.7575 |   0.1431 ||   1.2664 |   0.1284   0.6821   0.1887
    3 ||   2.6921 |   0.1400 ||   1.2742 |   0.1463   0.5368   0.1558
    4 ||   2.6646 |   0.1321 ||   1.2522 |   0.1049   0.5524   0.1551
    5 ||   2.6354 |   0.1544 ||   1.2652 |   0.1309   0.5597   0.1979
    6 ||   2.6215 |   0.1547 ||   1.2451 |   0.1613   0.5236   0.1961
    7 ||   2.5453 |   0.1726 ||   1.2345 |   0.1768   0.5415   0.2159
    8 ||   2.5312 |   0.2477 ||   1.2077 |   0.1844   0.6824   0.2549
    9 ||   2.4395 |   0.2166 ||   1.1828 |   0.2136   0.6181   0.2977
   10 ||   2.3500 |   0.2430 ||   1.1347 |   0.2010   0.7150   0.2814
   11 ||   2.2739 |   0.2950 ||   1.1856 |   0.2244   0.6613   0.2738
   12 ||   2.2648 |   0.2792 ||   1.1800 |   0.2454   0.6340   0.3183
   13 ||   2.1529 |   0.3350 ||   1.1001 |   0.2311   0.6762   0.3209
   14 |

In [None]:
# Persist the training metrics and the object returned by train().
# Create the output folder first, so that a missing `save_path` directory does
# not make the write fail only after the full training run has completed.
if save_path:
    os.makedirs(save_path, exist_ok=True)
with open(save_path + "metrics.json", 'w') as fw:
    json.dump(metrics, fw)
# `model1` is restored later through load_state_dict, so it is expected to be
# a state dict rather than a whole module.
torch.save(model1, save_path + "model.pt")

In [None]:
# Rebuild a fresh classifier with the same constructor arguments used for
# training, then restore the saved weights into it.
model = COClassifier(
    len(vocab),
    embed_dim,
    len(label_vocab),
    charges['offence'],
    charges['sent_lens'],
    charges['doc_lens'],
    device,
    sent_label_wts,
    doc_label_wts,
    ptemb_matrix,
).to(device)
# Read the checkpoint from the configured `save_path` (the location the save
# cell writes to) instead of a hard-coded 'saved/' path. `map_location` lets a
# checkpoint saved on GPU load when `device` is CPU.
model.load_state_dict(torch.load(save_path + "model.pt", map_location=device))

<All keys matched successfully>

In [None]:
# Predict charges for every test document. The recorded output shows one list
# of charge names per document (possibly empty).
# NOTE(review): batch_size=5 is a literal here; it equals the configured
# `batch_size` but does not follow it if the config changes.
preds = infer(model, test_data, label_vocab, batch_size=5, device=device)
# Displays the full prediction list as the cell output.
preds

[['criminal conspiracy',
  'cheating',
  'offence against state',
  'forgery',
  'criminal breach of trust'],
 [],
 [],
 ['murder', 'sexual offence', 'kidnapping', 'offence against public justice'],
 ['hurt',
  'murder',
  'unlawful assembly',
  'mischief',
  'theft',
  'criminal intimidation',
  'robbery',
  'criminal trespass'],
 ['hurt',
  'murder',
  'unlawful assembly',
  'mischief',
  'robbery',
  'criminal trespass'],
 ['hurt', 'murder', 'unlawful assembly', 'criminal trespass'],
 ['hurt',
  'murder',
  'criminal conspiracy',
  'unlawful assembly',
  'mischief',
  'offence against public justice',
  'offence against state',
  'robbery'],
 ['marriage offence'],
 ['criminal conspiracy',
  'offence against state',
  'offence related to religion'],
 [],
 [],
 ['hurt',
  'murder',
  'criminal conspiracy',
  'unlawful assembly',
  'mischief',
  'robbery',
  'criminal trespass'],
 ['murder'],
 ['hurt', 'murder'],
 ['sexual offence'],
 ['hurt', 'murder', 'unlawful assembly', 'criminal t

In [None]:
# Hand-written single-document example: four fact sentences, labelled with the
# charge "theft".
td = {
    "text": [
        "On August 25, 1976 at about 12 noon P.W. 1 Syed Ameer, Supervisor, Karnataka Electricity Board, went to the house of the respondent on a routine inspection to check the electric meter installed there.",
        "He found the meter board at the entrance and though the meter was not recording consumption of electric energy, the lights and fans were on.",
        "It appeared that the respondent had tampered with the main connection by fixing two switches to the wall of the house and by operating the switches the lights and fans inside the house could be used without the meter recording any consumption.",
        "Later in the day, he along A with the Assistant Engineer attached to the Karnataka Electricity Board, Krishnarajanagar and the Junior Engineer went to the house of the respondent and saw that there was theft of electric energy.",
    ],
    "doc_labels": ["theft"],
}
# Tokenize each raw sentence, wrap the document in a one-element dataset and
# run it through the same tokenize_dataset step as the corpus.
td['text'] = [tokenize_text(sentence) for sentence in td['text']]
test_one = [td]
tokenize_dataset(test_one, vocab, label_vocab)
# A single document in a batch of one; show the predicted charges for it.
infer(model, test_one, label_vocab, batch_size=1, device=device)[0]

['theft', 'robbery']