In [1]:
# import json
# data = json.load(open('RR_Train.json'))

In [3]:
import pandas as pd
data = pd.read_csv('../../All_Data/Rhetorical_Role_Dataset/legal.csv')

In [3]:
data.columns

Index(['Text', 'Label', 'Type'], dtype='object')

In [4]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

stop_words = set(stopwords.words('english'))
bow_labels = {}

for i,row in data.iterrows():
  if row['Label'] not in bow_labels:
    bow_labels[row['Label']] = {}
  labelled_text = row['Text'].translate(str.maketrans('', '', string.punctuation))
  labelled_text_words = word_tokenize(labelled_text)
  for word in labelled_text_words:
    if word not in stop_words:
      if word not in bow_labels[row['Label']].keys():
        bow_labels[row['Label']][word] = 1
      else:
        bow_labels[row['Label']][word] += 1

Creates a dictionary of rhetorical role labels and the corresponding words appearing in the labelled sentences with their frequencies.

In [6]:
print(bow_labels["PREAMBLE"])
print(len(bow_labels))

{'IN': 199, 'THE': 488, 'HIGH': 106, 'COURT': 189, 'OF': 675, 'KARNATAKA': 61, 'CIRCUIT': 2, 'BENCH': 40, 'AT': 106, 'GULBARGADATED': 1, 'THIS': 114, '22ND': 3, 'DAY': 66, 'FEBRUARY': 6, '2013': 18, 'BEFORETHE': 8, 'HONBLE': 79, 'MRJUSTICE': 20, 'ANAND': 8, 'BYRAREDDY': 6, 'CRIMINAL': 96, 'APPEAL': 87, 'NO3532': 1, '2012BETWEEN': 1, 'RBabu': 1, 'So': 103, 'Siddappa': 3, 'APPELLANTAge': 1, '30': 15, 'Years': 14, 'Occ': 36, 'HouseholdRo': 3, 'Sunthan': 1, 'VillageTaluk': 1, 'ChincholiDistrict': 1, 'GulbargaBy': 1, 'Shri': 60, 'Ishwar': 1, 'Raj': 3, 'SChowdapur': 1, 'AdvocateANDThe': 1, 'State': 191, 'Karnataka': 30, 'RESPONDENTThrough': 2, 'Ratkal': 1, 'Police': 64, 'StationRepresented': 6, 'Additional': 17, 'StatePublic': 2, 'Prosecutor': 25, 'Circuit': 3, 'BenchGulbargaBy': 1, 'SSAspalli': 1, 'Government': 62, 'Pleader': 5, 'This': 64, 'Criminal': 106, 'Appeal': 47, 'filed': 65, 'Section': 105, '3742': 7, 'theCode': 6, 'Procedure': 47, '1973': 26, 'advocate': 6, 'theappellant': 20, 'pr

In [7]:
import heapq

freq_words_label = {}
for label in bow_labels.keys():
  freq_words = [(word, bow_labels[label][word]) for word in heapq.nlargest(30, bow_labels[label], key=bow_labels[label].get)]
  freq_words_label[label] = freq_words

Stores 30 most frequent words of each label in dictionary freq_words_label.

In [8]:
print(freq_words_label["PREAMBLE"])

[('OF', 675), ('The', 510), ('THE', 488), ('Court', 294), ('Act', 274), ('years', 217), ('1', 212), ('IN', 199), ('State', 191), ('2', 190), ('COURT', 189), ('appellant', 177), ('case', 164), ('v', 154), ('High', 154), ('3', 134), ('Sri', 133), ('Mr', 132), ('AND', 127), ('Commissioner', 127), ('appeal', 121), ('THIS', 114), ('Incometax', 114), ('JUDGMENT', 110), ('J', 108), ('HIGH', 106), ('AT', 106), ('Criminal', 106), ('TAX', 106), ('made', 106)]


In [9]:
unique_words = set()
for words_list in freq_words_label.values():
  words = [item[0] for item in words_list]
  unique_words.update(words)
print(len(unique_words))

159


In [10]:
document_frequencies = {}
for words_list in freq_words_label.values():
  for item in words_list:
    if item[0] not in document_frequencies:
      document_frequencies[item[0]] = 0
    document_frequencies[item[0]] += 1

In [11]:
print(document_frequencies)

{'The': 13, 'accused': 9, 'case': 12, 'Court': 13, 'evidence': 5, 'Act': 11, 'In': 7, 'also': 6, 'It': 7, 'Section': 11, 'made': 8, 'said': 7, 'would': 5, 'stated': 2, 'prosecution': 4, 'assessee': 6, 'order': 10, 'High': 9, '1': 8, 'tax': 5, 'appellant': 7, 'may': 3, '2': 8, 'house': 2, 'one': 2, 'section': 4, 'time': 3, 'PW1': 1, 'question': 5, 'view': 3, 'learned': 4, 'submitted': 2, 'counsel': 3, 'petitioner': 5, 'State': 6, 'He': 3, 'contended': 2, 'Vs': 1, 'therefore': 3, 'Learned': 1, 'upon': 2, 'India': 1, 'respondent': 1, 'hand': 1, 'court': 4, 'suit': 1, 'bail': 2, 'deceased': 2, 'filed': 2, 'On': 1, 'dated': 3, 'appeal': 4, '3': 4, 'No': 3, 'complainant': 1, 'persons': 1, 'Whether': 1, 'facts': 2, 'law': 3, '4': 1, 'circumstances': 2, 'whether': 2, 'following': 2, 'Tribunal': 2, 'Ltd': 3, 'consideration': 1, 'proves': 1, 'holding': 1, 'committed': 1, 'place': 1, 'entitled': 1, 'points': 1, 'Text': 1, 'Signature': 1, 'J': 2, 'Appeal': 2, 'List': 1, 'Judgment': 1, 'JUDGE': 1, 

In [12]:
from collections import Counter
import numpy as np

tf_idf_weights = {}
i = 0
for words_list in freq_words_label.values():
  words_dict = {key: value for key, value in words_list}
  for word in unique_words:
    document_frequency = 1 if document_frequencies.get(word) == None else document_frequencies[word]
    tf = 0 if word not in words_dict else 1 + np.log2(words_dict[word])
    idf = np.log2(len(freq_words_label)/document_frequency)
    tf_idf_weights[i, word] = tf * idf
  i += 1


In [13]:
print(tf_idf_weights)

{(0, 'appellant'): 9.826548487290914, (0, 'two'): 0.0, (0, 'offence'): 0.0, (0, 'learned'): 0.0, (0, 'question'): 14.22656971647795, (0, 'delivered'): 0.0, (0, 'Bombay'): 0.0, (0, 'points'): 0.0, (0, 'persons'): 0.0, (0, 'shall'): 0.0, (0, 'trial'): 0.0, (0, 'subsection'): 0.0, (0, 'submitted'): 0.0, (0, 'allowed'): 0.0, (0, 'person'): 0.0, (0, 'reads'): 0.0, (0, 'held'): 0.0, (0, 'On'): 0.0, (0, 'said'): 10.400879436282183, (0, 'section'): 17.513265528373136, (0, 'Commissioner'): 0.0, (0, 'read'): 0.0, (0, 'JUDGE'): 0.0, (0, 'Judge'): 0.0, (0, 'Penal'): 0.0, (0, 'set'): 0.0, (0, 'Whether'): 0.0, (0, 'judgment'): 0.0, (0, 'TAX'): 0.0, (0, 'decision'): 0.0, (0, 'SCC'): 0.0, (0, 'time'): 21.495924775940054, (0, 'IN'): 0.0, (0, 'income'): 0.0, (0, 'Criminal'): 0.0, (0, 'appeal'): 0.0, (0, 'declaration'): 0.0, (0, 'List'): 0.0, (0, 'also'): 13.267003346943907, (0, 'made'): 8.398922961457709, (0, 'hand'): 0.0, (0, 'J'): 0.0, (0, 'AND'): 0.0, (0, 'Bench'): 0.0, (0, 'cheque'): 0.0, (0, 'COURT

In [14]:
counter = Counter(tf_idf_weights)
highest_weight_100 = counter.most_common(100)
highest_weight_100 = dict(highest_weight_100)

print(highest_weight_100)

{(9, 'THE'): 37.80984168183129, (0, 'PW1'): 36.63456635510056, (3, 'On'): 36.36308278265473, (4, 'Whether'): 33.72547754129168, (3, 'complainant'): 33.581551785980395, (3, 'persons'): 33.48346253536957, (9, 'years'): 33.358335209707356, (9, 'IN'): 32.88269525899075, (6, 'Signature'): 32.742920488967556, (9, 'COURT'): 32.59949583770853, (12, 'dismissed'): 30.751303617983112, (9, 'Sri'): 30.669319229369624, (13, 'person'): 30.669319229369624, (9, 'Mr'): 30.627863545310657, (11, 'imprisonment'): 30.501585522657756, (9, 'AND'): 30.415757962948213, (12, 'allowed'): 29.87056516141703, (9, 'THIS'): 29.822592349365983, (9, 'Incometax'): 29.822592349365983, (6, 'List'): 29.72537119074668, (9, 'JUDGMENT'): 29.626398205588863, (9, 'TAX'): 29.422935979950832, (9, 'AT'): 29.422935979950832, (9, 'HIGH'): 29.422935979950832, (12, 'set'): 29.37087064164692, (8, 'SCC'): 29.318307062872357, (6, 'Judgment'): 29.21164638987467, (9, 'OF'): 29.19296428677815, (6, 'JUDGE'): 28.763001048556358, (0, 'stated'):

In [15]:
highest_weight_words = []
for i in highest_weight_100.keys():
  highest_weight_words.append(i[1])

highest_weight_words = list(set(highest_weight_words))
highest_weight_words.append("ALL CAPS")
print(highest_weight_words)

['delivered', 'points', 'persons', 'shall', 'trial', 'subsection', 'submitted', 'allowed', 'person', 'reads', 'On', 'read', 'JUDGE', 'set', 'Penal', 'Whether', 'TAX', 'SCC', 'IN', 'income', 'List', 'hand', 'J', 'AND', 'COURT', 'must', 'goods', 'accordingly', 'present', 'convicted', 'period', 'K', 'Code', 'opinion', 'Judgment', 'filed', 'Government', 'Sessions', 'company', 'THE', 'AT', 'We', 'imprisonment', 'society', 'N', 'IPC', 'HIGH', 'b', 'entitled', 'PW1', 'JURISDICTION', 'THIS', '4', 'dismissed', 'Indian', 'cooperative', 'Signature', 'Vs', 'fine', 'death', 'PW', 'Incometax', 'proves', 'place', 'stated', 'hereby', 'clause', 'suit', 'failed', 'year', 'sentenced', 'S', 'Learned', 'OF', 'JUDGMENT', 'sentence', 'holding', 'Sd', 'Appeal', 'years', 'aside', 'deceased', 'committed', 'house', 'Mr', 'APPELLATE', 'may', 'one', 'Sri', 'complainant', 'consideration', 'result', 'respondent', 'costs', 'Civil', 'India', 'ALL CAPS']


highest_weight_words = ['delivered', 'points', 'persons', 'shall', 'trial', 'subsection', 'submitted', 'allowed', 'person', 'reads', 'On', 'read', 'JUDGE', 'set', 'Penal', 'Whether', 'TAX', 'SCC', 'IN', 'income', 'List', 'hand', 'J', 'AND', 'COURT', 'must', 'goods', 'accordingly', 'present', 'convicted', 'period', 'K', 'Code', 'opinion', 'Judgment', 'filed', 'Government', 'Sessions', 'company', 'THE', 'AT', 'We', 'imprisonment', 'society', 'N', 'IPC', 'HIGH', 'b', 'entitled', 'PW1', 'JURISDICTION', 'THIS', '4', 'dismissed', 'Indian', 'cooperative', 'Signature', 'Vs', 'fine', 'death', 'PW', 'Incometax', 'proves', 'place', 'stated', 'hereby', 'clause', 'suit', 'failed', 'year', 'sentenced', 'S', 'Learned', 'OF', 'JUDGMENT', 'sentence', 'holding', 'Sd', 'Appeal', 'years', 'aside', 'deceased', 'committed', 'house', 'Mr', 'APPELLATE', 'may', 'one', 'Sri', 'complainant', 'consideration', 'result', 'respondent', 'costs', 'Civil', 'India', 'ALL CAPS']

