<a href="https://colab.research.google.com/github/onlyabhilash/NLP-Code/blob/main/part-9/02_LexNLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## LexNLP
In this notebook we demonstrate how to use [LexNLP](https://lexpredict-lexnlp.readthedocs.io/en/latest/). It is an open-source information retrieval and extraction tool for real, unstructured legal text.

In [None]:
# To install only the requirements of this notebook, uncomment the lines below and run this cell

# ===========================

!pip install lexnlp==1.8.0
!pip install textract==1.6.3

# ===========================



In [None]:
# To install the requirements for the entire chapter, uncomment the lines below and run this cell.
# On Colab the requirements list is downloaded from GitHub and installed one package at a time;
# elsewhere a local "ch10-requirements.txt" file is expected next to the notebook.

# ===========================

# try:
#     import google.colab
#     !curl  https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch10/ch10-requirements.txt | xargs -n 1 -L 1 pip install
# except ModuleNotFoundError:
#     !pip install -r "ch10-requirements.txt"

# ===========================

In [None]:
#Importing the necessary packages
import textract
from pprint import pprint
import pandas as pd

In [None]:
# Upload the sample contract when running on Colab; do nothing elsewhere.
# Only the import is guarded, so an error raised during the upload itself
# is not mistaken for "not running on Colab".
try:
    from google.colab import files
except ModuleNotFoundError:
    print('Not using Colab')
else:
    # Upload 'test - Copy.docx' present in the Data folder.
    test_upload = files.upload()

Saving test - Copy.docx to test - Copy.docx


In [None]:
# Convert the .docx contract to plain text.
# Colab detection is kept separate from the conversion so that a
# ModuleNotFoundError raised inside textract is not silently treated as
# "not on Colab" and retried against the local path.
try:
    import google.colab  # imported only to detect the Colab runtime
except ModuleNotFoundError:
    # Local run: the file lives in the repository's Data folder.
    docx_path = 'Data/test - Copy.docx'
else:
    # Colab run: the file was uploaded into the working directory.
    docx_path = 'test - Copy.docx'

# textract.process returns bytes; decode them into a str.
text = textract.process(docx_path).decode('utf-8')

type(text)

str

## Acts

In [None]:
import lexnlp.extract.en.acts
print("List of acts in the document")

# One dict per act reference found in the contract text.
data_contract = list(lexnlp.extract.en.acts.get_acts(text))

# Let pandas infer the columns from the dict keys. Indexing data_contract[0]
# to read the keys raised IndexError whenever no act was found.
df = pd.DataFrame(data=data_contract)

# Second pass over the text to attach the annotation object for every act.
df['Act_annotations'] = list(lexnlp.extract.en.acts.get_acts_annotations(text))

df.head(10)

List of acts in the document


Unnamed: 0,location_start,location_end,act_name,section,year,ambiguous,value,Act_annotations
0,6233,6264,Securities Exchange Act,,1934.0,False,Securities Exchange Act of 1934,"[act] at (6233..6264), loc: en"
1,6346,6377,Securities Exchange Act,,1934.0,False,Securities Exchange Act of 1934,"[act] at (6346..6377), loc: en"
2,9158,9176,Securities Act,,,False,Securities Act.\n\n“,"[act] at (9158..9176), loc: en"
3,15403,15419,Securities Act,,,False,"Securities Act,","[act] at (15403..15419), loc: en"
4,15691,15707,Securities Act,,,False,"Securities Act,","[act] at (15691..15707), loc: en"
5,15806,15821,Securities Act,,,False,Securities Act,"[act] at (15806..15821), loc: en"


## Amounts

In [None]:
import nltk

# Download the NLTK packages one by one.
for resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource)

import lexnlp.extract.en.amounts

print()
print("Geting all the amounts present in the contract")

amount_extractor = lexnlp.extract.en.amounts

# Extracted amounts and, from a second pass, their annotation objects.
amts = [value for value in amount_extractor.get_amounts(text)]
locs = [annotation for annotation in amount_extractor.get_amount_annotations(text)]

# Pair every amount with its annotation, row by row.
data_amts = [pair for pair in zip(amts, locs)]

df = pd.DataFrame(data_amts, columns=['Amount', 'Location in Contract'])
df.head(20)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



Geting all the amounts present in the contract


Unnamed: 0,Amount,Location in Contract
0,1933.0,"[amount] at (113..119), loc: en"
1,1000000.0,"[amount] at (604..612), loc: en"
2,19.0,"[amount] at (648..651), loc: en"
3,2.0,"[amount] at (651..654), loc: en"
4,2022.0,"[amount] at (654..660), loc: en"
5,1.0,"[amount] at (829..834), loc: en"
6,1.0,"[amount] at (944..949), loc: en"
7,100000.0,"[amount] at (1054..1062), loc: en"
8,1.0,"[amount] at (1114..1116), loc: en"
9,1.0,"[amount] at (1322..1324), loc: en"


## Dates

In [None]:
import lexnlp.extract.en.dates
print("Dates in the Contract")

# Collect every date found in the contract into a one-column table.
date_extractor = lexnlp.extract.en.dates
data_dates = [found for found in date_extractor.get_dates(text)]
df = pd.DataFrame(data_dates, columns=["Date"])
df.head()

Dates in the Contract


Unnamed: 0,Date
0,2022-02-19


## Money

In [None]:
import lexnlp.extract.en.money
print("Money References in the contract")

# Each extracted money reference becomes one row of the table.
money_columns = ["Amount", "Currency"]
data_money = [*lexnlp.extract.en.money.get_money(text)]
df = pd.DataFrame(data_money, columns=money_columns)
df.head()

Money References in the contract


Unnamed: 0,Amount,Currency
0,1000000.0,USD
1,100000.0,USD
2,2.0,USD


## Ratios


In [None]:
import lexnlp.extract.en.ratios
print("Ratios in the Contract")

data_ratios = list(lexnlp.extract.en.ratios.get_ratios(text))

# NOTE(review): per the LexNLP docs get_ratios yields 3-tuples
# (numerator, denominator, decimal ratio); a single "Ratios" column would
# raise as soon as any ratio is found. Confirm against the installed version.
df = pd.DataFrame(data=data_ratios, columns=["Numerator", "Denominator", "Ratio"])
display(df.head())

# Only report the absence of ratios when nothing was actually extracted.
if df.empty:
    print("No ratios in the contract")

Ratios in the Contract


Unnamed: 0,Ratios


No ratios in the contract


## Act Definitions

In [None]:
import lexnlp.extract.en.definitions
print("Different ACT definitions in the contract")

# NOTE(review): the saved output lists defined terms such as "Investor" and
# "Purchase Amount", so the "Acts" label appears to cover all definitions.
definitions_module = lexnlp.extract.en.definitions
data_acts = [term for term in definitions_module.get_definitions(text)]
df = pd.DataFrame(data_acts, columns=["Acts"])
df.head(20)

Different ACT definitions in the contract


Unnamed: 0,Acts
0,SECURITIES ACT
1,Investor
2,Purchase Amount
3,Conversion Amount
4,Cash-Out Amount
5,Capital Stock
6,Change of Control
7,Converting Securities
8,Dissolution Event
9,Dividend Amount


## Copyrights

In [None]:
import lexnlp.extract.en.copyright
print("References to CopyRights")

# Annotation objects for each copyright reference.
data_cp_ann = list(lexnlp.extract.en.copyright.get_copyright_annotations(text))

# Extracted copyright tuples for the same references.
data_cp = list(lexnlp.extract.en.copyright.get_copyright(text))

# The column labels follow the zip order (data first, annotations second);
# they were previously swapped, mislabelling both columns.
df = pd.DataFrame(data=list(zip(data_cp,data_cp_ann)),columns=['CopyRight data','CopyRight Annotations'])
df.head()

References to CopyRights


Unnamed: 0,CopyRight Annotations,CopyRight data
0,"((c), None, Dissolution Event)","Dissolution Event, (3862, 3883)"


## Durations


In [None]:
import lexnlp.extract.en.durations
print("Durations in the Contract")

duration_extractor = lexnlp.extract.en.durations

# Extracted durations and their annotation objects.
data_dur = [*duration_extractor.get_durations(text)]
data_dur_ann = [*duration_extractor.get_duration_annotations_list(text)]

# One row per (duration, annotation) pair.
duration_rows = [row for row in zip(data_dur, data_dur_ann)]
df = pd.DataFrame(duration_rows, columns=['Durations','Duration Annotation'])
df.head()

Durations in the Contract


Unnamed: 0,Durations,Duration Annotation
0,"(hour, 48.0, 2.0)","[duration] at (17772..17781), loc: en"


## Tokens
E.g., nouns, verbs, etc.

In [None]:
import lexnlp.nlp.en.tokens
import nltk

token_tools = lexnlp.nlp.en.tokens

print("Identifying Nouns,Performing Stemming,lemitization,etc")

# First ten nouns reported by the tokenizer.
print("Noun")
first_nouns = list(token_tools.get_nouns(text))[:10]
print(first_nouns)

# First ten stems, produced with NLTK's Lancaster stemmer.
print("Stemming")
lancaster = nltk.stem.lancaster.LancasterStemmer()
first_stems = token_tools.get_stem_list(text, stemmer=lancaster)[:10]
print(first_stems)

print("")

# First ten verbs, requested with lemmatize=True.
print("Lematization")
first_verb_lemmas = list(token_tools.get_verbs(text, lemmatize=True))[:10]
print(first_verb_lemmas)

Identifying Nouns,Performing Stemming,lemitization,etc
Noun
['THIS', 'INSTRUMENT', 'AND', 'ANY', 'SECURITIES', 'ISSUABLE', 'PURSUANT', 'HERETO', 'HAVE', 'NOT']
Stemming
['thi', 'instru', 'and', 'any', 'sec', 'issu', 'pursu', 'hereto', 'hav', 'not']

Lematization
['s', 'describe', 'be', 'agree', 'have', 'modify', 'fill', 'bracket', 'be', 'define']


## Sections

In [None]:
import lexnlp.nlp.en.segments.sections

# Materialise the section generator, then pretty-print every section.
section_texts = [section for section in lexnlp.nlp.en.segments.sections.get_sections(text)]
pprint(section_texts)

['THIS INSTRUMENT AND ANY SECURITIES ISSUABLE PURSUANT HERETO HAVE NOT BEEN '
 'REGISTERED UNDER THE SECURITIES ACT OF 1933, AS AMENDED (THE “SECURITIES '
 'ACT”), OR UNDER THE SECURITIES LAWS OF CERTAIN STATES.THESE SECURITIES MAY '
 'NOT BE OFFERED, SOLD OR OTHERWISE TRANSFERRED, PLEDGED OR HYPOTHECATED '
 'EXCEPT AS PERMITTED IN THIS SAFE AND UNDER THE ACT AND APPLICABLE STATE '
 'SECURITIES LAWS PURSUANT TO AN EFFECTIVE REGISTRATION STATEMENT OR AN '
 'EXEMPTION THEREFROM.\n'
 '\n'
 '\n'
 '\n'
 '[Company Name]\n'
 '\n'
 '\n'
 '\n'
 'SAFE\n'
 '\n'
 '(Simple Agreement for Future Equity)\n'
 '\n'
 'THIS CERTIFIES THAT in exchange for the payment by Bob Marley (the '
 '“Investor”) of $1000000 (the “Purchase Amount”) on or about 19-02-2022, '
 'Anaxia, a space corporation tesla, issues to the Investor the right to '
 'certain shares of the Company’s Capital Stock, subject to the terms '
 'described below.\n'
 '\n'
 'This Safe is one of the forms available at http://ycombinator.com/docum

## Bigram Distribution

In [None]:
import lexnlp.nlp.en.transforms.tokens

# Convert the bigram distribution to a plain dict before pretty-printing it.
bigram_counts = dict(lexnlp.nlp.en.transforms.tokens.get_bigram_distribution(text))
pprint(bigram_counts)

{('$', '100000.'): 1,
 ('$', '1000000'): 1,
 ('%', 'of'): 2,
 ('(', '1'): 1,
 ('(', '2'): 1,
 ('(', 'A'): 3,
 ('(', 'B'): 3,
 ('(', 'C'): 1,
 ('(', 'Signature'): 1,
 ('(', 'Simple'): 1,
 ('(', 'THE'): 1,
 ('(', 'a'): 5,
 ('(', 'and'): 1,
 ('(', 'as'): 1,
 ('(', 'b'): 5,
 ('(', 'but'): 1,
 ('(', 'c'): 4,
 ('(', 'd'): 7,
 ('(', 'e'): 3,
 ('(', 'even'): 1,
 ('(', 'except'): 1,
 ('(', 'excluding'): 1,
 ('(', 'f'): 1,
 ('(', 'g'): 1,
 ('(', 'i'): 15,
 ('(', 'ii'): 16,
 ('(', 'iii'): 7,
 ('(', 'in'): 1,
 ('(', 'including'): 3,
 ('(', 'or'): 2,
 ('(', 's'): 2,
 ('(', 'subject'): 1,
 ('(', 'that'): 1,
 ('(', 'the'): 4,
 ('(', 'to'): 1,
 ('(', 'treating'): 1,
 ('(', 'within'): 1,
 ('(', 'without'): 3,
 ('(', 'x'): 1,
 ('(', 'y'): 1,
 (')', ')'): 2,
 (')', ','): 7,
 (')', '.'): 6,
 (')', '.If'): 1,
 (')', ':'): 3,
 (')', ';'): 2,
 (')', 'All'): 1,
 (')', 'Any'): 2,
 (')', 'Dissolution'): 1,
 (')', 'Equity'): 1,
 (')', 'IN'): 1,
 (')', 'In'): 1,
 (')', 'Junior'): 1,
 (')', 'Liquidation'): 1,
 (')