In [43]:
import spacy
import re
import os
import PyPDF2 as pdf
from pathlib import Path



class NLPProcessor:
    def __init__(self, model_name="en_core_web_sm"):
        self.nlp = spacy.load(model_name, disable=["parser", "ner"])
        
    def _remove_symbols(self, text):
         text = re.sub(r'[^\w]', ' ', text)
         text = re.sub(' +', ' ', text)
         return text

    def tokenize(self, text):
        doc = self.nlp(text)
        tokens = [token.text for token in doc]
        return tokens

    def remove_stopwords(self, tokens):
        doc = self.nlp(" ".join(tokens))
        filtered_tokens = [token.text for token in doc if not token.is_stop]
        return filtered_tokens

    def lemmatize(self, tokens):
        doc = self.nlp(" ".join(tokens))
        lemmas = [token.lemma_ for token in doc]
        return lemmas
    
    @staticmethod       
    def _filter_text(text, n_words=10):
        if len(text.split()) < n_words:
            return ''
        return text
    
    def preprocess_text(self, text):
        # text = self._remove_symbols(text)
        tokens = self.tokenize(text)
        filtered_tokens = self.remove_stopwords(tokens)
        lemmas = self.lemmatize(filtered_tokens)
        preprocessed_text = " ".join(lemmas)
        # preprocessed_text = self._filter_text(text=preprocessed_text)
        return preprocessed_text


def _clause_extract(text, min_words=30):
    processor = NLPProcessor()
    text_list = [x.strip() for x in text.split('\n\n')]
    para_list = []
    for text in text_list:
        processed_text = processor.preprocess_text(text)
        # print(len(processed_text))
        if len(processed_text) > min_words:
            para_list.append(processed_text)
            
    return para_list

res = _clause_extract(data)
len(res)

16

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [74]:
"""spacy pretrained model downlaod: 
python -m spacy download en_core_web_sm
"""
import spacy
import re
import os
import PyPDF2 as pdf
from pathlib import Path
import json
import sys

from langchain_community.vectorstores.chroma import Chroma
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings



class NLPProcessor:
    def __init__(self, model_name="en_core_web_sm"):
        self.nlp = spacy.load(model_name, disable=["parser", "ner"])
        
    def _remove_symbols(self, text):
         text = re.sub(r'[^\w]', ' ', text)
         text = re.sub(' +', ' ', text)
         return text

    def tokenize(self, text):
        doc = self.nlp(text)
        tokens = [token.text for token in doc]
        return tokens

    def remove_stopwords(self, tokens):
        doc = self.nlp(" ".join(tokens))
        filtered_tokens = [token.text for token in doc if not token.is_stop]
        return filtered_tokens

    def lemmatize(self, tokens):
        doc = self.nlp(" ".join(tokens))
        lemmas = [token.lemma_ for token in doc]
        return lemmas
    
    @staticmethod       
    def _filter_text(text, n_words=10):
        if len(text.split()) < n_words:
            return ''
        return text
    
    def preprocess_text(self, text):
        # text = self._remove_symbols(text)
        tokens = self.tokenize(text)
        filtered_tokens = self.remove_stopwords(tokens)
        lemmas = self.lemmatize(filtered_tokens)
        preprocessed_text = " ".join(lemmas)
        # preprocessed_text = self._filter_text(text=preprocessed_text)
        return preprocessed_text


def clause_extract(text, min_words=30):
    processor = NLPProcessor()
    text_list = [x.strip() for x in text.split('\n\n')]
    para_list = []
    for text in text_list:
        processed_text = processor.preprocess_text(text)
        if len(processed_text) > min_words:
            para_list.append(processed_text)
            
    return para_list


class VectorDB:
    def __init__(self, model_id=None) -> None:
        model_id = model_id if model_id else  "all-MiniLM-L6-v2"
        self.embedding = HuggingFaceEmbeddings(model_name=model_id)

    def _init_db(self, text_list):
        self.db = Chroma.from_texts(text_list, embedding=self.embedding)
        
    def query(self, query, topk=1):
        docs = self.db.similarity_search(query, k=topk)
        if len(docs):
            return docs[0].page_content


def _load_contract(file_name, file_path=None):
    if not file_path:
        file_path = 'contracts'
    with open(os.path.join(file_path, file_name), 'r') as f:
        return f.read()


def _load_json(file_name):
    with open(file_name, 'r') as f:
        json_data = json.loads(f.read())
    return json_data


# if __name__ == '__main__':
    "read the contrat and init the vectordb, get user query, and get similar paragraph, use this to do prompt constraction."
contract_path = 'contracts'
contract_file = os.listdir(contract_path)[0]

text = _load_contract(contract_file)
para_list = clause_extract(text=text)
clause_sim_dict = _load_json('./similar_clause_dict.json')

db = VectorDB()
db._init_db(text_list=para_list)

# this should be clause name, 
# todo:at least we should ensure that this key words should be in the text.
query='Notice Period to Terminate Renewal'
if query not in clause_sim_dict:
    print("The clause: {} is not supported! Stop".format(query))
    sys.exit(-1)
else:
    similar_clauses = clause_sim_dict.get(query, [])
    similar_clauses.append(query)
    # to get the query, should also get some related clause names based on the dictionary.
    query = ' & '.join(similar_clauses)
    related_para = db.query(query=query)
    print(related_para)

   

( ) request SEC amendment registration statement , prospectus , statement additional information effect additional information ; ( b ) event issuance SEC stop order suspend effectiveness registration statement , prospectus , statement additional information effect initiation proceeding purpose ; ( c ) happen event make untrue statement material fact registration statement , prospectus , statement additional information effect require make change registration statement , prospectus , statement additional information order statement mislead ; ( d ) action SEC respect amendment registration statement , prospectus , statement additional information time time file SEC . SECTION 7 . TERM AGREEMENT Section 7.1 Agreement shall continue January 18 , 2022 , shall continue automatically successive annual period end January 18th year , provide continuance specifically approve annually ( ) Fund Board Trustees ( b ) vote majority ( define 1940 Act ) Fund Trustees interested person ( define 1940 Act 

In [55]:
len(data)

23845

In [56]:
para_list = clause_extract(data)

In [57]:
len(para_list)

16

In [60]:
vb = VectorDB()
vb._init_db(para_list)

In [61]:
data[:100]

'DISTRIBUTION AND SERVICES AGREEMENT January 18, 2020 This is to confirm that, in consideration of th'

In [62]:
vb.query(query='DISTRIBUTION AND SERVICES AGREEMENT')

'DISTRIBUTION SERVICES AGREEMENT January 18 , 2020 confirm , consideration agreement hereinafter contain , undersigned , Integrity Short Term Government Fund , ( " Fund " ) , open - end , diversify , management investment company organize series Integrity Funds , Delaware statutory trust , agree Integrity Funds Distributor , LLC , ( " Integrity " ) , shall , period distribution agreement ( " Agreement " ) , principal underwriter share issue Fund , include class share authorize ( " share " ) . section 1 . SERVICES UNDERWRITER Section 1.1 Integrity act principal underwriter distribution Shares cover registration statement , prospectus , statement additional information effect Fund ( " Registration Statement " ) Securities Act 1933 , amend ( " 1933 Act " ) , Investment Company Act 1940 , amend ( " 1940 Act " ) . section 1.2 Integrity agree use good effort solicit order sale Shares public offering price , determined accordance Registration Statement , undertake advertising promotion believ

In [63]:
with open('clauses_support.txt', 'r') as f:
    c_list = f.readlines()
    


In [67]:
p = int(len(c_list) / 10)

r_list = []

for i in range(p):
    r_list.append(c_list[i * 10: (i + 1) * 10])
p

4

In [71]:
for x in r_list:
    print(', '.join([t.replace('\n', '') for t in x]))

Document Name, Parties, Agreement Date, Effective Date, Expiration Date, Renewal Term, Notice Period to Terminate Renewal, Governing Law, Most Favored Nation, Non-Compete
Exclusivity, No-Solicit of Customers, Competitive Restriction Exception, No-Solicit of Employees, Non-Disparagement, Termination for Convenience, Rofr/Rofo/Rofn, Change of Control, Anti-Assignment, Revenue/Profit Sharing
Price Restrictions, Minimum Commitment, Volume Restriction, IP Ownership Assignment, Joint IP Ownership, License Grant, Non-Transferable License, Affiliate License-Licensor, Affiliate License-Licensee, Unlimited/All-You-Can-Eat-License
Irrevocable or Perpetual License, Source Code Escrow, Post-Termination Services, Audit Rights, Uncapped Liability, Cap on Liability, Liquidated Damages, Warranty Duration, Insurance, Covenant Not to Sue


In [72]:
print(','.join([t.replace('\n', '') for t in c_list]))

Document Name,Parties,Agreement Date,Effective Date,Expiration Date,Renewal Term,Notice Period to Terminate Renewal,Governing Law,Most Favored Nation,Non-Compete,Exclusivity,No-Solicit of Customers,Competitive Restriction Exception,No-Solicit of Employees,Non-Disparagement,Termination for Convenience,Rofr/Rofo/Rofn,Change of Control,Anti-Assignment,Revenue/Profit Sharing,Price Restrictions,Minimum Commitment,Volume Restriction,IP Ownership Assignment,Joint IP Ownership,License Grant,Non-Transferable License,Affiliate License-Licensor,Affiliate License-Licensee,Unlimited/All-You-Can-Eat-License,Irrevocable or Perpetual License,Source Code Escrow,Post-Termination Services,Audit Rights,Uncapped Liability,Cap on Liability,Liquidated Damages,Warranty Duration,Insurance,Covenant Not to Sue,Third Party Beneficiary
