In [8]:
pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python-Levenshtein-0.12.2.tar.gz (50 kB)
[K     |████████████████████████████████| 50 kB 381 kB/s eta 0:00:01
Building wheels for collected packages: python-Levenshtein
  Building wheel for python-Levenshtein (setup.py) ... [?25ldone
[?25h  Created wheel for python-Levenshtein: filename=python_Levenshtein-0.12.2-cp37-cp37m-linux_x86_64.whl size=183203 sha256=868e3cde22c9690e9dc85305cfde5b8d471607893121c13a68c7ba72040539f4
  Stored in directory: /home/ojas_d/.cache/pip/wheels/05/5f/ca/7c4367734892581bb5ff896f15027a932c551080b2abd3e00d
Successfully built python-Levenshtein
Installing collected packages: python-Levenshtein
Successfully installed python-Levenshtein-0.12.2
Note: you may need to restart the kernel to use updated packages.


In [4]:
##Class for extracting the company names from case documents

from bs4 import BeautifulSoup
import os
import pandas as pd
import datetime as dt
from pathlib import Path
from pandas import *
from fuzzywuzzy import process

class extractMetadata():
    def __init__(self, fpath):
        with open(fpath, 'r') as f:
            self.soup = BeautifulSoup(f.read(), 'html.parser').find('div', attrs={'class':"judgments"})
        if(self.soup):
            self.case_text = self.soup.text
            
        else:
            self.case_text=''
        

    def null_check(self, soup_item, attr):
        if attr=='text':
            try:
                return soup_item.text
            except:
                return
        else:
            return soup_item.find(attr)


    def get_court(self):
        if(self.soup):
            source = self.soup.find('div', attrs={'class':'docsource_main'})
        else:
            source=None
        
        self.court = self.null_check(source, 'text')
        return self.court

    def get_pre(self):
        
        pre = self.soup.find('pre')
        self.pre = self.null_check(pre, 'text')
        return self.pre

    def get_title(self):
        if(self.soup):
            title = self.null_check(self.soup.find_all('div', attrs={'class': 'doc_title'})[-1], 'text')
        
        else:
            title=None
            
        if(title):
            self.title = title
            small_title, date = title.split(' on ')
            self.petitioner, self.respondent = small_title.split(' vs ')
            date = dt.datetime.strptime(date.strip(), '%d %B, %Y').date()
            self.date = dt.datetime.strftime(date, '%d-%m-%Y')
        else:
            self.title=None
            self.date=None
            self.petitioner=None
            self.respondent=None
        return self.title

    def get_author(self):
        # NOTE: the author could be mentioned in the pretag when listing the CORAM
        
        if(self.soup):
            author = self.null_check(self.soup.find('div', attrs={'class':'doc_author'}), 'text')
        else:
            author=None
            
        
        if author:
            self.author = author.split('Author:')[-1]
        else:
            self.author = None
        return self.author

    def get_bench(self):
        if(self.soup):
            bench = self.null_check(self.soup.find('div', attrs={'class':'doc_bench'}), 'text')
        else:
            bench=None
        if bench:
            self.bench = bench.split('Bench:')[-1]
        else:
            self.bench = None
        return self.bench

    def get_eq_citations(self):
        if(self.soup):
            eq_citations = self.null_check(self.soup.find('div', attrs={'class':'doc_citations'}), 'text')
        else:
            eq_citations=None
        
        if eq_citations:
            self.eq_citations = eq_citations.split('Equivalent citations:')[-1]
        else:
            self.eq_citations = None
        return self.eq_citations

    def get_jud_order(self):
        # make a guess about whether the document is a judgment or an order
        # 0 if ambiguous, 1 if judgment, -1 if order

        self.jud_order = 0
        if any([x in self.case_text for x in ['JUDGMENT', 'JUDGEMENT']]):
            self.jud_order += 1

        if 'ORDER' in self.case_text:
            self.jud_order+=1
        
        return self.jud_order

    def get_citations(self):
        # note: these citations are to both cases and statutes
        if(self.soup):
            self.citations = [x['href'] for x in self.soup.find_all('a')]
        else:
            self.citations=[]
        
        return self.citations
    
    def check_petitioner_firm(self):
        if(self.soup):
            title = self.null_check(self.soup.find_all('div', attrs={'class': 'doc_title'})[-1], 'text')
        
        else:
            title=None
        if(title):
            self.title=title
            small_title, date = title.split(' on ')
            self.petitioner, self.respondent = small_title.split(' vs ')
            petitioner_upper=self.petitioner.upper()
            query=petitioner_upper.split()
            my_file = open("companies_keyword_list.txt", "r")
            firm_names=my_file.read()
            if any(ext in query for ext in firm_names):
                self.petfirm=1
            
            
            else:
                self.petfirm=0
            
        
        else:
            self.petfirm=0
            
            
       
        
    def check_respondent_firm(self):
        if(self.soup):
            title = self.null_check(self.soup.find_all('div', attrs={'class': 'doc_title'})[-1], 'text')
        
        else:
            title=None
        if(title):
            self.title=title
            small_title, date = title.split(' on ')
            self.petitioner, self.respondent = small_title.split(' vs ')
            respondent_upper=self.respondent.upper()
            query=respondent_upper.split()
            my_file = open("companies_keyword_list.txt", "r")
            firm_names=my_file.read() 
            if any(ext in query for ext in firm_names):
                self.resfirm=1
            
            
            else:
                self.resfirm=0
                
            
        else:
            self.resfirm=0
            
            
    def check_petitioner_gov(self):
        title = self.null_check(self.soup.find_all('div', attrs={'class': 'doc_title'})[-1], 'text')
        self.title=title
        small_title, date = title.split(' on ')
        self.petitioner, self.respondent = small_title.split(' vs ')
        petitioner_upper=self.petitioner.upper()
        
        if any(ext in petitioner_upper for ext in govt_names):
            self.petgov=1
            
        else:
            
            self.petgov=0
            
            
    def check_respondent_gov(self):
        title = self.null_check(self.soup.find_all('div', attrs={'class': 'doc_title'})[-1], 'text')
        self.title=title
        small_title, date = title.split(' on ')
        self.petitioner, self.respondent = small_title.split(' vs ')
        respondent_upper=self.respondent.upper()
        
        if any(ext in respondent_upper for ext in govt_names):
            self.resgov=1
            
        else:
            
            self.resgov=0
        
        
    def preprocess(self):
        stopwords=["PRIVATE", "PRIVATELIMITED", "PRIVATELIMITED.", "PRIVATELTD", "LIMITED", "LIMITED(OPC)", "LIMITED,", "LIMITED.", "LIMTED", "LIMTIED","LTD", "LTD,", "LTD.", "LTD.,","PVT.", "PVT.LTD", "PVT.LTD.", "PVTLTD", "PVTLTD."]
        
        
        if self.petfirm:
            
            query=(self.petitioner.upper()).split()
            resultwords  = [word for word in query if word not in stopwords]
            result = ' '.join(resultwords)
            self.petname=result
            
            
        
        
        else:
            self.petname=None
            
        if self.resfirm:
            
            query=(self.respondent.upper()).split()
            resultwords  = [word for word in query if word not in stopwords]
            result = ' '.join(resultwords)
            self.resname=result
            
        else:
            self.resname=None
        
    
    
    def get_all_info(self):
        info = {}

        self.get_citations()
        self.get_jud_order()
        self.get_eq_citations()
        self.get_court()
        self.get_bench()
        self.get_author()
        self.get_title()
        #self.get_pre()
        self.check_petitioner_firm()
        self.check_respondent_firm()
        #self.check_petitioner_gov()
        #self.check_respondent_gov()
        self.preprocess()
        

        info['court'] = self.court
        info['author'] = self.author
        info['title'] = self.title
        info['petitioner'] = self.petitioner
        info['respondent'] = self.respondent
        info['doc_date'] = self.date
        #info['pre'] = self.pre
        info['citations'] = self.citations
        info['eq_citations'] = self.eq_citations
        info['judgment_order'] = self.jud_order
        info['check_petitioner_firm']=self.petfirm
        info['check_respondent_firm']=self.resfirm
        info['pet_name']=self.petname
        info['res_name']=self.resname
        #info['check_petitioner_gov']=self.petgov
        #info['check_respondent_gov']=self.resgov
        return info

            
            


if __name__ == "__main__":
    dir= "/home/ojas_d/WorldBank/delhi/2016"
    c=1
    List=[]
    List1=[]
    for filename in os.listdir(dir):
        
        
        if c==200:
            break
                
        filepath=os.path.join(dir, filename)
        myfile=Path(filepath)  
        #print(filepath)
        case = extractMetadata(filepath)  
        info = case.get_all_info()

        #print(info)
        if info['check_petitioner_firm'] :
            List1.append(info['petitioner'])
            List.append(info['pet_name'])
        
        
        if info['check_respondent_firm']:
            List1.append(info['respondent'])
            List.append(info['res_name'])
        c=c+1   
    #print(List)
   
   

In [2]:
print(c)

7579


In [None]:

import pandas as pd
data = pd.read_excel('/home/ojas_d/WorldBank/Registered_companies/raw_data/Delhi_2016.xlsx')
choices=data['COMPANY_NAME'].tolist()
company_names = data['COMPANY_NAME'].tolist()
#Removing repeated words from company names which might hinder clustering
stopwords1=["PVT","M","S","ORS","ANR","i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "the" ,"whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "t", "can", "will", "just", "don", "should", "now","M/S","ANR","PRIVATE", "PRIVATELIMITED", "PRIVATELIMITED.", "PRIVATELTD", "LIMITED", "LIMITED(OPC)", "LIMITED,", "LIMITED.", "LIMTED", "LIMTIED","LTD", "LTD,", "LTD.", "LTD.,","PVT.", "PVT.LTD", "PVT.LTD.", "PVTLTD", "PVTLTD."]
clean_company_names=[]
i=0;
for company in company_names:
    
    if i==20000:
        break
     
    query=(company.upper()).split()
    resultwords  = [word for word in query if word not in stopwords]
    result = ' '.join(resultwords)
    clean_company_names.append(result)
    i=i+1
    
for i in List:
    clean_company_names.append(i)
print(clean_company_names)
    




In [22]:
len(clean_company_names)

28516

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer


vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(clean_company_names)

In [None]:
pip install sparse_dot_topn

In [24]:
print(tf_idf_matrix[0])

  (0, 4513)	0.6743613833261672
  (0, 14571)	0.7384014657883732


In [25]:

import numpy as np
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
 
    idx_dtype = np.int32
 
    nnz_max = M*ntop
 
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data,indices,indptr),shape=(M,N))
import time
t1 = time.time()
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.8)
t = time.time()-t1
print("SELFTIMED:", t)

SELFTIMED: 0.29068851470947266


In [27]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    non_zeros = sparse_matrix.nonzero()
    
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({'left_side': left_side,
                          'right_side': right_side,
                           'similairity': similairity})






matches_df = get_matches_df(matches, clean_company_names, top=1000)
matches_df = matches_df[matches_df['similairity'] < 0.99999] # Remove all exact matches
matches_df.sample(20)

Unnamed: 0,left_side,right_side,similairity
141,SEARCH BUILD TECH,SEARCH TECH INDIA,0.814534
231,RAMA SANDESH DEVELOPERS PVT,RAMA SANDESH BUILDWELL,0.812467
827,KULLU VALLEY LEISURE RESORTS PVT,KULLU VALLEY LEISURE RESORTS ...,0.957491
183,SEVEN HEAVEN CONSTRUCTIONS,SEVEN HEAVEN IMPEX,0.841584
310,ABB POWER,ABB INDIA,0.827938
592,ORANGE ASHOK NORTH WIND POWER,ORANGE ASHOK WIND POWER,0.87998
652,K.B. POLYCHEM (INDIA),P D POLYCHEM,0.937767
620,AMBIENCE IMPEX,AMBIENCE INDIA,0.832516
126,RAJ DEVCON,R.S. DEVCON,0.814227
21,VYOM INFRASTRUCTURES,VYOM INFRASTRUCTURES & PROJECTS,0.915835
