In [1]:
import os
import pickle
import dill
import re
import numpy as np
import pandas as pd
import textract # package to perform ocr and pdf2text ways of pdf parsing
import pdfminer
from pdfminer.high_level import extract_text
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from validate_email import validate_email
from collections import Counter

In [2]:
source_path = './data/'
file_list = os.listdir(source_path)
print("Total number of files ",len(file_list))

Total number of files  590


In [3]:
file_list

['JPM_022014_14225.pdf',
 'Deep Industries Ltd - Award of Hiring of Integrated Drilling & Completion Services.pdf',
 'Industry-Map-of-the-Precious-Metals-Sector.pdf',
 'Ambit India Access_Pre-Budget conference call with Ashok Wadhwa_03Jul2014.pdf',
 'ThinkSoft-Corporate-Presentation-Mar-12.pdf',
 '[Kotak] GlaxoSmithKline Consumer, February 9, 2016.pdf',
 'JMFL– TCI EXPRESS -errclub- An Express Delivery Company.pdf',
 'Hotline 93.pdf',
 'BHFC_Q1FY17ResultUpdate_7 Aug 16.pdf',
 'GST_-_PROPITIOUS_WINDS_OF_CHANGE-DEC-15-EDEL.pdf',
 'MaricoKayaEnt.pdf',
 'business_valuation_methods.pdf',
 'GRUH_Profile_June_2014.pdf',
 'Copy of 10 LUXE Cost Sheet.pdf',
 'ICICI Direct- Company Update – Bharat Electronics (Buy) errclub Reiterate.pdf',
 'Finolex%20Industries%20Presentation.pdf',
 'INXW-20170203-MOSL-RU-PG008.pdf',
 'issue-nov_dec0787[1].pdf',
 'Timken India Company Update.pdf',
 '201390164-Prannoy-Roy-s-email-to-Gurumurthy-regarding-Chidambaram-NDTV-s-Rs-5000-crores-money-laundering-scam.pdf',

## Essential utility functions to process pdf data

In [4]:
def save_obj(fname, obj):
    with open(fname, 'wb') as handle:
        dill.dump(obj, handle)

def load_obj(fname):
    with open(fname, 'rb') as handle:
        obj = dill.load(handle)
        return obj

In [5]:
def process_pdfminer(file_list):  ## function to parse pdf data using pdfminer text mining package
    
    r=0
    text_corpus = {}

    for file in file_list:
        print(f'Reading : {file} \n')
        pdf_text = phl.extract_text(source_path+file)
        pdf_text = pdf_text.split('\n')
        pt = [i for i in pdf_text if len(i)>1]

        text_corpus[file] = pt
        r = r + 1

    print(f'Total files read : {r}/{len(file_list)}')
    
    save_obj('./corpus_pdfminer.pkl', text_corpus)

def process_pdfocr_and_pdf2text(file_list):  ## function to pdf data using pdf2text and ocr utility of textract
    
    pdf2text_corpus = {}
    tf = len(file_list)
    f = 1
    for file in file_list:
        try:
            print(f'PDFtoText reading ({f}/{tf}): {file}')
            text = textract.process(source_path+file, method='pdftotext', language='eng', input_encoding=None, output_encoding='utf8')
            text = str(text).replace('\\x','')
            texts = str(text).replace('\\n','')
            texts = texts.split('.')
            texts = [x for x in texts if len(x)>1]
            pdf2text_corpus[file] = texts
            f = f + 1
        except:
            continue

    save_obj('pdftotext_corpus.pkl', pdf2text_corpus)

    ocr_corpus = {}
    for file in file_list:
        print("OCR reading : ",file)
        ## each pdf page is turned to image and ocr perfomed on each image using pytesseract
        text = textract.process(source_path+file, method='tesseract', language='eng')  
        text = str(text).replace('\\x','')
        texts = str(text).split('\\n')
        texts = [x for x in texts if len(x)>1]
        ocr_corpus[file] = texts

    save_obj('ocr_tesseract_corpus.pkl', ocr_corpus)

In [6]:
## function to find BSE listed company names and symbols

def process_bse_companies():
    
    bse = pd.read_csv('./bse_companies.csv', encoding='latin-1')
    print(bse.head())

    nse_symbols = list(bse['CD_NSE Symbol'])
    bb_symbols = list(bse['CD_Bloomberg Code'])
    com_names = list(bse['Company Name'])

    nse_symbols = [x for x in nse_symbols if str(x)!='nan']
    bb_symbols = [x for x in bb_symbols if str(x)!='nan']
    
    com_names = [x for x in com_names if str(x)!='nan']
    # print(nse_symbols, bb_symbols)

    all_symbols = [*nse_symbols, *bb_symbols]

    company_symbols = set()

    for sym in all_symbols:
        company_symbols.add(sym)

    save_obj('company_symbols.pkl', company_symbols)
    save_obj('company_names.pkl', com_names)
    
def get_bse_companies():
    return load_obj('company_symbols.pkl')

In [8]:
## function to store author names found in the metadata of each pdf file

def get_authors(file_list):
    
    r=0
    metadata_corpus = {}

    for file in file_list:
        print(f'\n Reading : {file} \n')
        fp = open(source_path+file, 'rb')
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        #print("\n",doc.info[0].keys())
        try:
            if('Author' in doc.info[0].keys()):
                print(doc.info[0]['Author'])
                metadata_corpus[file] = doc.info[0]['Author']
                r = r + 1
        except:
            print("skipped")

    print(f'Total files metadata read : {r}/{len(file_list)}')
    save_obj('author_corpus_pdfminer.pkl', metadata_corpus)

## Loading artificats pre-processed from pdf files

In [11]:
text_corpus = load_obj('./corpus_pdfminer.pkl')
#pdftotext_corpus = load_obj('./pdftotext_corpus.pkl')
ocr_corpus = load_obj('ocr_tesseract_corpus.pkl')
company_symbols = list(load_obj('company_symbols.pkl'))
company_names = list(load_obj('company_names.pkl'))
author_info = load_obj('./author_corpus_pdfminer.pkl')

## Functions to process email id and extract authors / institutions

In [12]:
def find_email(texts):  ## function to find email ids in the extracted text
    emails_found = set()
    for i in range(len(texts)):
        if(bool(validate_email(texts[i]))):  ## using validate_email package to find valid email id
            emails_found.add(texts[i].strip())
    
    if(len(emails_found)==0):  ## otherwise, using @ based string processing to identify email id
        for t in texts:
            if(t.find('@')>0):
                parts = t.split(' ')
                for p in parts:
                    if '@' in p:
                        emails_found.add(p.strip())
                        
    return list(emails_found)

def find_companies(texts):  ## function to find company symbols/names by comparing with BSE listed comapnies
    #print(company_symbols)
    companies_found = set()
    company_names_found = set()
    for i in range(len(texts)):
        for j in range(len(company_symbols)):
            if(texts[i].startswith(company_symbols[j])):
                companies_found.add(company_symbols[j])
        for k in range(len(company_names)):
            if(texts[i].startswith(company_names[k])):
                company_names_found.add(company_names[k])
                
    companies_found = list(companies_found)
    companies_found = [x for x in companies_found if x!=' ']
    companies_found = [x for x in companies_found if x!='BSE']
    
    company_names_found = list(company_names_found)
    company_names_found = [x for x in company_names_found if x!=' ']
    company_names_found = [x for x in company_names_found if x!='Net Current Asset']
    
    return companies_found, company_names_found

In [13]:
def split_emails(emails):  ## function to split email to find author and source organisations
    if emails:
        names = set()
        orgs = list()
        for i in range(len(emails)):
            eid = emails[i]
            esplit = eid.split('@')
            #print(esplit)
            if(len(esplit[0])>2):
                if(esplit[0][0]=='('):
                    nm = esplit[0][1:]
                else:
                    nm = esplit[0]
                names.add(nm)
            else:
                continue
            orgsplit = esplit[1].split('.')
            orgs.append(orgsplit[0])

        names = list(names)
        #print(orgs)
        if(orgs):
            main_org = Counter(orgs).most_common(1)[0][0]
        else:
            main_org = None
            
        return names, main_org, list(set(orgs))

In [15]:
def get_frequent_source(texts):  ## function to find source organisation from extracted text
    
    doc_owner = []
    for i in range(len(texts)):
        if(texts[i].startswith('Source') or texts[i].startswith('Company') or texts[i].startswith('Source Company')):
            #print(texts[i])
            doc_owner.append(texts[i])
    if(doc_owner):
        main_doc_owner = Counter(doc_owner).most_common(1)[0][0]
        #print(main_doc_owner)
        if(main_doc_owner.find(':')):
            mdo = main_doc_owner.split(':')[1]
        if(mdo.find(',')>=0):
            mdo = mdo.split(',')[1]
        return mdo.strip()
    

## Functions to extract trade calls and target prices

In [16]:
def extract_price(st):
    
    prices = []
    main_price = None
    
    for sent in st:
        
        #print(sent)
        p = ''
        if(sent.find('TP')>=0):
            sent = sent.split('TP')[1]
            
        if(sent.find('INR')>=0):
            a = sent.split('INR')
            p = a[1].split(' ')[0]
            if(p.isdigit()):
                prices.append(p)
        if(sent.find('Rs. ')>=0):
            a = sent.split('Rs. ')
            p = a[1].split(' ')[0]
            if(p.isdigit()):
                prices.append(p)
        if(sent.find('Rs')>=0):
            #print(sent)
            a = sent.split('Rs')
            p = a[1].split(' ')[0]
            if(p.isdigit()):
                prices.append(p)
        if(p==''):
            for i in sent.split(' '):
                if(i.isdigit()):
                    prices.append(i)
    
    #print(calls, prices)
    if(prices):
        main_price = Counter(prices).most_common(1)[0][0]
        return float(main_price)
    
    return main_price

def extract_call_price(pt):
    
    #print(pt)
    calls = []
    prices = []
    main_call, main_price = None, None
    
    for tup in pt:
        
        call = tup[0]
        calls.append(call)
        sent = tup[1]
        p = ''
        
        if(sent.find('TP')>=0):
            sent = sent.split('TP')[1]
            
        if(sent.find('INR')>=0):
            a = sent.split('INR')
            p = a[1].split(' ')[0]
            if(p.isdigit()):
                prices.append(p)
        if(sent.find('Rs. ')>=0):
            a = sent.split('Rs. ')
            p = a[1].split(' ')[0]
            if(p.isdigit()):
                prices.append(p)
        if(sent.find('Rs')>=0):
            #print(sent)
            a = sent.split('Rs')
            p = a[1].split(' ')[0]
            if(p.isdigit()):
                prices.append(p)
        if(p==''):
            for i in sent.split(' '):
                if(i.isdigit()):
                    prices.append(i)
            for i in sent.split(':'):
                if(i.isdigit()):
                    prices.append(i)
    
    #print(calls, prices)
    if(calls and prices):
        main_call = Counter(calls).most_common(1)[0][0]
        main_price = Counter(prices).most_common(1)[0][0]
        return main_call, float(main_price)
    
    return main_call, main_price

def get_trade_call(texts):
    
    main_call, main_price = None, None
    calls = []
    only_calls = []
    primary_targets = []
    secondary_targets = []
    prime_call = None
    
    for i in range(len(texts)):
        
        if(('Buy' in texts[i].strip()) or ('buy' in texts[i].strip())):
            calls.append((texts[i],'Buy'))
            
        if(('Sell' in texts[i].strip()) or ('sell' in texts[i].strip())):
            calls.append((texts[i],'Sell'))
            
        if((('Target' in texts[i].strip() or 'target' in texts[i].strip()) or ('Price' in texts[i].strip() or 'price' in texts[i].strip())) or ('TP' in texts[i].strip())):
            secondary_targets.append(texts[i])
            
        if('Rating' in texts[i].strip() or 'rating' in texts[i].strip()):
            if('Buy' in texts[i].strip() or 'buy' in texts[i].strip()):
                prime_call = 'Buy'
            if('Sell' in texts[i].strip() or 'sell' in texts[i].strip()):
                prime_call = 'Sell'
            #print("\nPrime Call : ",prime_call)
            
    #print(calls)
    
    if(calls):
        for c in calls:
            if(c[0].find('target')>=0 and c[0].find('price')>=0):
                primary_targets.append((c[1], c[0]))
            else:
                if(c[0].find('target')>=0 or c[0].find('TP')>=0):
                    secondary_targets.append(c[1])
                    
            only_calls.append(c[1])
            
    
    #print("Primary targets : \n", primary_targets)
    #print("Secondary targets : \n", secondary_targets)
    
    if(primary_targets):
        main_call, main_price = extract_call_price(primary_targets)
        if(prime_call):
            main_call = prime_call
        return main_call, main_price

    if(secondary_targets):
        main_price = extract_price(secondary_targets)
    
    if(only_calls):
        main_call = Counter(only_calls).most_common(1)[0][0]
        
    if(prime_call):
        main_call = prime_call
    
    if(main_call and main_price):
        return main_call, main_price
    else: 
        return None, None
    

## Final Loop to create output csv

In [31]:
df = pd.DataFrame(columns = ['file', 'source company', 'authors', 'company symbols', 'company names', 'trade call', 'target price', 'possible sources'])
df = df.astype('object')

In [32]:
f_num = 1
for f in file_list:
    
    print(f, f"  -  ({f_num}/{len(file_list)})")
    
    ocr_texts = ocr_corpus[f]
    miner_texts = text_corpus[f]
        
    ocr_texts = [x.replace('\xa0', ' ') for x in ocr_texts]
    miner_texts = [x.replace('\xa0', ' ') for x in miner_texts]
    
    emails = find_email(miner_texts)
    if(len(emails)==0):
        emails = find_email(ocr_texts)
        
    comps, comp_names = find_companies(miner_texts)
    if(len(comps)==0):
        comps, comp_names = find_companies(ocr_texts)
        
    try:
        met_auth = author_info[f].decode('utf-8')
    except:
        met_auth = None
        
    #print(emails, comps, met_auth)

    persons, sources, possible_sources = [], [], []
    try:
        persons, sources, possible_sources = split_emails(emails)
    except TypeError:
        persons, sources = [], []

    try:
        frequent_source = get_frequent_source(miner_texts)
    except:
        frequent_source = None
    #print(possible_sources)

    if(frequent_source):
        all_sources = [*possible_sources, frequent_source]
    else:
        all_sources = possible_sources

    #print("\n", persons, sources, all_sources)
    #print("\n", comps)
    
    #finalizing authors by filtering out all possible options
    doc_authors = []
    if(persons):
        doc_authors = persons
    if((met_auth and len(doc_authors)==0)):
        doc_authors.append(met_auth)
    if(len(doc_authors)>6):
        doc_authors = []
        doc_authors.append(met_auth)
    if(len(doc_authors)==0):
        doc_authors = []
        doc_authors.append('NA')

        
    # finalizing authoring institution by filtering out all possible options
    source_companies = []
    if(sources and frequent_source):
        if(sources in frequent_source):
            source_companies.append(frequent_source)
    if(sources and len(source_companies)==0):
        source_companies.append(sources)
    if(frequent_source and len(source_companies)==0):
        source_companies.append(frequent_source)
    if(len(source_companies)==0):
        source_companies.append('Others')

    print("\n",source_companies, "\n", doc_authors, "\n", comps, "\n", comp_names,"\n")
    
    # get possible trade calls and target price
    mc, mp = get_trade_call(ocr_texts) 
    if(mp==None):
        _, mp = get_trade_call(miner_texts)
        
    print(f"Call : {mc}  -  Price : {mp}\n")
    
    # append row to dataframe
    current_col = [f, source_companies, doc_authors, comps, comp_names, mc, mp, all_sources]
    df.loc[len(df)] = current_col
    
    #print(df.tail(1))
    
    f_num = f_num + 1

JPM_022014_14225.pdf   -  (1/590)

 ['jpmorgan'] 
 ['deepika.mundra', 'sumit.x.kishore', 'boris.cw.kan'] 
 ['BHEL', 'KEC', 'ABB'] 
 ['Apar Industries Ltd.', 'Triveni Turbine Ltd.', 'Bilpower Ltd.', 'Finolex Cables Ltd.', 'Voltamp Transformers Ltd.', 'Suzlon Energy Ltd.', 'Siemens Ltd.', 'Diamond Power Infrastructure Ltd.', 'Schneider Electric Infrastructure Ltd.', 'Universal Cables Ltd.'] 

Call : Sell  -  Price : 20.0

Deep Industries Ltd - Award of Hiring of Integrated Drilling & Completion Services.pdf   -  (2/590)

 ['deepindustrtes'] 
 ['info'] 
 [] 
 [] 

Call : None  -  Price : None

Industry-Map-of-the-Precious-Metals-Sector.pdf   -  (3/590)

 ['Others'] 
 ['john'] 
 [] 
 [] 

Call : None  -  Price : None

Ambit India Access_Pre-Budget conference call with Ashok Wadhwa_03Jul2014.pdf   -  (4/590)

 ['ambitcapital'] 
 [None] 
 [] 
 [] 

Call : None  -  Price : None

ThinkSoft-Corporate-Presentation-Mar-12.pdf   -  (5/590)

 ['thinksoftglobal'] 
 ['dpingle', 'vaidyanathan.n'] 
 ['


 ['ubs'] 
 ['william.vanderpump'] 
 ['NMDC', 'SAIL', 'TI'] 
 [] 

Call : Buy  -  Price : 2015.0

Bigbloc Const.  Investor presentation.pdf   -  (43/590)

 ['sgapl'] 
 ['cs_sumit'] 
 ['TI'] 
 [] 

Call : None  -  Price : None

VIP 1QFY17.pdf   -  (44/590)

 ['sparkcapital'] 
 ['tejash', 'madhav', 'gnanasundar'] 
 ['GNA', 'TOTAL', 'VIP IN', 'MADHAV'] 
 [] 

Call : Buy  -  Price : 131.0

UPLL-20181026-MOSL-RU-PG012.pdf   -  (45/590)

 ['MotilalOswal'] 
 ['Aksh.Vashishth', 'Sumant.Kumar'] 
 ['UPLL IN', 'UPL'] 
 [] 

Call : Buy  -  Price : 749.0

WHIRL-20171127-MOSL-CU-PG006.pdf   -  (46/590)

 ['MotilalOswal'] 
 ['Ankur.VSharma', 'Amit.Shah'] 
 ['WHIRL IN'] 
 [] 

Call : None  -  Price : None

Astral Poly.pdf   -  (47/590)

 ['antiquelimited'] 
 ['nehal.shah'] 
 ['ASTRA IN'] 
 [] 

Call : None  -  Price : None

IceCap Asset Management Limited Global Markets November 2011.pdf   -  (48/590)

 ['IceCapAssetManagement'] 
 ['keithdicker'] 
 ['DOLLAR'] 
 [] 

Call : None  -  Price : None

http_


 ['motilaloswal'] 
 ['Nikhil.Gupta', 'Rahul.Agrawal'] 
 ['HUDCO'] 
 [] 

Call : None  -  Price : None

Balkrishna Inds quant.pdf   -  (93/590)

 ['Others'] 
 ['NA'] 
 ['PEL'] 
 [] 

Call : None  -  Price : None

EY-Tax-Alert-key-proposals-of-the-Direct-Taxes-Code-2010%20.pdf   -  (94/590)

 ['Others'] 
 ['sudhir.masuti'] 
 [] 
 [] 

Call : None  -  Price : None

Millennials to Anchor India's Growth Story.pdf   -  (95/590)

 ['morganstanley'] 
 [None] 
 ['HDFC', 'TOTAL'] 
 [] 

Call : Buy  -  Price : 12.0

Goodyear+India+-+Stock+Note.pdf   -  (96/590)

 ['hdfcsec'] 
 ['hdfcsecretailresearch'] 
 ['M&M', 'GDYR IN', 'GOODYEAR', 'HDFC'] 
 ['Cash & Cash Equivalent', 'Goodyear India Ltd.'] 

Call : None  -  Price : None

UPL (1QFY19) - HDFC sec.pdf   -  (97/590)

 ['hdfcsec'] 
 ['archit.joshi', 'nilesh.ghuge'] 
 ['UPLL IN', 'TOTAL', 'LT', 'UPL', 'HDFC'] 
 [] 

Call : None  -  Price : None

ESIB Bajaj Finserve1.pdf   -  (98/590)

 ['espiritosantoib'] 
 ['Santosh Singh, CFA'] 
 ['BJFIN IN', 'A


 ['emkayglobal'] 
 ['raghunandhan.nl', 'bibhishan.jagtap'] 
 ['ACC'] 
 ['Emkay Global Financial Services Ltd.'] 

Call : None  -  Price : None

I-Sec_Mn150517.pdf   -  (138/590)

 ['icicisecurities'] 
 ['715370'] 
 ['ABFRL', 'PNB', 'GSPL', 'HMVL', 'MHRIL', 'OCL', 'IOC', 'HDFC', 'HSIL', 'ICICI', 'GAIL', 'OIL', 'ATFL', 'BEL', 'CONCOR', 'ZEEL', 'CESC', 'BPCL', 'MCX'] 
 ['Canara Bank'] 

Call : Buy  -  Price : 3.0

AMRJ-20170123-MOSL-RU-PG010.pdf   -  (139/590)

 ['motilaloswal'] 
 ['Varun.kumar'] 
 ['ICICI', 'M&M', 'AMAR', 'AMRJ IN'] 
 [] 

Call : Buy  -  Price : None

BAFL QIP - List of allottees-11062015.pdf   -  (140/590)

 ['Others'] 
 ['NA'] 
 [] 
 [] 

Call : None  -  Price : None

Wabco India 2QFY18 Outlook Review.pdf   -  (141/590)

 ['sparkcapital'] 
 ['rseshan', 'mukesh'] 
 ['WIL IN', 'TOTAL', 'STAN'] 
 [] 

Call : Buy  -  Price : 173.0

3TUHR8XDFPGYI62CJ5MQK14LSW7EBO.pdf   -  (142/590)

 ['Others'] 
 ['NA'] 
 [] 
 [] 

Call : None  -  Price : None

EIL_IDFC - July12 .pdf   -  

AmbitCap-Hathway-Nov19-2013.pdf   -  (186/590)

 ['y,'] 
 ['athw', 'hathw'] 
 [] 
 [] 

Call : None  -  Price : None

vhf-interim-guidance.pdf   -  (187/590)

 ['Others'] 
 ['bzw8'] 
 [] 
 [] 

Call : None  -  Price : None

The Option Trader Handbook - Strategies And Trade Adjustments.pdf   -  (188/590)

 ['  $45'] 
 ['YHOO  ', 'EBAY  ', 'IBM  '] 
 ['TI'] 
 [] 

Call : Sell  -  Price : 100.0

Cera Presentation.pdf   -  (189/590)

 ['gmail'] 
 ['ankurjain2100'] 
 ['MTNL', 'CERA'] 
 ['Cash & Cash Equivalent'] 

Call : Buy  -  Price : 42.0

Alcoholic_Beverages_Jun162.pdf   -  (190/590)

 ['icicisecurities'] 
 ['anand.mour', 'Aniruddha.joshi'] 
 ['ICICI'] 
 [] 

Call : None  -  Price : None

AXSB-20171113-MOSL-CU-PG008.pdf   -  (191/590)

 ['MotilalOswal'] 
 ['Alpesh.Mehta', 'Piran.Engineer', 'Nitin.Aggarwal', 'Anirvan.Sarkar'] 
 ['AXSB IN'] 
 [] 

Call : Buy  -  Price : 680.0

IDEA-20170213-MOSL-RU-PG012.pdf   -  (192/590)

 ['motilaloswal'] 
 ['Varun.kumar'] 
 ['ICICI', 'IDEA IN', 'IDEA'


 ['Others'] 
 ['Unknown'] 
 ['PSB'] 
 [] 

Call : Sell  -  Price : 1.0

Kotak UBS 270417.pdf   -  (229/590)

 ['ubs'] 
 ['ishank.kumar', 'vishal.goyal'] 
 [] 
 [] 

Call : Buy  -  Price : 1050.0

Beta Drugs Ltd.pdf   -  (230/590)

 ['betadrugslimited'] 
 ['info', 'ipo'] 
 ['BETA'] 
 [] 

Call : None  -  Price : None

Cyient Analyst day_final.pdf   -  (231/590)

 ['sparkcapital'] 
 ['aishwariya', 'srivathsan'] 
 ['CYL IN', 'CYIENT'] 
 ['Cyient Ltd.'] 

Call : None  -  Price : None

cummins oct 2014.pdf   -  (232/590)

 ['ilfsindia'] 
 ['devang.patel'] 
 ['KKC IN', 'BHEL', 'LT'] 
 [] 

Call : Buy  -  Price : 12.0

CSFEb15.pdf   -  (233/590)

 ['credit-suisse'] 
 [None] 
 ['IOB IN', 'NEST IN', 'DBL IN', 'BHIN IN', 'FCL', 'NH', 'IOB', 'HZ IN', 'DBL', 'SUNP IN', 'MSS IN', 'KSE'] 
 ['Indian Overseas Bank'] 

Call : Sell  -  Price : 18.0

Table-1.pdf   -  (234/590)

 ['Ace Equity'] 
 ['NA'] 
 ['DLF', 'IRB', 'DOLLAR'] 
 [] 

Call : None  -  Price : None

IIFL-+Titan-Gold+standard-ADD.pdf   - 


 ['sparkcapital'] 
 ['vishnu', 'ganeshram'] 
 ['PTC', 'LT'] 
 [] 

Call : Buy  -  Price : 3.0

coal india kotak may2017.pdf   -  (276/590)

 ['kotak'] 
 ['kotak.research', 'murtuza.arsiwalla'] 
 ['MCL'] 
 [] 

Call : Buy  -  Price : 2017.0

Avendus- Housing Financials Concerns on competition are overdone.pdf   -  (277/590)

 ['avendus'] 
 ['jaynee.shah'] 
 ['ICICI', 'LICHF IN', 'HDFC IN', 'HDFC'] 
 [] 

Call : Buy  -  Price : 332.0

mayur2.pdf   -  (278/590)

 ['firstcallindiaequity'] 
 ['info'] 
 ['ACC'] 
 [] 

Call : None  -  Price : None

DepositBasedMem.pdf   -  (279/590)

 ['Others'] 
 ['MAXXCREATIVE1'] 
 ['TOTAL'] 
 [] 

Call : None  -  Price : None

Enam_Q3_FY11.pdf   -  (280/590)

 ['enam'] 
 ['nitin.idnani', 'chirag.negandhi', 'kunal.lakhan'] 
 ['HDIL'] 
 [] 

Call : Sell  -  Price : 150.0

KPIT-20170427-MOSL-RU-PG012.pdf   -  (281/590)

 ['motilaloswal'] 
 ['Varun.kumar'] 
 ['KPIT IN'] 
 [] 

Call : Buy  -  Price : 14.0

CRISIL-Research_ier-report-infinite-2014.pdf   -  (282


 ['vsnl'] 
 ['Sunil Terkar'] 
 ['IDBI', 'ICICI', 'SOUNDCRAFT', 'IDFC', 'HUDCO'] 
 [] 

Call : Buy  -  Price : 15.0

EIL Annual PPT - FY 13-14 - May 2014 - Rev 2 - Final1.pdf   -  (328/590)

 ['Planning Commission'] 
 ['8858'] 
 ['ONGC', 'HINDALCO', 'IOC', 'BPCL', 'PETRONET'] 
 [] 

Call : Buy  -  Price : 32.0

BEML Ltd.pdf   -  (329/590)

 ['10'] 
 ['Ded'] 
 ['PEL'] 
 [] 

Call : None  -  Price : None

eq_HAVELLS_upd9_10Jun2013.pdf   -  (330/590)

 ['crisil'] 
 ['clientservicing'] 
 ['CRISIL'] 
 [] 

Call : None  -  Price : None

IWL- Consolidated -Published Results Format-30062014.pdf   -  (331/590)

 ['Others'] 
 ['Manoj Agrawal'] 
 [] 
 [] 

Call : None  -  Price : None

BoAML on Cummins.pdf   -  (332/590)

 ['baml'] 
 ['sanjaya.satapathy'] 
 ['SRCM IN', 'DLFU IN', 'BJH IN', 'ABNL IN', 'SOBHA IN', 'MSS IN', 'GRASIM', 'CHMB IN', 'KKC IN', 'ACEM IN', 'ARCP IN', 'UT IN', 'GRASIM IN', 'SOBHA', 'DLF', 'BRCM IN', 'SINT IN', 'JSAW IN', 'SHRS IN', 'ACC IN', 'NFCL IN', 'SINTEX IN', 'ICEM IN


 ['pinebridge'] 
 ['info'] 
 [] 
 [] 

Call : None  -  Price : None

IncaseyoumisseditNA161216xe261733.pdf   -  (366/590)

 ['macquarie'] 
 ['christine.farkas'] 
 ['BASF', 'GLOBAL'] 
 [] 

Call : Buy  -  Price : 10.0

Havells India Q1FY13 Result Update.pdf   -  (367/590)

 ['emkayglobal'] 
 ['pritesh.chheda', 'prashant.kutty'] 
 [] 
 ['Emkay Global Financial Services Ltd.'] 

Call : Buy  -  Price : 15.0

Maize and Sugarcane GM crops and markets in Selected African Countries_2014.pdf   -  (368/590)

 ['frost'] 
 ['Carolyn.Krynauw', 'Avril.Harvey', 'Mani.James'] 
 ['UCL'] 
 [] 

Call : Buy  -  Price : 130.0

Ashok_Leyland_-_2QFY18_-_HDFC_sec.pdf   -  (369/590)

 ['hdfcsec'] 
 ['sneha.prashant', 'abhishekkumar.jain'] 
 ['M&M', 'AL IN', 'TOTAL', 'LT', 'HDFC'] 
 [] 

Call : Sell  -  Price : 141.0

Cipla_Q1FY18_results.pdf   -  (370/590)

 ['icicisecurities'] 
 ['vinay.bafna', 'Sriraam.rathi'] 
 ['CIPLA', 'ICICI'] 
 [] 

Call : Buy  -  Price : 479.0

ALPM-20160729-MOSL-RU-PG010.pdf   -  (37


 ['Others'] 
 ['Punit Jani'] 
 ['TCS'] 
 [] 

Call : None  -  Price : None

Dynamatic+Technologi...R_QuarterlyUpdateFirstCut.pdf   -  (413/590)

 ['crisil'] 
 ['sguruprasad', 'clientservicing', 'udshah'] 
 ['DYNAMATECH', 'CRISIL'] 
 [] 

Call : None  -  Price : None

[Kotak] Infrastructure, September 1, 2016.pdf   -  (414/590)

 ['kotak'] 
 ['harish.bihani', 'aditya.mongia', 'kotak.research', 'ajinkya.bhat'] 
 [] 
 [] 

Call : None  -  Price : 12.0

Deutsche Bank- Indraprastha Gas Alert -Visibility for growth remains weak beyond FY17, maintain Hold.pdf   -  (415/590)

 ['db'] 
 ['harshad.katkar', 'amit.murarka'] 
 ['GAIL', 'IGL', 'IGL IN'] 
 [] 

Call : None  -  Price : None

Bharti Infratel Buyback.pdf   -  (416/590)

 ['bharti-infratel'] 
 ['karl.sahukar', 'project.infratelbuyback', ':www.bharti-infratel.com;\tEmail:\tcompliance.officer', 'compliance.officer', 'infratelbuybackoffer'] 
 ['NH', 'BHARTI IN', 'VASA', 'INFRATEL', 'BATLIBOI'] 
 [] 

Call : Buy  -  Price : 2016.0

Castrol+


 ['ubs'] 
 ['william.vanderpump', 'ishank.kumar', 'vishal.goyal'] 
 ['ICICI', 'PNB', 'OCL', 'SPECTRUM', 'PSL', 'SAIL', 'TI'] 
 [] 

Call : Buy  -  Price : None

investment_strategy3.pdf   -  (449/590)

 ['www.archives.gov'] 
 ['megesko'] 
 [] 
 [] 

Call : None  -  Price : None

ar_jyothy.pdf   -  (450/590)

 ['rathi'] 
 ['shirishpardeshi', 'aniruddhajoshi1'] 
 ['ITC', 'JYL IN'] 
 [] 

Call : Buy  -  Price : 210.0

tubeinstallation_tools.pdf   -  (451/590)

 ['Others'] 
 ['NA'] 
 ['PNEUMATIC', 'SIL'] 
 [] 

Call : None  -  Price : None

Double_shift_secondary_schools_En01.pdf   -  (452/590)

 ['worldbank'] 
 ['eservice', 'e28098'] 
 [] 
 [] 

Call : None  -  Price : None

AnchorErngGrwth_25112015.pdf   -  (453/590)

 ['kimiandpartners'] 
 ['krishnaraj.v'] 
 [] 
 [] 

Call : None  -  Price : None

CLSA G & F 2015.pdf   -  (454/590)

 ['clsa'] 
 ['christopher.wood'] 
 [] 
 [] 

Call : None  -  Price : None

BHFC-20171108-MOSL-RU-PG010.pdf   -  (455/590)

 ['MotilalOswal'] 
 ['Deep.Shah'


 ['idfc'] 
 ['santosh.fernandes'] 
 ['HDFC', 'IDFC', 'ICICI'] 
 ['Punjab National Bank'] 

Call : None  -  Price : None

Jubiliant+Foodworks.pdf   -  (493/590)

 ['clsa'] 
 ['compliance_hk', 'jaibir.sethi'] 
 ['JUBI IN'] 
 [] 

Call : None  -  Price : None

IBA_PF_presentation_270407.pdf   -  (494/590)

 ['luthra'] 
 ['msaraf'] 
 ['NH'] 
 [] 

Call : None  -  Price : None

ShowNews.pdf   -  (495/590)

 ['rathi'] 
 ['shirishpardeshi', 'aniruddhajoshi1'] 
 ['LLL IN'] 
 [] 

Call : Buy  -  Price : 498.0

BNK_051616_25242.pdf   -  (496/590)

 ['bksec'] 
 ['ranjit.cirumalla'] 
 [] 
 [] 

Call : Buy  -  Price : 27.0

Mahindra and Mahindra Financial Services-result update-Jul-13-EDEL.pdf   -  (497/590)

 ['edelweissfin'] 
 ['kunal.shah', 'prakhar.agarwal', 'nischal.maheshwari', 'vikas.khemani', 'nirav.sheth', 'nilesh.parikh'] 
 ['HDFC', 'IDFC', 'ICICI'] 
 ['Union Bank Of India', 'Allahabad Bank', 'Oriental Bank Of Commerce', 'Punjab National Bank'] 

Call : Buy  -  Price : 165.0

Vinyl India


 ['sundarambnpparibas'] 
 ['balajiv'] 
 ['FACT', 'MAKE', 'ACC', 'PREMIER', 'ICICI', 'HAL', 'SUMIT', 'PEL', 'CHI', 'HDFC', 'NTPC', 'BHEL', 'TI'] 
 [] 

Call : Buy  -  Price : 2010.0

brent wti crude wxplained.pdf   -  (533/590)

 ['Others'] 
 ['NA'] 
 ['MAKE'] 
 [] 

Call : Buy  -  Price : 5.0

BHEL-20150303-MOSL-CU-PG018.pdf   -  (534/590)

 ['motilaloswal'] 
 ['reports', 'kadambari.balachandran', 'Shah(Amit.Shah', 'Agarwal(AgarwalS', 'anosh.Koppikar'] 
 ['BHEL IN', 'BHEL', 'NTPC'] 
 [] 

Call : Buy  -  Price : 320.0

Indian Gas Sector - Citi .pdf   -  (535/590)

 ['citi'] 
 ['saurabh.handa', 'sohini1.banerjee'] 
 ['GSPL', 'GAIL'] 
 [] 

Call : Buy  -  Price : 21.0

DELTA-20140918-MOSL-SL-PG032.pdf   -  (536/590)

 ['motilaloswal'] 
 ['reports', 'kadambari.balachandran', 'Niket.Shah', 'Email:anosh.Koppikar', 'Atul.Mehra'] 
 ['AMBER', 'MDL'] 
 [] 

Call : Buy  -  Price : 27.0

GlobalDirectory.pdf   -  (537/590)

 ['worldbank'] 
 ['Myoung3'] 
 ['TI'] 
 [] 

Call : None  -  Price : None



 ['jmfl'] 
 ['ashutosh.somani', 'nitin.agarwala'] 
 [] 
 [] 

Call : None  -  Price : None

HDFC Securities 16-May-16.pdf   -  (582/590)

 ['hdfcsec'] 
 ['abdul.karim', 'hdfcsecretailresearch'] 
 ['HDFC'] 
 [] 

Call : None  -  Price : None

HDFC+Warrants.pdf   -  (583/590)

 ['nirmalbang'] 
 ['vishal.jajoo'] 
 ['HDFC', 'HDFC IN'] 
 [] 

Call : Buy  -  Price : 2008.0

BATINDIA_20130729.pdf   -  (584/590)

 ['nomura'] 
 ['manish.jain', 'anup.sudhendranath'] 
 ['NEST IN', 'CLGT IN', 'GCPL IN', 'HUVR IN', 'DABUR', 'APNT IN', 'TTAN IN', 'LT', 'MRCO IN', 'DABUR IN', 'ITC', 'UNSP IN', 'SKB IN', 'ITC IN', 'JUBI IN', 'BATA IN', 'HMN IN'] 
 [] 

Call : Buy  -  Price : 990.0

FIEM Industries - IC - Centrum - 27.03.14.pdf   -  (585/590)

 ['centrum'] 
 ['compliance', 'investor.grievances', 'ajay.shethiya'] 
 ['M&M', 'CDSL', 'NTL', 'MCX', 'MIC', 'FIEM IN'] 
 [] 

Call : Buy  -  Price : 545.0

ANewParadigmforManagingforShareholderValue.pdf   -  (586/590)

 ['asseteconomics'] 
 ['john.j.ballow', 'm

In [33]:
df

Unnamed: 0,file,source company,authors,company symbols,company names,trade call,target price,possible sources
0,JPM_022014_14225.pdf,[jpmorgan],"[deepika.mundra, sumit.x.kishore, boris.cw.kan]","[BHEL, KEC, ABB]","[Apar Industries Ltd., Triveni Turbine Ltd., B...",Sell,20.0,"[jpmorgan, Company data.]"
1,Deep Industries Ltd - Award of Hiring of Integ...,[deepindustrtes],[info],[],[],,,[deepindustrtes]
2,Industry-Map-of-the-Precious-Metals-Sector.pdf,[Others],[john],[],[],,,[]
3,Ambit India Access_Pre-Budget conference call ...,[ambitcapital],[None],[],[],,,"[ambitpte, ambitcapital, panmure]"
4,ThinkSoft-Corporate-Presentation-Mar-12.pdf,[thinksoftglobal],"[dpingle, vaidyanathan.n]",[ICICI],[],,,"[christensenir, thinksoftglobal, NASSCOM Report]"
...,...,...,...,...,...,...,...,...
585,ANewParadigmforManagingforShareholderValue.pdf,[asseteconomics],"[john.j.ballow, michael.j.molnar, roland.burgm...",[TI],[],Sell,7.0,"[asseteconomics, accenture, Inc.]"
586,TEL-20180108-MOSL-SU-PG006.pdf,[motilaloswal],"[Hafeez.Patel, Aliasgar.Shakir]",[],[],Buy,48.0,"[motilaloswal, Company]"
587,Titan - SELL - CLSA.pdf,[www.clsa.com],[vivek.maheshwari],[TI],[],Sell,18.0,"[clsa, www.clsa.com]"
588,BHEL-2QFY16 Review.pdf,[sparkcapital],"[raghavan, ravi]","[BHEL, BHEL IN]",[],Sell,137.0,[sparkcapital]


In [34]:
df.to_csv('pdf_extracted_info.csv')

In [28]:
file_list[261]

'IndusInd_Bank_-_4QFY18_-_HDFC_sec.pdf'

# Some Unsuccessful attemps

In [None]:
# extract text from text layout of pdfminer 


from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar,LTLine,LAParams
Extract_Data=[]
for PDF_file in file_list[:3]:
       for page_layout in extract_pages(source_path+PDF_file):
            print(PDF_file)
            for element in page_layout:
                print(element)
                if isinstance(element, LTTextContainer):
                    for text_line in element:
                        for character in text_line:
                            if isinstance(character, LTChar):
                                Font_size=character.size
                    Extract_Data.append([Font_size,(element.get_text())])

In [None]:
## pdf image extraction and ocr on the extracted image

def process_images_ocr(file_list):

    import fitz
    import pytesseract
    from collections import defaultdict

    imgtext_corpus = defaultdict(set)

    for file in file_list:

        print(file)
        doc = fitz.open(source_path+file)
        print(len(doc))

        pages_to_scan = [0]
        if(len(doc)>1):
            pages_to_scan.append(len(doc)-1)

        try:
            for i in pages_to_scan:
                #print(doc.getPageImageList(i))
                for img in doc.getPageImageList(i):
                    #print(img)
                    xref = img[0]
                    pix = fitz.Pixmap(doc, xref)
                    img_name = "./images/%s-%s.png" % (file, i)

                    if pix.n < 5:       # this is GRAY or RGB
                        pix.writePNG(img_name)

                    else:               # CMYK: convert to RGB first
                        pix1 = fitz.Pixmap(fitz.csRGB, pix)
                        pix1.writePNG(img_name)
                        pix1 = None

                    img_text = pytesseract.image_to_string(Image.open(img_name))
                    print(img_text)
                    imgtext_corpus[file].add(img_text)
                    pix = None
        except:
            continue
            
    save_obj('imgtext_corpus.pkl', imgtext_corpus)

In [None]:
## using tabula package to identify possible tables in a pdf page

import tabula 
file = '[Kotak] Karur Vysya Bank, July 25, 2018.pdf'
df = tabula.read_pdf(source_path+file, pages='1', multiple_tables=True)
print(df)