In [1]:
from collections import Counter
import math as m
import sqlite3
import spacy
import en_core_web_md
# Load the en_core_web_md pipeline once at module level; `nlp` is the callable
# the driver code further down applies to text to obtain a parsed document.
nlp = en_core_web_md.load()

In [5]:
%%timeit
class Article:
    """Slotted container holding the fields of a single article.

    Attributes:
        title: Article headline.
        timestamp: Timestamp value stored alongside the article.
        text: Article body.
        s3_key: Storage key identifying the article.
        heuristics: Heuristic data supplied by the caller (the driver code in
            this file passes the dict returned by ``get_heuristics``).

    ``__slots__`` is declared, so instances carry no ``__dict__`` and accept
    only the attributes listed above.
    """

    # An 'entity' slot and an 'html' attribute were sketched in earlier
    # revisions but are currently disabled.
    __slots__ = ('title', 'timestamp', 'text', 's3_key', 'heuristics')

    def __init__(self, title, timestamp, text, s3_key, heuristics):
        # Positional order of the values mirrors the order of __slots__.
        values = (title, timestamp, text, s3_key, heuristics)
        for slot_name, value in zip(self.__slots__, values):
            setattr(self, slot_name, value)

def get_heuristics(doc_nlp, title, filelength):
    """Compute per-organization heuristics for the ORG entities of a document.

    Every entity in ``doc_nlp.ents`` whose ``label_`` is ``'ORG'`` is grouped
    by its cleaned name (possessive ``'s`` / ``’s`` and stray apostrophes
    removed).

    :param doc_nlp: parsed document exposing ``.ents``; each entity must
        provide ``label_``, ``text`` and ``start_char``
    :param title: string searched for each organization name
    :param filelength: length used to scale ``start_char`` into a relative
        position; when it is 0 the relative position is treated as 0
    :return: dict mapping organization name to
        ``[position_score, normalized_count, in_title, total]`` where
        ``position_score`` is ``exp(-start_char / filelength)`` of the first
        mention, ``normalized_count`` is the name's share of all ORG mentions,
        ``in_title`` is 1 if the name occurs in ``title`` else 0, and
        ``total`` is the sum of the three preceding values. Empty dict when
        the document has no ORG entities.
    """
    org_scores = {}
    # Number of ORG mentions overall (duplicates included); used to normalize.
    total_mentions = 0

    for entity in doc_nlp.ents:
        if entity.label_ != 'ORG':
            continue
        total_mentions += 1

        # Strip possessive suffixes/apostrophes so "Acme's" and "Acme" merge.
        org_name = entity.text.replace("'s", "").replace(u"’s", "").replace("'", "")

        if org_name in org_scores:
            # Repeat mention: only the occurrence count changes; the position
            # score stays that of the first mention.
            org_scores[org_name][1] += 1
            continue

        # Guard against a zero length, which previously raised
        # ZeroDivisionError; a relative position of 0 yields a score of 1.0.
        relative_pos = entity.start_char / filelength if filelength else 0.0
        position_score = m.exp(-relative_pos)

        in_title = 1 if org_name in title else 0
        org_scores[org_name] = [position_score, 1, in_title]

    for heuristics in org_scores.values():
        # total_mentions is >= 1 here because the dict is non-empty.
        heuristics[1] = heuristics[1] / total_mentions
        # Append the combined score: sum of the three heuristics.
        heuristics.append(sum(heuristics))

    return org_scores

  

# Accumulates one Article per processed database row.
article_list = list()
  

  
def create_connection(db_file):
    """ create a database connection to the SQLite database
        specified by the db_file

    Connection failures are reported on stdout and signalled by returning
    ``None`` rather than by raising.

    :param db_file: database file
    :return: Connection object or None
    """
    try:
        return sqlite3.connect(db_file)
    except sqlite3.Error as e:
        # The bare name `Error` was never imported, so the original handler
        # itself raised NameError; sqlite3.Error is the base class of the
        # exceptions sqlite3.connect raises.
        print(e)
    return None

conn = create_connection('text_data.db')
if conn is None:
    # create_connection signals failure with None; fail with a clear message
    # instead of an AttributeError on the execute() call below.
    raise RuntimeError("could not open SQLite database 'text_data.db'")

# Maximum number of rows to turn into Article objects.
process_rows = 10

# Bind the limit as a query parameter rather than concatenating it into the SQL.
res = conn.execute("select * FROM datallica_data limit ?", (process_rows,))

# Iterate the cursor directly: it stops when the rows run out, so a table with
# fewer than process_rows rows no longer yields None from fetchone() and
# crashes on row[3].
for row in res:
    # 0 - s3_key, 1 - url, 2 - domain, 3 - title, 4 - text_data, 5 - timestamp
    # Article(title, timestamp, text, s3_key, heuristics)
    # NOTE(review): get_heuristics is declared as (doc_nlp, title, filelength),
    # yet it receives the parsed *title* as the document, the article *text* as
    # `title`, and the title length as `filelength`. Behaviour is kept as-is
    # here; confirm whether nlp(row[4]), row[3], len(row[4]) was intended.
    article_list.append(Article(row[3],
    row[5],
    str(row[4]),
    row[0],
    get_heuristics(nlp(row[3]),row[4],len(row[3]))))

for a in article_list:
    print(a.title, a.heuristics)

Waitrose Food and Drink reveals Britain's favourite drinks {'Waitrose Food and Drink': [1.0, 1.0, 1, 3.0]}
Pick-up truck ad against GOP pulled after terror attack {'GOP': [0.6347364189402819, 1.0, 1, 2.6347364189402818]}
Deutsche Bank AG (DBK) – Research Analysts’ Recent Ratings Updates {'Deutsche Bank AG': [1.0, 0.5, 1, 2.5], 'DBK': [0.7613003866968737, 0.5, 1, 2.261300386696874]}
The Latest: Nurse plans to use settlement for aid projects {}
Driver kills at least 8 in Manhattan in what officials say was terror attack {}
Poll: Which NFL trade acquisition will have the biggest impact on his new team? {'NFL': [0.8590752713006462, 1.0, 1, 2.859075271300646]}
ARI Network Services (ARIS) Earning Somewhat Favorable News Coverage, Accern Reports {'ARI Network Services': [1.0, 0.5, 1, 2.5], 'ARIS': [0.7695843139616951, 0.5, 1, 2.269584313961695]}
Scientists suggest that aliens will 'look like us' {}
IDBI Bank’s Q2 profit tanks as NPAs swell {'IDBI Bank': [1.0, 1.0, 1, 3.0]}
SBI cuts benchmark 