In [3]:
# import packages
import glob, os
import json
import numpy as np
import pandas as pd
import spacy
import en_core_web_sm
# load the spaCy English pipeline shipped in the installed en_core_web_sm package;
# this single `nlp` object is reused for every document processed below
nlp = en_core_web_sm.load()

# None removes the cap on the number of rows pandas renders, so DataFrames
# displayed in this notebook are shown in full rather than truncated
pd.set_option('display.max_rows', None)

In [4]:
# Read the newline-delimited JSON file and parse each line into a Python object.
# The `with` block closes the file handle as soon as parsing finishes (a bare
# open() inside the comprehension leaves that to the garbage collector), and
# whitespace-only lines are skipped because json.loads raises on empty input.
in_f = os.path.join("ai_in_finance.json")
with open(in_f, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

In [5]:
# collect the value stored under the 'text' key of every parsed record
all_text = []
for record in data:
    all_text.append(record['text'])

In [6]:
# Run every string through the spaCy pipeline, producing one Doc per input.
# nlp.pipe processes the texts as a batched stream, which is noticeably faster
# than calling nlp(text) once per string, and it yields the Docs in input order.
all_doc = list(nlp.pipe(all_text))

In [7]:
# flatten the entities of all documents into one list
all_ent = []
for doc in all_doc:
    all_ent.extend(doc.ents)

In [8]:
# keep only the entities labelled 'ORG' and store their lower-cased text,
# so that differently-cased mentions of the same name end up identical
all_org_ent_text = [
    span.text.lower()
    for span in all_ent
    if span.label_ == 'ORG'
]

In [9]:
# one row per ORG mention, held in a single 'company' column
df = pd.DataFrame(all_org_ent_text, columns=['company'])

# frequency table: count the rows for each distinct company string,
# then order the result from most to least frequent
freq_table = (
    df
    .groupby(['company'])
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)

In [10]:
# Display the ten most frequent organisations. freq_table is already sorted by
# 'counts' in descending order and holds one row per distinct entity string, so
# rendering it whole would flood the notebook output; use freq_table itself
# (or a larger head(n)) when the long tail is needed.
freq_table.head(10)

Unnamed: 0,company,counts
8274,fed,2143
19870,sec,1356
9819,google,743
8118,facebook,673
7890,eu,651
26476,wells fargo,629
24475,the wall street journal,621
25060,treasury,614
1606,apple,611
25278,twitter,588
