# Set up

In [1]:
import jsonlines
import os
import pandas as pd
import copy
import json
import re

# Assemble data

The aim is to structure each company's record like this:
```
{"CTVA": {
"name": "Corteva, Inc.", 
"location": ["Chestnut Run Plaza 735", "PO Box 80735", "United States"], 
"website": "http://www.corteva.com", 
"section": "Basic Materials", 
"industry": "Agricultural Inputs", 
"profile": "https://finance.yahoo.com/quote/CTVA/profile?p=CTVA",
"stock prices": pandas.DataFrame,
"ROA": pandas.DataFrame}
}
```

One `json` file corresponds to a company.  

However, the `json` module can't serialize a `pandas.DataFrame` directly. To avoid losing data or adding escape characters, we encode each `DataFrame` with `DataFrame.to_dict()` and store the resulting `dict` in the `json` file.

To decode the `json`, we use `pandas.DataFrame.from_dict()` to turn the `dict` back into a `DataFrame`. Thus, the final structure for each file (company) is:
```
{"CTVA": {
"name": "Corteva, Inc.", 
"location": ["Chestnut Run Plaza 735", "PO Box 80735", "United States"], 
"website": "http://www.corteva.com", 
"section": "Basic Materials", 
"industry": "Agricultural Inputs", 
"profile": "https://finance.yahoo.com/quote/CTVA/profile?p=CTVA",
"stock prices": dict,
"ROA": dict}
}
```

## Get the company basic information

In [2]:
# The project root is the parent of the current working directory; the company
# profiles are read from <root>/webpage, one .jsonl file per industry.
root_path = os.path.abspath(os.path.dirname(os.getcwd()))
web_path = os.path.join(root_path, 'webpage')

# total maps industry name -> {ticker: company info}.
total = {}
# os.listdir returns entries in arbitrary order; sort so that the industries
# are always loaded in the same order.
for f in sorted(os.listdir(web_path)):
    # Split off the extension only; str.replace would also remove a '.jsonl'
    # that happens to appear in the middle of the file name.
    inds, ext = os.path.splitext(f)
    if ext != '.jsonl':
        continue
    total[inds] = {}
    file_path = os.path.join(web_path, f)
    with jsonlines.open(file_path, 'r') as lines:
        for line in lines:
            # Each line is a mapping such as {"CTVA": {...}}; merge it in so
            # that no key of the record is dropped.
            total[inds].update(line)

In [3]:
# Show the industries that were loaded (one per .jsonl file).
total.keys()

dict_keys(['agriculture', 'energy', 'travel'])

In [4]:
# Per-industry shortcuts. Each name is bound to the very same dict object that
# is stored in `total`, so a change made through one is visible through the other.
agriculture, energy, travel = (
    total[industry] for industry in ('agriculture', 'energy', 'travel')
)

In [5]:
# Inspect the agriculture companies (ticker -> basic company information).
agriculture

{'CTVA': {'name': 'Corteva, Inc.',
  'location': ['Chestnut Run Plaza 735', 'PO Box 80735', 'United States'],
  'website': 'http://www.corteva.com',
  'section': 'Basic Materials',
  'industry': 'Agricultural Inputs',
  'profile': 'https://finance.yahoo.com/quote/CTVA/profile?p=CTVA'},
 'MBII': {'name': 'Marrone Bio Innovations, Inc.',
  'location': ['1540 Drew Avenue', 'Davis, CA 95618', 'United States'],
  'website': 'http://www.marronebio.com',
  'section': 'Basic Materials',
  'industry': 'Agricultural Inputs',
  'profile': 'https://finance.yahoo.com/quote/MBII/profile?p=MBII'},
 'AVD': {'name': 'American Vanguard Corporation',
  'location': ['4695 MacArthur Court',
   'Newport Beach, CA 92660',
   'United States'],
  'website': 'http://www.american-vanguard.com',
  'section': 'Basic Materials',
  'industry': 'Agricultural Inputs',
  'profile': 'https://finance.yahoo.com/quote/AVD/profile?p=AVD'},
 'ICL': {'name': 'ICL Group Ltd',
  'location': ['Millenium Tower',
   '23 Aranha Str

## Store the stock prices

In [6]:
# Stock price files are read from <root>/data/<industry>/<ticker>.csv.
data_path = os.path.join(root_path, 'data')

# Companies removed from `total` because their price file could not be loaded,
# recorded as (industry, ticker, reason) so the removals are not silent.
dropped = []

for inds in total:
    inds_path = os.path.join(data_path, inds)
    # Iterate over a snapshot of the tickers: entries are deleted from
    # total[inds] inside the loop, which is not allowed while iterating the
    # dict itself. A list of keys is enough; no deep copy is required.
    for comp in list(total[inds]):
        file_name = comp + '.csv'
        file_path = os.path.join(inds_path, file_name)
        try:
            prices = pd.read_csv(file_path)
        except (OSError, ValueError) as err:
            # OSError covers a missing or unreadable file; pandas' EmptyDataError
            # and ParserError (and UnicodeDecodeError) are ValueError subclasses.
            # Anything else, e.g. KeyboardInterrupt, is allowed to propagate.
            del total[inds][comp]
            dropped.append((inds, comp, repr(err)))
        else:
            # Store the frame as a plain dict so it can be written to json later.
            total[inds][comp]["stock prices"] = prices.to_dict()

print(len(dropped), 'companies dropped because their stock prices could not be loaded')

## Store the ROA/ROI data

In [7]:
# ROA figures for the agriculture companies are read from
# <data>/agriculture/Financial Statistic_Agriculture/<ticker>_annual_financials.csv.
inds_path = os.path.join(data_path, 'agriculture')
fina_path = os.path.join(inds_path, 'Financial Statistic_Agriculture')

# The loop only adds a key to each company's own dict, so total['agriculture']
# keeps its size and can be iterated directly; deep-copying the whole data set
# (including every stock price table) is unnecessary.
for comp, info in total['agriculture'].items():
    file_name = comp + '_annual_financials.csv'
    file_path = os.path.join(fina_path, file_name)
    financials = pd.read_csv(file_path)
    # Keep only the rows whose 'name' column equals 'ROA'.
    roa = financials.loc[financials['name'] == 'ROA']
    # Stored as a plain dict so it can be written to json later.
    info["ROA"] = roa.to_dict()

## Store the US state of each company

REF: https://gist.github.com/rogerallen/1583593

In [52]:
# Hand-corrected addresses keyed by ticker; the page each correction was taken
# from is noted next to it.
location_fixes = {
    # https://www.corteva.com/contact-us.html
    'CTVA': ['Chestnut Run Plaza 735', 'PO Box 80735',
             'Wilmington, DE 19805-0735', 'United States'],
    # https://ir.mgpingredients.com/governance/governance-contacts
    # NOTE(review): the street line appears twice in slightly different
    # spellings (the second with a trailing space) — confirm this is intended.
    'MGPI': ['Cray Business Plaza', '100 Commercial Street PO Box 130',
             '100 Commercial Street, P.O. Box 130 ', 'Atchison, KS 66002',
             'United States'],
}
for ticker, fixed_location in location_fixes.items():
    agriculture[ticker]['location'] = fixed_location

In [53]:
# Load the US state lookup table. It is indexed below by the two-letter code
# found in the address (presumably mapping abbreviation -> state name; verify
# against us_state.json).
file_name = os.path.join(data_path, 'us_state.json')
with open(file_name, 'r') as f:
    us_state_dict = json.load(f)

# Two capital letters directly followed by a space and a digit, e.g. the
# "CA" in "Davis, CA 95618". Compiled once instead of on every iteration.
state_pattern = re.compile(r'[A-Z]{2}(?= \d)')

# Number of companies for which a state could be stored.
count = 0
for comp, v in agriculture.items():
    print(comp, v['location'])
    # The state is looked for in the second-to-last address line (the line
    # before the country); skip addresses too short to have one.
    if len(v['location']) < 2:
        continue
    state_str = v['location'][-2]
    match = state_pattern.search(state_str)
    if match is None:
        # No "XX 12345"-style fragment in this address line.
        continue
    state = match.group()
    print(state)
    if state not in us_state_dict:
        # A two-letter code that is not in the lookup table.
        continue
    agriculture[comp]['state'] = us_state_dict[state]
    count += 1
count

CTVA ['Chestnut Run Plaza 735', 'PO Box 80735', 'Wilmington, DE 19805-0735', 'United States']
DE
MBII ['1540 Drew Avenue', 'Davis, CA 95618', 'United States']
CA
AVD ['4695 MacArthur Court', 'Newport Beach, CA 92660', 'United States']
CA
ICL ['Millenium Tower', '23 Aranha Street', 'Tel Aviv 6107025', 'Israel']
IPI ['1001 17th Street', 'Suite 1050', 'Denver, CO 80202', 'United States']
CO
NTR ['Suite 500', 'Saskatoon, SK S7K 7G3', 'Canada']
MGPI ['Cray Business Plaza', '100 Commercial Street PO Box 130', '100 Commercial Street, P.O. Box 130 ', 'Atchison, KS 66002', 'United States']
KS
SMG ['14111 Scottslawn Road', 'Marysville, OH 43041', 'United States']
OH
CF ['4 Parkway North', 'Suite 400', 'Deerfield, IL 60015', 'United States']
IL
YTEN ['19 Presidential Way', 'Woburn, MA 01801', 'United States']
MA
CTA-PB ['974 Centre Road', 'Wilmington, DE 19805', 'United States']
DE
UAN ['2277 Plaza Drive', 'Suite 500', 'Sugar Land, TX 77479', 'United States']
TX
CGA ['Borough A', 'China']
CTA-PA 

14

In [47]:
# Experiment: try to resolve an address line through geopy's Nominatim geocoder.
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="find state")
# Only the building name is queried, without street, city or state.
location = geolocator.geocode("Cray Business Plaza")
# The recorded output for this query is a place in Wales, UK, not the
# Atchison, KS address listed for MGPI.
# NOTE(review): presumably geocode() gives no usable result when nothing
# matches, in which case .address would fail — confirm in the geopy docs.
print(location.address)

Cray, Powys, Cymru / Wales, United Kingdom


# Store the data into json for each company

In [54]:
# Write one json file per agriculture company to <data>/agriculture/<ticker>.json.
# The target directory is built here explicitly instead of relying on whatever
# value inds_path was last given by an earlier cell.
inds_path = os.path.join(data_path, 'agriculture')

for comp, info in total['agriculture'].items():
    file_name = comp + '.json'
    file_path = os.path.join(inds_path, file_name)
    with open(file_path, 'w') as file:
        # Each file holds a single {ticker: company record} mapping.
        json.dump({comp: info}, file)

# Read the json

In [None]:
# Load every agriculture company's json file back into memory. `t` receives one
# parsed file per company, in the iteration order of total['agriculture'].
t = []
for ticker in total['agriculture']:
    json_path = os.path.join(inds_path, ticker + '.json')
    with open(json_path, 'r') as fh:
        record = json.load(fh)
    t.append(record)

In [None]:
# Rebuild the stock price table of the first loaded company (CTVA).
# JSON object keys are always strings, so the integer row labels written by
# to_dict() come back as '0', '1', ...; convert them to integers again and
# sort so the rows are in their original numeric order.
stock_prices = pd.DataFrame.from_dict(t[0]['CTVA']['stock prices'])
stock_prices.index = stock_prices.index.astype(int)
stock_prices.sort_index()