# Set up

In [102]:
import jsonlines
import os
import pandas as pd
import copy
import json

# Assemble data

The aim is to structure each company's record like this:
```
{"CTVA": {
"name": "Corteva, Inc.", 
"location": ["Chestnut Run Plaza 735", "PO Box 80735", "United States"], 
"website": "http://www.corteva.com", 
"section": "Basic Materials", 
"industry": "Agricultural Inputs", 
"profile": "https://finance.yahoo.com/quote/CTVA/profile?p=CTVA",
"stock prices": pandas.DataFrame,
"ROA": pandas.DataFrame
}}
```

One `json` file corresponds to a company.  

However, `json` cannot serialise a `pandas.DataFrame` directly. To avoid losing data or introducing escape characters, we encode each `DataFrame` with `DataFrame.to_dict()` and store the resulting `dict` in the `json` file.

To decode the `json`, we use `pandas.DataFrame.from_dict()` to turn each `dict` back into a `DataFrame`. Thus, the final structure of each file (company) is:
```
{"CTVA": {
"name": "Corteva, Inc.", 
"location": ["Chestnut Run Plaza 735", "PO Box 80735", "United States"], 
"website": "http://www.corteva.com", 
"section": "Basic Materials", 
"industry": "Agricultural Inputs", 
"profile": "https://finance.yahoo.com/quote/CTVA/profile?p=CTVA",
"stock prices": dict,
"ROA": dict
}}
```

## Get the company basic information

In [103]:
# The project root is the parent of the notebook's working directory; the
# per-industry profile files (*.jsonl) are read from <root>/webpage.
root_path = os.path.abspath(os.path.dirname(os.getcwd()))
web_path = os.path.join(root_path, 'webpage')

# total maps industry name (file name without '.jsonl') -> {ticker: profile dict}.
total = {}
# sorted() keeps the industry order reproducible; os.listdir() order is arbitrary.
for f in sorted(os.listdir(web_path)):
    if not f.endswith('.jsonl'):
        continue
    # Strip only the trailing extension; str.replace would also remove
    # any '.jsonl' that happens to occur in the middle of the file name.
    inds = f[:-len('.jsonl')]
    total[inds] = {}
    file_path = os.path.join(web_path, f)
    with jsonlines.open(file_path, 'r') as lines:
        for line in lines:
            # Only the first entry of each line is used.
            # NOTE(review): assumes each line is a single-entry object
            # {ticker: profile} -- confirm against the scraper output.
            ticker, profile = next(iter(line.items()))
            total[inds][ticker] = profile

In [104]:
# Show which industries were loaded (one key per .jsonl file found in web_path).
total.keys()

dict_keys(['agriculture', 'energy', 'travel'])

In [108]:
# Convenience aliases for the three per-industry dicts; these are the same
# objects stored in `total`, not copies.
agriculture, energy, travel = (
    total[name] for name in ('agriculture', 'energy', 'travel')
)

In [109]:
# NOTE(review): this echoes the entire nested dict, including every row of
# each 'stock prices' table. The saved output already contains 'stock prices',
# so this cell was executed after the "Store the stock prices" cell further
# down; on a fresh top-to-bottom run the output would differ.
agriculture

{'CTVA': {'name': 'Corteva, Inc.',
  'location': ['Chestnut Run Plaza 735', 'PO Box 80735', 'United States'],
  'website': 'http://www.corteva.com',
  'section': 'Basic Materials',
  'industry': 'Agricultural Inputs',
  'profile': 'https://finance.yahoo.com/quote/CTVA/profile?p=CTVA',
  'stock prices': {'Date': {0: '2019-05-24',
    1: '2019-05-28',
    2: '2019-05-29',
    3: '2019-05-30',
    4: '2019-05-31',
    5: '2019-06-03',
    6: '2019-06-04',
    7: '2019-06-05',
    8: '2019-06-06',
    9: '2019-06-07',
    10: '2019-06-10',
    11: '2019-06-11',
    12: '2019-06-12',
    13: '2019-06-13',
    14: '2019-06-14',
    15: '2019-06-17',
    16: '2019-06-18',
    17: '2019-06-19',
    18: '2019-06-20',
    19: '2019-06-21',
    20: '2019-06-24',
    21: '2019-06-25',
    22: '2019-06-26',
    23: '2019-06-27',
    24: '2019-06-28',
    25: '2019-07-01',
    26: '2019-07-02',
    27: '2019-07-03',
    28: '2019-07-05',
    29: '2019-07-08',
    30: '2019-07-09',
    31: '2019-07-1

## Store the stock prices

In [107]:
data_path = os.path.join(root_path, 'data')

# Attach each company's price history, read from data/<industry>/<ticker>.csv,
# as a plain dict so it can later be serialised to JSON. Companies whose CSV
# cannot be loaded are removed from `total`.
for inds in total:
    inds_path = os.path.join(data_path, inds)
    # Iterate over a snapshot of the tickers because entries may be deleted
    # below; a list of keys is enough, no deep copy of the data is required.
    for comp in list(total[inds]):
        file_path = os.path.join(inds_path, comp + '.csv')
        try:
            total[inds][comp]["stock prices"] = pd.read_csv(file_path).to_dict()
        except (OSError, ValueError) as err:
            # OSError covers a missing or unreadable file; ValueError covers
            # pandas' EmptyDataError / ParserError and decoding failures.
            # Anything else (e.g. KeyboardInterrupt, NameError) now propagates
            # instead of silently dropping the company.
            print('Dropping {}/{}: cannot load {} ({})'.format(inds, comp, file_path, err))
            del total[inds][comp]

## Store the ROA/ROI data

In [113]:
inds_path = os.path.join(data_path, 'agriculture')
fina_path = os.path.join(inds_path, 'Financial Statistic_Agriculture')

# Attach the ROA rows of each agriculture company's annual financials file
# (<ticker>_annual_financials.csv) as a plain dict under the "ROA" key.
# No snapshot of `total` is needed: the loop only adds a key to each company's
# own dict and never adds or removes companies from the dict being iterated.
for comp in total['agriculture']:
    file_path = os.path.join(fina_path, comp + '_annual_financials.csv')
    financials = pd.read_csv(file_path)
    # Keep only the rows whose 'name' column equals 'ROA'.
    roa_rows = financials.loc[financials['name'] == 'ROA']
    total['agriculture'][comp]["ROA"] = roa_rows.to_dict()

# Store the data into json for each company

In [115]:
# Write one <ticker>.json per agriculture company into `inds_path`; each file
# holds a single-key object {ticker: record}.
for comp, record in total['agriculture'].items():
    file_path = os.path.join(inds_path, comp + '.json')
    with open(file_path, 'w') as file:
        json.dump({comp: record}, file)

# Read the json

In [116]:
# Load the <ticker>.json file of every agriculture company from `inds_path`
# into a list of {ticker: record} dicts, in the iteration order of `total`.
t = []
for comp in total['agriculture']:
    file_path = os.path.join(inds_path, comp + '.json')
    with open(file_path, 'r') as file:
        loaded = json.load(file)
    t.append(loaded)

In [122]:
# Rebuild a DataFrame from the dict stored under 'stock prices' in the first
# loaded record (looked up by the 'CTVA' key, so t[0] must be the CTVA file).
# NOTE(review): JSON object keys are strings, so the row labels read back from
# the file are presumably '0', '1', ... rather than ints -- confirm before
# relying on integer index lookups.
pd.DataFrame.from_dict(t[0]['CTVA']['stock prices'])

Unnamed: 0,Date,High,Low,Open,Close,Volume,Adj Close
0,2019-05-24,32.000000,28.500000,29.750000,29.000000,44000.0,28.105646
1,2019-05-28,30.500000,28.510000,29.250000,28.764999,61500.0,27.877888
2,2019-05-29,29.900000,27.660000,29.900000,28.500000,207800.0,27.621063
3,2019-05-30,29.160000,28.219999,28.990000,28.799999,666000.0,27.911810
4,2019-05-31,28.590000,26.969999,28.500000,26.969999,853800.0,26.138248
...,...,...,...,...,...,...,...
401,2020-12-24,39.549999,39.029999,39.029999,39.500000,847600.0,39.386795
402,2020-12-28,39.570000,38.320000,39.369999,38.349998,2545800.0,38.240089
403,2020-12-29,38.919998,37.674999,38.669998,37.939999,3183900.0,37.831264
404,2020-12-30,38.689999,38.009998,38.189999,38.439999,3396800.0,38.329830
