# <center>Gathering Supply Chain Data</center>

- Date: March 2019
- Author: Minsu Yeom, CFA, FRM

In [1]:
import os
import numpy as np
import pandas as pd
import bamboolib as bam
import xlwings as xw
from tqdm import tqdm
from datetime import date
from utils.misc import get_excel_column_name, get_last_bday

In [2]:
# Reload all modules (except those excluded by %aimport) before each cell is executed,
# so that edits to imported modules take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

# 1. Loading Equity Universe

In [3]:
# Load the equity universe. The column labels are supplied explicitly through
# `names`; `Fiscal Year End` is parsed as a date while reading.
universe = pd.read_csv(
    './dataset/comprel/eqy_universe_gt50mm.log',
    header=1,
    names=[
        'Identifier',
        'Name',
        'Revenue',
        'Company Type',
        'Business Description',
        'FactSet Industry',
        'Crunchbase Category(BETA)',
        'Crunchbase Rank(BETA)',
        'Ultimate Parent Name',
        'Fiscal Year End',
        'Country',
        'Website',
    ],
    parse_dates=['Fiscal Year End'],
)

#### Change dtypes properly

In [4]:
# Convert the columns to proper dtypes; entries that cannot be parsed become NaN / NaT
# (errors='coerce') instead of raising.
# The columns are replaced with `universe[col] = ...` rather than `universe.loc[:, col] = ...`:
# on recent pandas the `.loc[:, col]` form writes into the existing column in place and can
# leave its original object dtype unchanged, silently undoing the conversion.
universe['Revenue'] = pd.to_numeric(universe['Revenue'], errors='coerce')
universe['Fiscal Year End'] = pd.to_datetime(universe['Fiscal Year End'], errors='coerce')

In [5]:
# Check the dtypes after conversion: `Revenue` should be numeric and `Fiscal Year End` datetime.
universe.dtypes

Identifier                           object
Name                                 object
Revenue                             float64
Company Type                         object
Business Description                 object
FactSet Industry                     object
Crunchbase Category(BETA)            object
Crunchbase Rank(BETA)                object
Ultimate Parent Name                 object
Fiscal Year End              datetime64[ns]
Country                              object
Website                              object
dtype: object

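#### Setting the minimum market cap criterion in USD mm
- Note: the threshold is compared against the `Revenue` column, as the loaded universe has no market-cap column.
- Reset the index.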

In [6]:
# Size threshold in USD mm. NOTE(review): despite its name, the threshold is
# compared against the `Revenue` column -- confirm this is the intended measure.
min_market_cap = 200
# Keep only companies at or above the threshold, then renumber the rows from 0
# (rows whose Revenue is NaN fail the comparison and are dropped).
universe = universe[universe['Revenue'] >= min_market_cap].reset_index(drop=True)

### Missing value analysis

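- Mostly complete: only `Fiscal Year End` (1 missing) and `Website` (127 missing) have gaps; every other column is fully populated.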

In [7]:
# Count the missing values in each column.
universe.isna().sum()

Identifier                     0
Name                           0
Revenue                        0
Company Type                   0
Business Description           0
FactSet Industry               0
Crunchbase Category(BETA)      0
Crunchbase Rank(BETA)          0
Ultimate Parent Name           0
Fiscal Year End                1
Country                        0
Website                      127
dtype: int64

#### Launch Excel, making sure the FactSet add-ins are enabled
- `fdswFixExcel.exe` enables the add-ins.
- `retcode`: outcome of the run -- exit code 0 if it runs fine, 1 if there is any error.

In [8]:
import subprocess

# Run the FactSet utility that enables the Excel add-ins.
# `subprocess.run` returns a CompletedProcess object, so keep its `.returncode`
# to make `retcode` the integer exit status (0 if it ran fine).
retcode = subprocess.run(['C:/Program Files (x86)/FactSet/fdswFixExcel.exe']).returncode

#### Set an Excel header

In [9]:
# Metadata fields that this notebook writes itself for every company block.
header_meta = ['date', 'ticker', 'relationship']
# Fields filled from the FactSet relationship formula output.
header_fcst = [
    'rel_comp_tic',
    'rel_comp_nm',
    'no_overlap',
    'pct_overlap',
    'revenue_dependence',
    'source',
    'prtr_rel',
]
# Full header: metadata fields first, FactSet fields after.
header = [*header_meta, *header_fcst]

#### Set headers for Excel rows, and Excel ranges for a relationship data set

Headers

In [10]:
# Map each metadata field to its 1-based row offset inside a company's block of rows.
meta_row = {field: offset for offset, field in enumerate(header_meta, start=1)}

# Every company occupies a block of `per_company_rows` rows (one row per header
# field) spanning `per_company_cols` columns.
per_company_cols = 500
per_company_rows = len(header)
# Number of companies extracted per run; runs are repeated until the whole
# universe has been processed.
batch_sz = 10
alphabet_count = 26

# Corners of the area, within the first company block, that receives the FactSet
# formula output: it starts on the row right after the metadata rows.
# NOTE(review): `get_excel_column_name` presumably turns a 0-based column index
# into Excel column letters -- confirm against utils.misc.
fcst_start_col = 'A'
fcst_end_col = get_excel_column_name(per_company_cols - 1)
fcst_start_row = len(header_meta) + 1
fcst_end_row = per_company_rows

The whole data range

In [11]:
# Number of sheet rows occupied by one full batch of companies.
data_height = batch_sz * len(header)
# One Excel address per company slot in a batch, covering all of that slot's
# rows from column A to the last per-company column.
data_range = [
    'A{}:{}{}'.format(
        slot * per_company_rows + 1,
        get_excel_column_name(per_company_cols - 1),
        (slot + 1) * per_company_rows,
    )
    for slot in range(batch_sz)
]

#### FactSet settings

In [12]:
# FactSet relationship codes.
relationships = [
    'SUPL',  # Suppliers
    'CUST',  # Customers
    'PRTR',  # Partners
    'COMP',  # Competitors
]

# Fragments of the Excel formula. Placed around a ticker and a relationship code
# they produce, for example:
# =FDS("GOOGL-US", "TRANSPOSE(FF_COMPANY_RELATIONSHIP(SUPL,PUB,Ticker,ALL,ALL))")
fds_fn = '", "TRANSPOSE(FF_COMPANY_RELATIONSHIP('
fds_param = ',PUB,Ticker,ALL,ALL))")'

#### Load company relationship data from FactSet and write it to an Excel workbook by
- Writing `=FDS()` formulas directly into an Excel sheet named after each entry in `relationships`
- Allocating `per_company_rows` rows for each company

In [13]:
def get_company_relationships(tickers):
    """Create one sheet per relationship code and fill it for the given tickers.

    For every code in `relationships`, a sheet named after the code is added with
    `xw.sheets.add`. Each ticker is given a block of `per_company_rows` rows on
    that sheet: the metadata rows (date, ticker, relationship) are filled with
    constant values across the whole row, and the remaining rows receive an
    `=FDS(...)` array formula built from the ticker and the relationship code.

    Args:
        tickers: Sequence of identifiers (e.g. 'GOOGL-US'). A ticker's position
            in the sequence decides which block of rows it occupies. The sequence
            is traversed once per relationship code.

    Returns:
        dict: relationship code -> the sheet created for it.
    """
    sheets = {}
    # The date is the same for every cell written by this call, so look it up once
    # instead of once per ticker and relationship.
    as_of_date = get_last_bday()

    # Iterate over ['SUPL', 'CUST', 'PRTR', 'COMP']
    for rel in relationships:
        sheet = xw.sheets.add(rel)    # Create a sheet per relationship
        sheets[rel] = sheet

        for position, ticker in enumerate(tickers):
            block_top = position * per_company_rows

            # Array formula that loads the relationship data into the non-metadata rows.
            fcst_range = '{:s}{:d}:{:s}{:d}'.format(
                fcst_start_col, block_top + fcst_start_row,
                fcst_end_col, block_top + fcst_end_row)
            sheet.range(fcst_range).formula_array = '=FDS("' + ticker + fds_fn + rel + fds_param

            # Date, ticker and relationship are repeated along their whole row.
            # Any metadata field other than 'date' / 'relationship' gets the ticker.
            special_values = {'date': as_of_date, 'relationship': rel}
            for meta_type, row_no in meta_row.items():
                row = block_top + row_no
                meta_range = '{:s}{:d}:{:s}{:d}'.format('A', row, fcst_end_col, row)
                sheet.range(meta_range).value = special_values.get(meta_type, ticker)

    return sheets

#### Extract data from each sheet in `ws` and merge it into `df`
- We also do: resetting the index and setting the column names.

In [14]:
# Display the current universe for a visual check before extraction.
universe

Unnamed: 0,Identifier,Name,Revenue,Company Type,Business Description,FactSet Industry,Crunchbase Category(BETA),Crunchbase Rank(BETA),Ultimate Parent Name,Fiscal Year End,Country,Website
0,2GO-PH,"2GO Group, Inc.",413.47714,Public Company,"2GO Group, Inc. engages in the provision of pr...",Marine Shipping,-,-,"2GO Group, Inc.",2019-12-31,Philippines,http://www.2go.com.ph
1,601360-CN,"360 Security Technology, Inc.",1948.56120,Public Company,"360 Security Technology, Inc. provides interne...",Packaged Software,-,-,"360 Security Technology, Inc.",2018-12-31,China,http://www.360.cn
2,1361-HK,361 Degrees International Ltd.,815.04840,Public Company,361 Degrees International Ltd. is an investmen...,Apparel/Footwear,-,-,361 Degrees International Ltd.,2019-12-31,China,http://www.361sport.com
3,523395-IN,3M India Limited,431.16843,Public Company,3M India Ltd. engages in the business of indus...,Specialty Telecommunications,-,-,3M Company,2019-03-31,India,http://www.3mindia.in
4,1530-HK,"3SBio, Inc.",692.74005,Public Company,"3SBio, Inc. operates as in investment holding ...",Biotechnology,-,-,"3SBio, Inc.",2018-12-31,China,http://www.3sbio.com
...,...,...,...,...,...,...,...,...,...,...,...,...
14236,ZM-US,"Zoom Video Communications, Inc.",622.65800,Public Company,"Zoom Video Communications, Inc. engages in the...",Packaged Software,-,-,"Zoom Video Communications, Inc.",2020-01-31,United States,http://www.zoom.us
14237,ZS-US,"Zscaler, Inc.",302.83600,Public Company,"Zscaler, Inc. engages in the provision of clou...",Packaged Software,-,-,"Zscaler, Inc.",2019-07-31,United States,http://www.zscaler.com
14238,ZUMZ-US,Zumiez Inc.,1034.12900,Public Company,"Zumiez, Inc. engages in retailing apparel, foo...",Apparel/Footwear Retail,-,-,Zumiez Inc.,2020-02-01,United States,http://www.zumiez.com
14239,ZUO-US,"Zuora, Inc.",276.05700,Public Company,"Zuora, Inc. operates as an online subscription...",Packaged Software,-,-,"Zuora, Inc.",2020-01-31,United States,http://www.zuora.com


In [15]:
# Extract the relationship data for the whole universe, `batch_sz` companies at a
# time, writing an intermediate pickle roughly every `save_unit` companies.
df = pd.DataFrame()
ws_list = []      # Frames read from Excel since the last checkpoint
df_chunks = []    # Cleaned frames, one per checkpoint written so far
total_sz = universe.shape[0] # The total number of target companies to extract (use e.g. 100 for a trial run)
save_unit = 500
pickle_dir = './dataset/comprel/pickle/'
# Create the checkpoint folder up front so a long run cannot fail at its first save.
os.makedirs(pickle_dir, exist_ok=True)


def _save_checkpoint(frames, label):
    """Merge `frames`, drop rows without a related-company ticker, pickle and return the result."""
    chunk = pd.concat(frames)
    # Columns are still positional here, so the ticker column is addressed by its index in `header`.
    chunk = chunk.dropna(subset=[header.index('rel_comp_tic')])
    chunk.to_pickle(pickle_dir + 'company_relationships_' + str(label) + '.pkl')
    return chunk


for idx in tqdm(range(0, total_sz, batch_sz)):
    # `.loc` slicing is label based and inclusive, hence the -1; it relies on the
    # universe having a 0..n-1 index.
    tickers = universe.loc[idx:idx+batch_sz-1, 'Identifier'].to_list()
    ws = get_company_relationships(tickers)

    for sheet in ws:
        for r in data_range:
            # Each company block is stored row-wise in Excel; transpose it so that
            # every header field becomes a column.
            ws_list.append(pd.DataFrame(ws[sheet].range(r).options(transpose=True).value))
        # The sheet is only a scratch area; remove it once its data has been read.
        ws[sheet].delete()

    # Pickle each intermediate result per `save_unit` company.
    if idx != 0 and (idx % save_unit == 0):
        df_chunks.append(_save_checkpoint(ws_list, idx))
        ws_list = []

# Batches processed after the last checkpoint are still pending in `ws_list`;
# save them too, otherwise the tail of the universe would be lost.
if ws_list:
    df_chunks.append(_save_checkpoint(ws_list, idx))
    ws_list = []

if df_chunks:
    df = pd.concat(df_chunks)
    df.columns = header
else:
    # Nothing was extracted (empty universe): keep an empty frame with the right columns.
    df = pd.DataFrame(columns=header)
df = df.reset_index(drop = True)

100%|████████████████████████████████████████████████████████████████████████████| 1425/1425 [4:56:38<00:00, 12.49s/it]


In [16]:
# Save the combined relationship table as a pickle.
df.to_pickle('./dataset/comprel/company_relationships.pkl')

In [17]:
# Save the same table in Feather format as well.
df.to_feather('./dataset/comprel/company_relationships.feather')

## ----------------- The end of work -------------------