In [8]:
from dotenv import load_dotenv
import logging
import pandas as pd
import os
from sqlalchemy.engine.url import URL
from sqlalchemy.engine import create_engine

from beis_indicators import project_dir


logger = logging.getLogger(__name__)

# NOTE(review): the constant is named GTR_* but the path points at the
# crunchbase raw-data folder -- confirm which one is intended before renaming.
GTR_RAW_DIR = f'{project_dir}/data/raw/crunchbase/'

# Load variables from a .env file so the os.getenv() lookups below can see them.
load_dotenv()

# Calling URL(...) directly is deprecated since SQLAlchemy 1.4 and fails on 2.0
# when `query` is omitted; URL.create() is the supported constructor there.
# Older releases have no URL.create(), so fall back to the class itself.
_make_url = getattr(URL, 'create', URL)
url = _make_url(drivername='mysql+pymysql',
                username=os.getenv('DB_USER'),
                password=os.getenv('DB_PASS'),
                host=os.getenv('DB_HOST'),
                port=os.getenv('DB_PORT'),
                database='production')
engine = create_engine(url)

In [3]:
import wikipedia 
from functools import lru_cache 
 
@lru_cache()
def get_page_cats(query):
    """Return the categories of the Wikipedia page titled ``query``.

    Results are memoised with ``lru_cache`` so repeated lookups of the same
    title do not hit the Wikipedia API again.

    Args:
        query: Exact page title to look up (auto-suggestion is disabled).

    Returns:
        The page's ``categories`` value, or an empty list when the title does
        not resolve to exactly one page (missing or ambiguous title).
    """
    try:
        page = wikipedia.page(query, auto_suggest=False)
    except (wikipedia.PageError, wikipedia.DisambiguationError):
        # An ambiguous title is as unusable as a missing one here; letting
        # DisambiguationError escape would abort a whole is_tech() traversal.
        return []
    return page.categories
 
def is_tech(query, depth=0, max_depth=2):
    """Check whether ``query`` leads to the 'Technology-related lists' category.

    The categories returned by ``get_page_cats(query)`` are inspected first;
    if the marker category is absent, each category name is looked up in turn
    (depth-first) until ``depth`` equals ``max_depth``.

    NOTE(review): category names are fed back into ``get_page_cats`` as plain
    page titles -- confirm that this is the intended lookup.

    Args:
        query: Title passed to ``get_page_cats``.
        depth: Current recursion level (callers normally leave this at 0).
        max_depth: Level at which the search stops descending.

    Returns:
        True if the marker category is found within the depth limit, else False.
    """
    categories = get_page_cats(query)
    if 'Technology-related lists' in categories:
        return True
    if depth == max_depth:
        return False
    # any() short-circuits on the first category that resolves to tech.
    return any(
        is_tech(category, depth=depth + 1, max_depth=max_depth)
        for category in categories
    )

In [15]:
from collections import Counter
from itertools import chain

In [31]:
# Tally every category group across the non-null rows, most frequent first.
Counter(chain.from_iterable(df['category_groups_list'].dropna())).most_common()

[('Software', 88),
 ('Media and Entertainment', 60),
 ('Hardware', 52),
 ('Financial Services', 45),
 ('Information Technology', 41),
 ('Health Care', 37),
 ('Internet Services', 37),
 ('Sales and Marketing', 34),
 ('Transportation', 31),
 ('Commerce and Shopping', 29),
 ('Data and Analytics', 28),
 ('Science and Engineering', 28),
 ('Sports', 28),
 ('Real Estate', 28),
 ('Food and Beverage', 27),
 ('Education', 21),
 ('Sustainability', 19),
 ('Platforms', 17),
 ('Administrative Services', 17),
 ('Energy', 17),
 ('Community and Lifestyle', 17),
 ('Travel and Tourism', 16),
 ('Consumer Goods', 16),
 ('Content and Publishing', 16),
 ('Professional Services', 15),
 ('Lending and Investments', 15),
 ('Design', 15),
 ('Privacy and Security', 15),
 ('Advertising', 14),
 ('Natural Resources', 13),
 ('Consumer Electronics', 13),
 ('Manufacturing', 12),
 ('Video', 12),
 ('Gaming', 12),
 ('Agriculture and Farming', 10),
 ('Mobile', 10),
 ('Music and Audio', 10),
 ('Payments', 10),
 ('Government 

In [32]:
from sklearn.preprocessing import MultiLabelBinarizer

In [42]:
# Hand-picked category groups that are treated as technology-related.
tech_groups = [
    'Software',
    'Hardware',
    'Information Technology',
    'Internet Services',
    'Data and Analytics',
    'Science and Engineering',
    'Platforms',
    'Consumer Electronics',
    'Manufacturing',
    'Gaming',
    'Mobile',
    'Biotechnology',
    'Apps',
    'Artificial Intelligence',
    'Energy',
]

In [51]:
# Save the tech groups alphabetically, one name per line (no trailing newline).
with open('../../data/aux/cb_tech_groups.txt', 'w') as groups_file:
    groups_file.write('\n'.join(sorted(tech_groups)))

In [60]:
# Render the groups as a single-quoted, comma-separated string for copy/paste.
', '.join(f"'{t}'" for t in tech_groups)

"'Apps', 'Artificial Intelligence', 'Biotechnology', 'Consumer Electronics', 'Data and Analytics', 'Energy', 'Gaming', 'Hardware', 'Information Technology', 'Internet Services', 'Manufacturing', 'Mobile', 'Platforms', 'Science and Engineering', 'Software'"

In [34]:
# Reload the saved list of technology-related category groups (one per line).
with open('../../data/aux/cb_tech_groups.txt', 'r') as f:
    # Drop blank lines so an empty label never reaches the column lookup below.
    tech_groups = [line.strip() for line in f.read().splitlines() if line.strip()]

# Fetch the category table and split the comma-separated groups into lists,
# discarding rows that have no groups at all.
cat_query = 'SELECT * FROM crunchbase_category_groups'
df = pd.read_sql(cat_query, con=engine)
df['category_groups_list'] = df['category_groups_list'].str.split(',')
df = df[~pd.isnull(df['category_groups_list'])]

# One 0/1 indicator column per category group, aligned on df's index.
mlb = MultiLabelBinarizer()
mlb_vecs = mlb.fit_transform(df['category_groups_list'])
mlb_df = pd.DataFrame(data=mlb_vecs, columns=mlb.classes_, index=df.index)

# Selecting a label that is not a column raises KeyError, so only look up the
# tech groups that actually occur in the table and report any that do not.
known_tech_groups = [group for group in tech_groups if group in mlb_df.columns]
missing_tech_groups = sorted(set(tech_groups) - set(known_tech_groups))
if missing_tech_groups:
    logger.warning('Tech groups not found in crunchbase_category_groups: %s',
                   missing_tech_groups)

# Keep the rows that belong to at least one tech group.
ids = mlb_df[mlb_df[known_tech_groups].sum(axis=1) > 0].index

df = df.loc[ids]

In [40]:
# Show the DataFrame currently bound to `df` as the cell's output.
df

Unnamed: 0,id,name,category_groups_list
0,ae8f68d2-9319-f2c2-3549-4f1ac2851660,3D Printing,[Manufacturing]
1,76c672c1-ef33-72f0-8027-d747c8c6e4ba,3D Technology,"[Hardware, Software]"
2,a60e9792-884d-0696-63f7-a9c0f3100ded,A/B Testing,[Data and Analytics]
3,ad091ee0-0259-f129-7777-10f65213584d,Accounting,"[Financial Services, Professional Services]"
4,420ed687-9b30-4463-115e-e6150ad85aa0,Ad Exchange,[Advertising]
...,...,...,...
737,4acdae6a-217f-be4d-5f3d-6955756b92d5,Winery,[Food and Beverage]
738,1101e2fd-fb06-08a8-9567-97652bbbe2cf,Wired Telecommunications,[Messaging and Telecommunications]
739,9f8132a0-2620-1f89-a2d2-fadbb3f16b4c,Wireless,"[Hardware, Mobile]"
741,176f72b5-3733-b98e-7ccb-db49dc29ed4a,Wood Processing,[Manufacturing]
