In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv

In [2]:
#create a list of all the yc batches
batches = ['s05', 'w06', 's06', 'w07', 's07', 'w08', 's08', 'w09', 's09', 'w10', 's10', 'w11', 's11', 'w12', 's12', 'w13', 's13', 'w14', 's14', 'w15', 's15', 'w16', 's16', 'w17', 's17', 'w18', 's18', 'w19', 's19', 'w20']

In [3]:
def get_companies(batch):
    url = f'https://www.ycdb.co/batch/{batch}'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table')
    companies = []
    # Extract the rows from the table
    rows = table.find_all('tr')

    for row in rows:
        # Extract company name and growth score from each row
        columns = row.find_all('td')
        if columns:
            companies.append(name_clean(columns[0].a.text.strip()))

    return companies

def name_clean(name):
    #replace all spaces with dash and lowercase the name
    name = name.replace(' ', '-').lower()
    return name

In [4]:
get_companies('s05')

['clickfacts',
 'infogami',
 'kiko',
 'loopt',
 'memamp',
 'parakey',
 'posthaven',
 'reddit',
 'simmery',
 'textpayme',
 'y-combinator']

In [5]:
# for each batch, get all the companies, and add to the master list
all_companies = []
for batch in batches:
    companies = get_companies(batch)
    all_companies.extend(companies)


In [6]:
len(all_companies)

2206

In [21]:
used = set()
broken = set()
companies = []

In [19]:
def get_company_info(soup):
    company_info = {}

    # Scrape the company name
    company_name = soup.find('h1').get_text(strip=True)
    company_info['Company Name'] = company_name

    # Find the status of the company
    status_badges = soup.find('span', class_='badge')
    statuses = status_badges.get_text(strip=True)
    company_info['Status'] = statuses

    # Consolidate card information into a single dictionary
    keys = soup.find_all('h6', class_='text-secondary')
    for key in keys:
        value_container = key.find_next('div', class_='badge')
        if value_container:
            key_text = key.get_text(strip=True).split(':')[0]
            value_text = value_container.get_text(strip=True)
            company_info[key_text] = value_text

    # Find batch, founded year, and location
    details = soup.find_all('p', class_='lighter')
    for detail in details:
        text = detail.get_text(strip=True)
        key, _, value = text.partition(':')
        company_info[key.strip()] = value.strip()

    return company_info

In [22]:
for company in all_companies:
    if company not in used and company not in broken:
        req = requests.get(f"https://www.ycdb.co/company/{company}")
        if req.status_code != 200:
            broken.add(company)
        else:
            companies.append(get_company_info(BeautifulSoup(req.text, 'html.parser')))
            used.add(company)
        print("used: ", len(used), "broken: ", len(broken))

used:  1 broken:  0
used:  2 broken:  0
used:  3 broken:  0
used:  4 broken:  0
used:  5 broken:  0
used:  6 broken:  0
used:  7 broken:  0
used:  8 broken:  0
used:  9 broken:  0
used:  10 broken:  0
used:  11 broken:  0
used:  12 broken:  0
used:  13 broken:  0
used:  14 broken:  0
used:  15 broken:  0
used:  16 broken:  0
used:  17 broken:  0
used:  18 broken:  0
used:  19 broken:  0
used:  20 broken:  0
used:  21 broken:  0
used:  22 broken:  0
used:  23 broken:  0
used:  24 broken:  0
used:  25 broken:  0
used:  26 broken:  0
used:  27 broken:  0
used:  28 broken:  0
used:  29 broken:  0
used:  30 broken:  0
used:  31 broken:  0
used:  32 broken:  0
used:  33 broken:  0
used:  34 broken:  0
used:  35 broken:  0
used:  36 broken:  0
used:  37 broken:  0
used:  38 broken:  0
used:  39 broken:  0
used:  40 broken:  0
used:  41 broken:  0
used:  42 broken:  0
used:  42 broken:  1
used:  43 broken:  1
used:  44 broken:  1
used:  45 broken:  1
used:  46 broken:  1
used:  47 broken:  1
u

In [14]:
companies

[{'Company Name': 'Clickfacts',
  'Status': 'Exited',
  'Alexa Rank': '5,337,279',
  'Domain Authority': '25',
  'Domains Linking': '119',
  'Twitter Followers': '14',
  'Tweet Count': '3',
  'LinkedIn Followers': '9',
  'Funding': '$0.8m',
  'Growth Score': '-4',
  'Batch': 'S05',
  'Category': 'Other SaaS',
  'Location': 'San Francisco, California, United States'},
 {'Company Name': 'Infogami',
  'Status': 'Dead',
  'Domain Authority': '45',
  'Domains Linking': '3,020',
  'Batch': 'S05',
  'Category': 'Other SaaS'},
 {'Company Name': 'Kiko',
  'Status': 'Exited',
  'Alexa Rank': '2,143,990',
  'Domain Authority': '46',
  'Domains Linking': '1,230',
  'Funding': '$0.1m',
  'Growth Score': '3',
  'Exit Value': '$0.3m',
  'Batch': 'S05',
  'Category': 'Other SaaS',
  'Location': 'Cambridge, Massachusetts, United States'},
 {'Company Name': 'Loopt',
  'Status': 'Exited',
  'Alexa Rank': '7,399,811',
  'Domain Authority': '61',
  'Domains Linking': '3,686',
  'Twitter Followers': '381',


In [23]:
df = pd.DataFrame(companies)
df.to_csv('yc_companies.csv', index=False)

In [24]:
df.head()

Unnamed: 0,Company Name,Status,Alexa Rank,Domain Authority,Domains Linking,Twitter Followers,Tweet Count,LinkedIn Followers,Funding,Growth Score,Batch,Category,Location,Exit Value,Employees,Founded,Facebook Likes,Product Hunt Votes
0,Clickfacts,Exited,5337279.0,25,119,14.0,3.0,9.0,$0.8m,-4.0,S05,Other SaaS,"San Francisco, California, United States",,,,,
1,Infogami,Dead,,45,3020,,,,,,S05,Other SaaS,,,,,,
2,Kiko,Exited,2143990.0,46,1230,,,,$0.1m,3.0,S05,Other SaaS,"Cambridge, Massachusetts, United States",$0.3m,,,,
3,Loopt,Exited,7399811.0,61,3686,381.0,496.0,484.0,$39.1m,-3.0,S05,Entertainment,"Mountain View, California, United States",$43.4m,15.0,2005.0,,
4,Memamp,Dead,,3,3,,,,,,S05,,,,,,,


In [25]:
broken

{'80,000-hours',
 'anywhere.fm',
 'apollo.io',
 'bop.fm',
 'bus.com',
 'click-&-grow',
 'daily.co',
 'delt.ai',
 'demeanor.co',
 'disclosures.io',
 'documents.me',
 'estimote,-inc.',
 'everyday.me',
 'evo.do',
 'exosonic,-inc.',
 'flower-co.',
 'fly.io',
 'glowing.io',
 'handle.com',
 'instapath-inc.',
 'jerry,-inc.',
 'l.',
 'leon-&-george',
 "let's-do-this",
 'lively,-inc.',
 'lowkey.gg',
 'lucy-goods,-inc',
 'maitian.ai',
 'manycore.io',
 'mipos.dev',
 'miso',
 'mosaix.ai',
 'mystery.org',
 'names-&-faces',
 'neptune.io',
 "newman's",
 'numericcal,-inc.',
 'observe.ai',
 "osh's-affordable-pharmaceuticals",
 'people-&-pages',
 'people.ai',
 'percept.ai',
 'pit.ai',
 "players'-lounge",
 'post.fm',
 'qulture.rocks',
 'rejuvenation-technologies-inc.',
 'repl.it',
 'sails-co.',
 'scaphold.io',
 'screenleap,-inc.',
 'sleepwell.ai',
 'stoic.',
 'stories-inc.',
 'treble.ai',
 'turing-labs-inc.',
 'vendr.com',
 'veryfi,-inc.',
 'vida-&-co.',
 'vote.org',
 'wheelys-caf√©',
 'wit.ai',
 'women.