### Imports 

In [15]:
import pandas as pd
import os
import numpy as np
from collections import Counter
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tqdm import tqdm


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nilanshadargan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nilanshadargan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Data Loading

In [10]:
# Relative path of the input CSV that is read into `ycd` by pd.read_csv.
ycd_data = 'Data/2023-07-13-yc-companies.csv'

In [11]:
# Load the CSV at `ycd_data` into a DataFrame.
ycd = pd.read_csv(ycd_data)

In [12]:
# Convert the 'tags' column from stringified lists into real Python lists.
def _tags_to_list(raw_tags):
    """Parse a value that is a string starting with '['; return anything else unchanged."""
    looks_like_list = isinstance(raw_tags, str) and raw_tags.startswith('[')
    if looks_like_list:
        return ast.literal_eval(raw_tags)
    return raw_tags


ycd['tags'] = ycd['tags'].apply(_tags_to_list)

In [59]:
# Collect the companies tagged 'Artificial Intelligence', one row per company name.
def _has_ai_tag(tags):
    """Return True only when `tags` is a list containing 'Artificial Intelligence'."""
    return isinstance(tags, list) and 'Artificial Intelligence' in tags


# .copy() makes ai_companies an independent frame rather than a slice of ycd,
# so the column assignment performed on it later does not trigger pandas'
# SettingWithCopyWarning.
ai_companies = (
    ycd[ycd['tags'].apply(_has_ai_tag)]
    .drop_duplicates(subset=['company_name'])
    .copy()
)
print(ai_companies)


      company_id company_name  \
2          28409      BerriAI   
9          28367    Atri Labs   
22         28183        Metal   
44         28114    BabylonAI   
49         28089       Thread   
...          ...          ...   
4386       27871       Extend   
4390       27860      Dream3D   
4414       27822     Zenfetch   
4432       27763       Double   
4617       26825   Flair Labs   

                                      short_description  \
2                Stop OpenAI Errors w/ 1 line of code 👈   
9       Open-source web framework for Python developers   
22             Machine learning embeddings as a service   
44         Datadog for machine learning on edge devices   
49     Incident Management platform for large enterp...   
...                                                 ...   
4386         AI-Powered Workflows for Unstructured Data   
4390                               AI-powered 3D Design   
4414  Real time call intelligence for technical sale...   
4432       Th

In [60]:
def scrape_website(url, company_name, timeout=10):
    """Download a web page and return its visible text as a single string.

    Parameters
    ----------
    url : str
        Address of the page to fetch. A value that is not a non-empty string
        (for example None, or the NaN pandas uses for a missing cell) is
        treated as a failed retrieval and no request is sent.
    company_name : str
        Used only in the progress message printed after a successful fetch.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Defaults to 10 (seconds).

    Returns
    -------
    str
        The page's stripped text fragments joined by single spaces, or the
        sentinel ``'Failed to retrieve'`` when the URL is unusable, the request
        raises a ``requests.RequestException`` (including timeouts), or the
        server replies with an HTTP error status.
    """
    # Missing websites arrive as NaN/None; skip them instead of issuing a request.
    if not isinstance(url, str) or not url.strip():
        return 'Failed to retrieve'

    try:
        # Without a timeout one unresponsive host would block the whole run.
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures so error pages are not stored as
        # if they were the company's own text.
        response.raise_for_status()
    except requests.RequestException:
        return 'Failed to retrieve'

    soup = BeautifulSoup(response.content, 'html.parser')
    # Print the company name
    print(f"Scraping website for {company_name}")
    return ' '.join(soup.stripped_strings)




# Scrape each company's website row by row and store the text in 'Scraped_Info'.
def _scrape_row(row):
    """Call scrape_website with the row's 'website' and 'company_name' values."""
    return scrape_website(row['website'], row['company_name'])


ai_companies['Scraped_Info'] = ai_companies.apply(_scrape_row, axis=1)

# Inactive alternative kept for reference:
# # Create a new column for the scraped information
# ai_companies['Scraped_Info'] = ai_companies['website'].progress_apply(scrape_website)

Scraping website for Atri Labs
Scraping website for Metal
Scraping website for BabylonAI
Scraping website for Thread
Scraping website for MagnaPlay
Scraping website for Waveline
Scraping website for UpTrain AI
Scraping website for Defog.ai
Scraping website for 222
Scraping website for Hadrius
Scraping website for Fintool
Scraping website for Orchid
Scraping website for Booth AI
Scraping website for Wild Moose
Scraping website for EzDubs
Scraping website for Magicflow 𐂂
Scraping website for Anarchy
Scraping website for langfuse
Scraping website for Chart
Scraping website for Buildt
Scraping website for Bluebirds
Scraping website for Magik
Scraping website for PoplarML
Scraping website for JustPaid.io
Scraping website for Rubber Ducky Labs
Scraping website for pyq
Scraping website for Linum
Scraping website for Layup
Scraping website for Chima
Scraping website for AlphaWatch AI
Scraping website for Ivy
Scraping website for Polymath Robotics
Scraping website for Integrated Reasoning
Scrap

In [34]:
# Keep only the rows whose 'Scraped_Info' is present and is not the
# 'Failed to retrieve' sentinel.
scrape_succeeded = ai_companies['Scraped_Info'] != 'Failed to retrieve'
has_scraped_text = ai_companies['Scraped_Info'].notna()
scraped_ai = ai_companies[scrape_succeeded & has_scraped_text]


In [55]:
# Keep a single row per company name, then show the result.
dedup_columns = ['company_name']
scraped_ai = scraped_ai.drop_duplicates(subset=dedup_columns)
print(scraped_ai)

# Write the de-duplicated frame to disk without the index column.
scraped_ai.to_csv('scraped_ai.csv', index=False)

      company_id company_name  \
9          28367    Atri Labs   
22         28183        Metal   
44         28114    BabylonAI   
49         28089       Thread   
72         27989    MagnaPlay   
...          ...          ...   
4382       27874       Squack   
4386       27871       Extend   
4414       27822     Zenfetch   
4432       27763       Double   
4617       26825   Flair Labs   

                                      short_description  \
9       Open-source web framework for Python developers   
22             Machine learning embeddings as a service   
44         Datadog for machine learning on edge devices   
49     Incident Management platform for large enterp...   
72                         AI-powered game translation.   
...                                                 ...   
4382         Natural language RPA tools for accountants   
4386         AI-Powered Workflows for Unstructured Data   
4414  Real time call intelligence for technical sale...   
4432       Th

## Modifications and data cleaning

In [56]:
# Keep only rows whose scraped text is at least 100 characters long.
scraped_text_length = scraped_ai['Scraped_Info'].str.len()
filtered_data = scraped_ai[scraped_text_length >= 100]

In [57]:
# Display the filtered frame as this cell's output.
filtered_data

Unnamed: 0,company_id,company_name,short_description,long_description,batch,status,tags,location,country,year_founded,num_founders,founders_names,team_size,website,cb_url,linkedin_url,Scraped_Info
9,28367,Atri Labs,Open-source web framework for Python developers,Atri Labs is Vercel for Python developers. \r\...,W23,Active,"[Artificial Intelligence, Developer Tools, Ope...",San Francisco,US,2022.0,2,"['Darshita Chaturvedi', 'Shyam Swaroop']",2.0,https://atrilabs.com,,https://www.linkedin.com/company/atri-labs/,AI Insurance by Atri Insurance Solutions Polic...
22,28183,Metal,Machine learning embeddings as a service,Metal does machine learning embeddings as a se...,W23,Active,"[Artificial Intelligence, Machine Learning, B2...",New York,US,2023.0,3,"['Taylor Lowe', ""James O'Dwyer"", 'Sergio Prada']",3.0,https://getmetal.io/,,https://www.linkedin.com/company/getmetal/about/,AI for Financial Analysts | Metal metal. Secur...
49,28089,Thread,Incident Management platform for large enterp...,Thread is an incident management tool for larg...,W23,Active,"[Artificial Intelligence, Developer Tools, B2B...",San Francisco,US,2023.0,3,"['Yuheng Wang', 'Harsha Vankayalapati', 'Akeem...",3.0,http://www.usethread.io,https://www.crunchbase.com/organization/thread...,https://www.linkedin.com/company/thread-incorp...,Customer Success Launch Home Contact The Lifel...
72,27989,MagnaPlay,AI-powered game translation.,MagnaPlay is game translation simplified into ...,W23,Active,"[Artificial Intelligence, SaaS, Gaming]",San Francisco,US,2023.0,2,"['Pedro Esteves', 'Paulo Rodrigues']",3.0,https://magnaplay.com,,https://www.linkedin.com/company/magnaplay/,MagnaPlay Localizing games should be child's p...
92,27966,UpTrain AI,Open source Datadog for Machine Learning,UpTrain is an open-source tool to improve AI m...,W23,Active,"[Artificial Intelligence, Developer Tools, Gen...",San Francisco,US,2022.0,2,"['Sourabh Agrawal', 'Shikha Mohanty']",4.0,https://uptrain.ai/,,https://www.linkedin.com/company/uptrain-ai,UpTrain | Full-Stack LLMOps Platform 📝 Read ou...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4218,28743,Leafpress,ESG Software for Real Estate,Leafpress simplifies ESG for real estate portf...,S23,Active,"[Artificial Intelligence, Real Estate, B2B, Pr...",San Francisco,US,2023.0,2,"['Devishi Jha', 'Jianna Liu']",2.0,https://www.leafpress.io,,,Leafpress Enterprise solution Team Why us FAQ ...
4243,28301,Persana AI,Intelligent sales copilot powered by fine tune...,Persana AI is an intelligent sales copilot pow...,W23,Active,"[Artificial Intelligence, Generative AI, Sales]",San Francisco,US,2023.0,2,"['Sriya Maram', 'Rush Shahani']",2.0,https://www.persana.ai/,,https://www.linkedin.com/company/persana-ai/?v...,Persana AI | Supercharge your Prospecting with...
4382,27874,Squack,Natural language RPA tools for accountants,Squack gives accounting firms a RPA tool that ...,W23,Active,"[Artificial Intelligence, Developer Tools, Saa...",San Francisco,US,2022.0,3,"['Preston Zhou', 'Elijah Sorey', 'Eitan Borgnia']",3.0,http://squack.io,,,Computron by Squack Blog Build data workflows ...
4432,27763,Double,The IDE for prompt based no-code automations,Double is an IDE for prompt based no-code auto...,W23,Active,"[Artificial Intelligence, Robotic Process Auto...","Redwood City, CA",US,2022.0,2,"['Wesley Yue', 'Gonzalo Espinoza Graham']",2.0,https://usedouble.com,,https://www.linkedin.com/company/double-ai,Double | Find and convert leads with hyper-tar...


In [58]:
# Write filtered_data to 'filtered_data.csv' without the index column.
filtered_data.to_csv('filtered_data.csv', index=False)