In [1]:
# YouTube Data API key. Prefer exporting YOUTUBE_API_KEY in the environment so the
# secret is never saved inside the notebook. If the variable is unset, the `...`
# placeholder below is used and must be replaced with a real key before running.
import os

API_KEY = os.environ.get("YOUTUBE_API_KEY", ...)

In [75]:
import os
import google_auth_oauthlib.flow
import googleapiclient.discovery
import pandas as pd

In [9]:
# Set up the YouTube API client
# Builds a client for the 'youtube' service, version 'v3', using API_KEY as the
# developer key. The resulting `youtube` object is what search_videos_by_query
# issues its search requests through.
youtube = googleapiclient.discovery.build('youtube', 'v3', developerKey=API_KEY)

In [13]:
from urllib.error  import HTTPError
def search_videos_by_query(query_, results_num=100, client=None):
  """Run a paginated YouTube video search and return the raw result items.

  Args:
    query_: Search string passed as ``q``. It is also printed once, as a
      simple progress indicator.
    results_num: Upper bound on the number of items to collect.
    client: Object exposing ``search().list(...).execute()``. Defaults to the
      notebook-level ``youtube`` client, so existing calls keep working while
      the function no longer *requires* that hidden global.

  Returns:
    A list with at most ``results_num`` entries taken from each response's
    ``'items'`` list, in the order the pages were returned.
  """
  if client is None:
      client = youtube  # fall back to the client built earlier in the notebook

  max_results = results_num
  videos = []
  next_page_token = None

  print(query_)

  while len(videos) < max_results:
      request = client.search().list(
          q=query_,
          part='id,snippet',
          type='video',
          # Ask for at most 50 per page, and never more than is still needed.
          maxResults=min(50, max_results - len(videos)),
          pageToken=next_page_token
      )
      response = request.execute()
      # A response without an 'items' key contributes nothing instead of raising.
      videos += response.get('items', [])
      next_page_token = response.get('nextPageToken')

      # No token means there are no further pages to fetch.
      if not next_page_token:
          break

  return videos


In [14]:
# Search for videos related to Y Combinator applications
def search_ycombinator_videos(batches = ("W22", "S22", "W23", "S23")):
    """Collect search results for every batch label in ``batches``.

    Each label is combined with four fixed query prefixes ("<prefix> <label>"),
    each query is run through ``search_videos_by_query`` with ``results_num=200``,
    and the hits are appended to that label's list in prefix order.

    Returns:
        A dict mapping each batch label to the concatenated list of results.
    """
    query_prefixes = (
        "YC application",
        "Y combinator application",
        "YC",
        "Y Combinator",
    )
    hits_by_batch = {label: [] for label in batches}

    for label in batches:
        for prefix in query_prefixes:
            hits_by_batch[label] += search_videos_by_query(
                " ".join((prefix, label)), results_num=200
            )
    return hits_by_batch

In [1]:
# season_to_videos = search_ycombinator_videos()

In [1]:
import json

# Load the saved search results from 'season_to_videos.json' instead of querying
# the API again. The next cell iterates season_to_videos.items() and concatenates
# the values, so the file is expected to hold a JSON object whose values are lists.
with open('season_to_videos.json', 'r') as json_file:
    season_to_videos = json.load(json_file)

In [90]:
# Flatten the per-season lists into a single list, preserving dict iteration order.
videos = [
    video
    for season_videos in season_to_videos.values()
    for video in season_videos
]

In [91]:
def longest_common_substring(str1, str2):
    """Return the longest run of consecutive elements shared by both inputs.

    Uses the common-suffix dynamic programme: for each pair of positions the
    length of the longest common suffix ending there is derived from the
    diagonal neighbour. Only the previous row is kept between iterations.

    When several substrings tie for the maximum length, the one found first
    while scanning ``str1`` left to right (and ``str2`` left to right within
    that) is returned. An empty slice of ``str1`` is returned if nothing matches.
    """
    width = len(str2) + 1
    best_len = 0
    best_end = 0  # exclusive end index of the best match inside str1

    prev_row = [0] * width
    for i, left in enumerate(str1, start=1):
        row = [0] * width
        for j, right in enumerate(str2, start=1):
            if left != right:
                continue  # suffix length stays 0 for a mismatch
            row[j] = prev_row[j - 1] + 1
            if row[j] > best_len:
                best_len = row[j]
                best_end = i
        prev_row = row

    return str1[best_end - best_len:best_end]

In [92]:
import string

# Batch labels plus generic YC / application / video words, in the exact
# capitalisations listed.
# NOTE(review): STOP_WORDS is not referenced by any other code visible in this notebook.
STOP_WORDS = ["W22",
              "S22",
              "W23",
              "S23",
              "YC",
              "Y",
              "application",
              "combinator",
              "Application",
              "Combinator",
              "video",
              "Video"]

def parse_batch(title, descr):
  """Guess the YC batch label mentioned in a video's title or description.

  The title and description are concatenated and searched case-sensitively.
  An explicit label wins, checked in the order W22, S22, W23, S23. Otherwise,
  for each two-digit year (22, then 23) found in the text, the first of
  "Summer", "Winter", "S", "W" that also occurs decides the season letter.

  Returns:
    A label such as "W22" or "S23", or "Unk" when nothing matches.
  """
  text = title + descr

  explicit = next((label for label in ("W22", "S22", "W23", "S23") if label in text), None)
  if explicit is not None:
    return explicit

  for year in ("22", "23"):
    if year not in text:
      continue
    for season in ("Summer", "Winter", "S", "W"):
      if season in text:
        return season[0] + year

  return "Unk"


def get_data_from_videos(videos):
  """Convert raw search items into column lists, keeping each video id once.

  Args:
    videos: Iterable of search result dicts. Each must provide
      ``['id']['videoId']`` and a ``'snippet'`` dict with ``'title'``,
      ``'description'``, ``'channelTitle'`` and ``'publishTime'``.

  Returns:
    A dict of equal-length lists keyed by "id", "title", "link",
    "description", "channel_title", "publish_time" and "batch". Only the first
    occurrence of each video id is kept; "batch" comes from ``parse_batch``.
  """
  processed_videos = {"id":[],
                      "title":[],
                      "link":[],
                      "description":[],
                      "channel_title":[],
                      "publish_time":[],
                      "batch":[]
                      }
  # A set gives constant-time duplicate checks; scanning the "id" list for
  # every video made de-duplication quadratic in the number of results.
  seen_ids = set()
  for video in videos:
    video_id = video['id']['videoId']
    if video_id in seen_ids:
      continue
    seen_ids.add(video_id)

    snippet = video['snippet']
    processed_videos["id"].append(video_id)
    processed_videos["link"].append("https://www.youtube.com/watch?v=" + video_id)
    processed_videos["title"].append(snippet['title'])
    processed_videos["description"].append(snippet['description'])
    processed_videos["channel_title"].append(snippet['channelTitle'])
    processed_videos["publish_time"].append(snippet['publishTime'])
    processed_videos["batch"].append(parse_batch(snippet['title'], snippet['description']))
  return processed_videos

In [93]:
import pandas as pd

# Tabulate the de-duplicated videos: one row per video id, one column per key
# of the dict returned by get_data_from_videos.
videos_data = pd.DataFrame(get_data_from_videos(videos))

In [112]:
# Load the company records from 'combined_companies_data.json'. Later cells read
# 'name' and 'former_names' from this frame.
# NOTE(review): the later 'batch_x' / 'batch_y' comparison implies it also has a
# 'batch' column — confirm against the file.
yc_data = pd.read_json('combined_companies_data.json')

Get successful applications

In [349]:
def check_name(row):
    """Tell whether the row's company is mentioned in its ``all_info`` text.

    The comparison is a case-insensitive substring test. The current ``name``
    is tried first; ``former_names`` is only consulted when that fails.

    Returns:
        True if the name or any former name occurs in ``all_info``, else False.
    """
    needle = row['name'].lower()
    haystack = row['all_info'].lower()
    if needle in haystack:
        return True

    return any(alias.lower() in haystack for alias in row['former_names'])

# Text searched for a company name: the video title and channel title joined by a space.
videos_data["all_info"] = videos_data.apply(lambda row: ' '.join([row["title"], row["channel_title"]]), axis=1)
# Both frames get a constant 'join' key so the inner merge below pairs every video
# row with every company row. This adds the column to both source frames in place.
videos_data['join'] = 1
yc_data['join'] = 1
success_applications = pd.merge(videos_data, yc_data, on='join', how='inner')
# Flag the pairs where check_name finds the company's name or a former name in all_info.
success_applications['is_substr'] = success_applications.apply(check_name, axis=1)
result = success_applications[success_applications['is_substr']]
result = result.drop(columns=['is_substr', 'join'])
# Exclude rows whose company name is 'Y Combinator' and rows whose channel is 'Y Combinator'.
result = result[result['name'] != 'Y Combinator']
result = result[result['channel_title'] != 'Y Combinator']

In [401]:
# Keep only matches where the two batch columns agree. 'batch_x' / 'batch_y' are the
# merge-suffixed 'batch' columns of the left (videos_data) and right (yc_data) frames.
result = result[result['batch_x'] == result['batch_y']]

In [405]:
# Write the matched video/company rows to 'accepted_videos.xlsx'.
result.to_excel("accepted_videos.xlsx")

Get unsuccessful applications

In [203]:
# Distinct channel titles that appear in the matched (accepted) rows.
accepted = set(result['channel_title'].to_list())

In [308]:
# Videos whose channel title is not among the accepted channels.
not_accepted = videos_data[~videos_data['channel_title'].isin(accepted)]

In [309]:
# Drop videos for which parse_batch could not determine a batch ('Unk').
not_accepted = not_accepted[not_accepted['batch'] != 'Unk']

In [310]:
import string

# Lower-case filler tokens that proces_title strips from a video title.
words_to_remove = [
    "yc", "combinator", "y", "c", "demo", "video",
    "application", "for", "founders", "team", "pitch",
]
# Add every season/year spelling ("s22", "winter2023", ...). The empty season and
# empty year are included, so bare seasons, bare years and "" are listed too.
words_to_remove += [
    season_tag + year_tag
    for season_tag in ("s", "w", "", "summer", "winter")
    for year_tag in ("22", "23", "2022", "2023", "")
]

def proces_title(title):
    """Reduce a video title to a candidate company name.

    Punctuation characters are deleted, the text is split on whitespace, and
    tokens are kept only if they are at least three characters long and their
    lower-case form is not in ``words_to_remove``.

    Returns:
        The kept tokens joined by single spaces, or "" when more than three
        tokens survive.
    """
    cleaned = ''.join(ch for ch in title if ch not in string.punctuation)
    kept = [
        token
        for token in cleaned.split()
        if len(token) >= 3 and token.lower() not in words_to_remove
    ]
    return ' '.join(kept) if len(kept) <= 3 else ""
            


In [311]:
# Derive a candidate company name from each title. `assign` returns a new frame
# rather than writing a column into `not_accepted`, which is a filtered slice of
# `videos_data`; in-place column assignment on such a slice triggers pandas'
# SettingWithCopyWarning.
not_accepted = not_accepted.assign(name=not_accepted['title'].apply(proces_title))

In [312]:
# Keep only rows whose extracted name is at least three characters long.
not_accepted = not_accepted[not_accepted['name'].str.len() >= 3]

In [313]:
# Write the unmatched videos with a usable extracted name to 'not_accepted.xlsx'.
not_accepted.to_excel("not_accepted.xlsx")

Not accepted with website in description

In [314]:
# Keep only rows whose description contains a website hint: ".com", "www" or "http".
not_accepted = not_accepted[
    not_accepted['description'].apply(
        lambda text: any(marker in text for marker in (".com", "www", "http"))
    )
]

In [316]:
# Write the website-filtered subset to 'not_accepted_with_website.xlsx'.
not_accepted.to_excel("not_accepted_with_website.xlsx")