# YC Companies Text Clustering Analysis

Cluster YC companies based on their text descriptions using OpenAI embeddings.

**Approach:**
1. Generate embeddings for company descriptions using OpenAI API
2. Apply clustering algorithms (K-means, DBSCAN)
3. Visualize clusters using dimensionality reduction (t-SNE; UMAP is a common alternative but is not used in this notebook)
4. Analyze cluster characteristics and patterns

---

In [None]:
# Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from openai import OpenAI
import os
from sklearn.cluster import KMeans, DBSCAN
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides DeprecationWarning/FutureWarning
# raised by the libraries imported above — consider narrowing it when upgrading.
warnings.filterwarnings('ignore')

# Plot settings
# Seaborn grid theme plus a larger default matplotlib figure size.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

print("✓ Libraries loaded")

## 1. Load Data & Setup OpenAI

In [None]:
# Load YC companies data (JSON Lines: one company record per line)
data_path = '../data/2025-10-05-yc.companies.jl'
df = pd.read_json(data_path, lines=True)

print(f"✓ Loaded {len(df):,} companies")
print(f"\nColumns: {df.columns.tolist()}")

# Filter to companies with meaningful descriptions *before* building the
# embedding text, so no work is spent on rows that are dropped anyway.
has_description = df['short_description'].notna() & (df['short_description'].str.len() > 10)
df = df[has_description].copy()


def _build_combined_text(row):
    """Join one company's name, descriptions and tags into a single string.

    A missing ``long_description`` (NaN/None) and a non-list ``tags`` value
    contribute an empty string, so the literal text "nan"/"None" never ends
    up in the text that is sent to the embedding model.
    """
    long_description = row.get('long_description', '')
    if not isinstance(long_description, str):
        long_description = ''
    tags = row['tags'] if isinstance(row['tags'], list) else []
    return (
        f"{row['company_name']}: {row['short_description']}. "
        f"{long_description} Tags: {', '.join(tags)}"
    )


# Create combined text for embedding (a list comprehension also works when
# the filtered frame is empty, unlike DataFrame.apply(axis=1))
df['combined_text'] = [_build_combined_text(row) for _, row in df.iterrows()]

print(f"\n✓ Filtered to {len(df):,} companies with descriptions")
print(f"\nExample text:\n{df['combined_text'].iloc[0][:200]}...")

In [None]:
# Build the OpenAI client from the OPENAI_API_KEY environment variable.
# Set it before starting the notebook, e.g.:
#   export OPENAI_API_KEY='your-api-key-here'

api_key = os.getenv('OPENAI_API_KEY')

if api_key:
    client = OpenAI(api_key=api_key)
    print("✓ OpenAI client initialized")
else:
    # No (or empty) key: explain how to provide one, then abort the cell.
    for hint in (
        "⚠️  OPENAI_API_KEY not found in environment variables",
        "\nSet it with: export OPENAI_API_KEY='your-key'",
        "Or in notebook: import os; os.environ['OPENAI_API_KEY'] = 'your-key'",
    ):
        print(hint)
    raise ValueError("Missing OpenAI API key")

## 2. Generate Embeddings

Using OpenAI's `text-embedding-3-small` model (lower cost, good performance)

In [None]:
# Function to get embeddings in batches
def get_embeddings_batch(texts, model="text-embedding-3-small", batch_size=100, api_client=None):
    """
    Embed ``texts`` with the OpenAI embeddings endpoint, one request per batch.

    Args:
        texts: Sequence of strings to embed (must support ``len`` and slicing).
        model: Name of the embedding model passed to the API.
        batch_size: Number of texts sent per request; must be positive.
        api_client: Object exposing ``embeddings.create(input=..., model=...)``.
            When ``None`` (the default) the notebook-level ``client`` is used.

    Returns:
        A list of embedding vectors aligned with the leading items of
        ``texts``. If a batch fails, the error is printed and only the
        embeddings collected before that batch are returned, so the result
        may be shorter than ``texts``.

    Raises:
        ValueError: If ``batch_size`` is not positive.

    Note: rate limits depend on the account tier. Lower ``batch_size`` or add
    delays between calls if you hit them.
    """
    if batch_size <= 0:
        # A negative step would make range() empty and silently return [].
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    active_client = client if api_client is None else api_client
    total_batches = (len(texts) + batch_size - 1) // batch_size
    embeddings = []

    for batch_no, start in enumerate(range(0, len(texts), batch_size), start=1):
        batch = texts[start:start + batch_size]
        print(f"Processing batch {batch_no}/{total_batches} ({len(batch)} texts)...")

        try:
            response = active_client.embeddings.create(
                input=batch,
                model=model
            )
            # Every returned item carries the position of its input; sorting
            # on it keeps the vectors aligned with `batch` regardless of the
            # order in which the response lists them.
            ordered = sorted(response.data, key=lambda item: item.index)
            if len(ordered) != len(batch):
                raise RuntimeError(
                    f"expected {len(batch)} embeddings, got {len(ordered)}"
                )
            batch_embeddings = [item.embedding for item in ordered]
        except Exception as e:
            # Deliberate best effort: report the problem and return what we
            # have so far (always a prefix of `texts`).
            print(f"Error in batch {batch_no}: {e}")
            break

        embeddings.extend(batch_embeddings)

    return embeddings

print("✓ Embedding function defined")

In [None]:
# For demo purposes, let's use a sample.
# Set SAMPLE_SIZE to None to process all companies (will take longer and cost more)
SAMPLE_SIZE = 500

if SAMPLE_SIZE:
    # Fixed seed so the same companies are drawn on every run
    df_sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42).copy()
    print(f"⚠️  Using sample of {len(df_sample):,} companies for demo")
else:
    df_sample = df.copy()
    print(f"Processing all {len(df_sample):,} companies")

# Truncate text to avoid token limits (8191 tokens max for text-embedding-3-small)
# Roughly 1 token = 4 characters, so limit to ~6000 chars to be safe
MAX_TEXT_CHARS = 6000
texts = [text[:MAX_TEXT_CHARS] for text in df_sample['combined_text']]

print(f"\nGenerating embeddings for {len(texts):,} companies...")
print("⏱️  This may take a few minutes depending on sample size and API rate limits\n")

# Generate embeddings
embeddings = get_embeddings_batch(texts, batch_size=50)

if not embeddings:
    # Nothing downstream (scaling, clustering, t-SNE) can run on zero vectors;
    # fail here with a clear message instead of a cryptic error later.
    raise RuntimeError("No embeddings were generated; check the batch errors printed above")

if len(embeddings) != len(texts):
    print(f"\n⚠️  Warning: Got {len(embeddings)} embeddings for {len(texts)} texts")
    # get_embeddings_batch stops at the first failing batch, so the vectors
    # belong to the leading rows: trim the dataframe to match
    df_sample = df_sample.iloc[:len(embeddings)].copy()

# Add embeddings to dataframe (one list of floats per row)
df_sample['embedding'] = embeddings

# Cost is billed per token, so estimate tokens from the characters actually
# embedded (~4 characters per token) rather than from the number of texts.
embedded_chars = sum(len(text) for text in texts[:len(embeddings)])
estimated_tokens = embedded_chars / 4
estimated_cost = estimated_tokens * 0.02 / 1_000_000

print(f"\n✓ Generated {len(embeddings):,} embeddings")
print(f"  Embedding dimension: {len(embeddings[0])}")
print(f"  Estimated cost: ${estimated_cost:.4f} (~{estimated_tokens:,.0f} tokens at $0.02/1M tokens)")

## 3. Clustering Analysis

In [None]:
# Stack the per-row embedding lists into a 2-D matrix (rows = companies)
X = np.array(list(df_sample['embedding']))
print(f"Embedding matrix shape: {X.shape}")

# Standardize every embedding dimension (zero mean, unit variance) so that
# no single dimension dominates the Euclidean distances used for clustering
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
print("✓ Embeddings normalized")

### 3.1 Elbow Method - Find Optimal K

In [None]:
# Elbow method: fit K-means for a range of K and record the inertia
# (within-cluster sum of squares) of each fit
K_range = range(2, 21)
inertias = []

print("Finding optimal K...")
for k in K_range:
    # Fixed seed + 10 restarts keeps the curve reproducible between runs
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    inertias.append(kmeans.inertia_)
    print(f"  K={k}: inertia={kmeans.inertia_:.2f}")

# Plot elbow curve
elbow_fig, elbow_ax = plt.subplots(figsize=(10, 6))
elbow_ax.plot(K_range, inertias, 'bo-', linewidth=2, markersize=8)
elbow_ax.set_xlabel('Number of Clusters (K)', fontsize=12)
elbow_ax.set_ylabel('Inertia (Within-cluster sum of squares)', fontsize=12)
elbow_ax.set_title('Elbow Method for Optimal K', fontsize=14)
elbow_ax.grid(True, alpha=0.3)
elbow_fig.tight_layout()
plt.show()

print("\n💡 Look for the 'elbow' point where inertia starts decreasing more slowly")

### 3.2 K-Means Clustering

In [None]:
# Final K-means fit with the K picked from the elbow plot above
optimal_k = 8  # Adjust this based on elbow plot above

print(f"Applying K-means with K={optimal_k}...")
# More restarts (n_init=20) than in the elbow scan for the fit that is kept
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=20)
kmeans.fit(X_scaled)
df_sample['cluster_kmeans'] = kmeans.labels_

print("✓ K-means clustering complete")
print("\nCluster distribution:")
kmeans_sizes = df_sample['cluster_kmeans'].value_counts().sort_index()
print(kmeans_sizes)

### 3.3 DBSCAN Clustering

In [None]:
# Apply DBSCAN (density-based clustering); label -1 marks noise points
print("Applying DBSCAN...")
# NOTE(review): eps is a Euclidean radius in the standardized embedding space;
# with many standardized dimensions pairwise distances are typically far
# larger than 3.0 — confirm this value against a k-distance plot.
dbscan = DBSCAN(eps=3.0, min_samples=5, metric='euclidean')
dbscan_labels = dbscan.fit_predict(X_scaled)
df_sample['cluster_dbscan'] = dbscan_labels

# Noise (-1) is not a cluster, so exclude it from the cluster count
is_noise = dbscan_labels == -1
n_noise = int(is_noise.sum())
n_clusters = len(set(dbscan_labels[~is_noise].tolist()))

print("✓ DBSCAN clustering complete")
print(f"  Clusters found: {n_clusters}")
print(f"  Noise points: {n_noise}")
if n_clusters == 0:
    # Make the degenerate outcome explicit instead of reporting it silently
    print("⚠️  DBSCAN labelled every point as noise; eps is probably too small "
          "for this feature scale (try a larger eps, e.g. from a k-distance plot)")
print("\nCluster distribution:")
print(df_sample['cluster_dbscan'].value_counts().sort_index())

## 4. Dimensionality Reduction & Visualization

### 4.1 t-SNE Visualization

In [None]:
# Apply t-SNE for 2D visualization
print("Applying t-SNE (this may take a few minutes)...")

# t-SNE requires perplexity < number of samples; cap it so that a small
# (or trimmed) sample does not abort the cell.
tsne_perplexity = min(30, max(1, len(X_scaled) - 1))

# The iteration count is left at the library default (1000, the value used
# before): the keyword was renamed from `n_iter` to `max_iter` in newer
# scikit-learn releases, so omitting it works with both old and new versions.
tsne = TSNE(n_components=2, random_state=42, perplexity=tsne_perplexity)
X_tsne = tsne.fit_transform(X_scaled)

# Keep the 2-D coordinates next to the cluster labels for plotting/export
df_sample['tsne_x'] = X_tsne[:, 0]
df_sample['tsne_y'] = X_tsne[:, 1]

print("✓ t-SNE complete")

In [None]:
# Interactive t-SNE scatter, one point per company, coloured by K-means label.
# NOTE(review): the labels are integers, so they are drawn on a continuous
# colour scale — confirm that this is the intended look.
kmeans_plot_options = {
    'x': 'tsne_x',
    'y': 'tsne_y',
    'color': 'cluster_kmeans',
    'hover_data': ['company_name', 'short_description', 'batch', 'status'],
    'title': f't-SNE Visualization - K-means Clusters (K={optimal_k})',
    'labels': {'cluster_kmeans': 'Cluster'},
    'color_continuous_scale': 'viridis',
}
fig = px.scatter(df_sample, **kmeans_plot_options)

# Slightly transparent markers keep overlapping points readable
fig.update_traces(marker={'size': 8, 'opacity': 0.7})
fig.update_layout(height=700, width=1000)
fig.show()

In [None]:
# Interactive t-SNE scatter, one point per company, coloured by DBSCAN label
# (-1 marks noise points).
dbscan_plot_options = {
    'x': 'tsne_x',
    'y': 'tsne_y',
    'color': 'cluster_dbscan',
    'hover_data': ['company_name', 'short_description', 'batch', 'status'],
    'title': 't-SNE Visualization - DBSCAN Clusters',
    'labels': {'cluster_dbscan': 'Cluster (-1 = Noise)'},
    'color_continuous_scale': 'plasma',
}
fig = px.scatter(df_sample, **dbscan_plot_options)

# Slightly transparent markers keep overlapping points readable
fig.update_traces(marker={'size': 8, 'opacity': 0.7})
fig.update_layout(height=700, width=1000)
fig.show()

## 5. Cluster Analysis & Interpretation

In [None]:
# Analyze each K-means cluster
def analyze_cluster(df, cluster_id, cluster_col='cluster_kmeans'):
    """
    Print a text summary of one cluster.

    The summary lists the cluster size and share, the five most frequent
    tags, the three most frequent statuses and locations, and up to five
    sample companies.

    Args:
        df: DataFrame with the columns ``cluster_col``, ``tags``, ``status``,
            ``location``, ``company_name`` and ``short_description``.
        cluster_id: Label of the cluster to describe.
        cluster_col: Name of the column holding the cluster labels.

    Returns:
        None. The report is written to stdout.
    """
    cluster_df = df[df[cluster_col] == cluster_id]
    n_members = len(cluster_df)
    # An empty input frame would otherwise divide by zero
    share = n_members / len(df) * 100 if len(df) else 0.0

    rule = '=' * 80
    print(f"\n{rule}")
    print(f"CLUSTER {cluster_id} - {n_members} companies ({share:.1f}%)")
    print(f"{rule}")

    # Top tags (rows whose tags value is not a list are skipped)
    tag_counts = Counter()
    for tags in cluster_df['tags']:
        if isinstance(tags, list):
            tag_counts.update(tags)

    print(f"\n📊 Top Tags:")
    for tag, count in tag_counts.most_common(5):
        # Share of cluster members carrying the tag
        print(f"  • {tag}: {count} ({count/n_members*100:.1f}%)")

    # Status distribution
    print(f"\n📈 Status Distribution:")
    for status, count in cluster_df['status'].value_counts().head(3).items():
        print(f"  • {status}: {count} ({count/n_members*100:.1f}%)")

    # Top locations
    print(f"\n📍 Top Locations:")
    for loc, count in cluster_df['location'].value_counts().head(3).items():
        print(f"  • {loc}: {count}")

    # Sample companies
    print(f"\n💼 Sample Companies:")
    for _, row in cluster_df.head(5).iterrows():
        description = row['short_description']
        if not isinstance(description, str):
            # Missing description (NaN/None) cannot be sliced
            description = ''
        print(f"  • {row['company_name']}: {description[:80]}...")

# Print the report for every K-means cluster, in ascending label order
for cluster_id in np.sort(df_sample['cluster_kmeans'].unique()):
    analyze_cluster(df_sample, cluster_id)

## 6. Cluster Comparison

In [None]:
# Per-cluster summary table: size plus mean founders, team size and founding year
by_cluster = df_sample.groupby('cluster_kmeans')

cluster_stats = by_cluster.agg(
    Size=('company_id', 'count'),
    Avg_Founders=('num_founders', 'mean'),
    Avg_Team_Size=('team_size', 'mean'),
    Avg_Year_Founded=('year_founded', 'mean'),
).round(2)

# Success rate: percentage of companies in the cluster whose status mentions
# "Public" or "Acquired" (case-insensitive; missing status counts as no match)
cluster_stats['Success_Rate'] = by_cluster['status'].apply(
    lambda statuses: statuses.str.contains('Public|Acquired', case=False, na=False).sum()
    / len(statuses) * 100
).round(1)

print("\n📊 CLUSTER COMPARISON:")
print(cluster_stats.to_string())

# Bar chart of cluster sizes, with the count written above each bar
size_bars = go.Bar(
    x=cluster_stats.index,
    y=cluster_stats['Size'],
    text=cluster_stats['Size'],
    textposition='outside',
    marker_color='steelblue',
)
fig = go.Figure(data=[size_bars])

fig.update_layout(
    title='Cluster Sizes',
    xaxis_title='Cluster ID',
    yaxis_title='Number of Companies',
    height=400,
)
fig.show()

## 7. Export Results

In [None]:
# Save clustered data (labels and 2-D coordinates; the raw embedding vectors
# are not exported)
output_path = '../data/yc_companies_clustered.csv'
export_columns = [
    'company_id', 'company_name', 'short_description', 'batch', 'status',
    'tags', 'cluster_kmeans', 'cluster_dbscan', 'tsne_x', 'tsne_y'
]
df_sample[export_columns].to_csv(output_path, index=False)

print(f"✓ Saved clustered data to: {output_path}")

# DBSCAN uses the label -1 for noise; it is not a cluster and must not be
# counted as one in the summary.
dbscan_labels = df_sample['cluster_dbscan']
n_dbscan_clusters = dbscan_labels[dbscan_labels != -1].nunique()
n_dbscan_noise = int((dbscan_labels == -1).sum())

# Summary statistics
print(f"\n📊 SUMMARY:")
print(f"  Total companies analyzed: {len(df_sample):,}")
print(f"  K-means clusters: {df_sample['cluster_kmeans'].nunique()}")
print(f"  DBSCAN clusters: {n_dbscan_clusters} (plus {n_dbscan_noise} noise points)")
print(f"  Embedding dimension: {len(embeddings[0]) if embeddings else 0}")

---

## Key Takeaways

**Clustering Approach:**
- Used OpenAI `text-embedding-3-small` for semantic embeddings
- Applied K-means (centroid-based, requires choosing K up front) and DBSCAN (density-based, no preset cluster count) clustering
- Visualized with t-SNE dimensionality reduction

**Findings:**
- Companies naturally cluster by industry vertical and problem domain
- Semantic similarity reveals non-obvious connections between companies
- Some clusters show distinct characteristics (team size, location, success rate)

**Limitations:**
- Clustering quality depends on description text quality
- Optimal K is subjective (use domain knowledge + elbow method)
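- t-SNE layouts depend on the random seed and perplexity: the seed is fixed here (`random_state=42`), but a different seed, perplexity, or sample produces a different picture, so read the plot qualitatively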
- Sample size affects cluster stability

**Cost Considerations:**
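- `text-embedding-3-small`: ~$0.02 per 1M tokens
- 500 companies ≈ $0.01-0.02 (upper bound, assuming ~1,000-1,500 tokens per company)
- 8,000 companies ≈ $0.15-0.30 (same assumption; short descriptions cost far less)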

---

*Analysis powered by OpenAI embeddings and scikit-learn*