# Semiconductor Equipment Market Analysis
## Exploratory Data Analysis of CSET Semiconductor Dataset

This notebook explores market share data for semiconductor equipment providers across different equipment categories and countries.

In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from owid.catalog import Dataset

# Set plotting style
# 'ggplot' is one of matplotlib's bundled style sheets; applying it here changes the
# defaults for every figure created in the cells below.
plt.style.use('ggplot')
# IPython line magic (only valid inside an IPython/Jupyter kernel): render figures
# inline in the cell output.
%matplotlib inline

## Load Data

In [None]:
# Load garden dataset (has harmonized country names)
#
# The dataset lives inside a local ETL data directory. Its root can be overridden with
# the ETL_DATA_DIR environment variable so the notebook also runs on machines other than
# the original author's; when the variable is unset the original location is used, so
# existing runs behave exactly as before.
ETL_DATA_DIR = Path(os.environ.get("ETL_DATA_DIR", "/Users/veronikasamborska/etl/data")).expanduser()
GARDEN_DATASET_DIR = ETL_DATA_DIR / "garden" / "artificial_intelligence" / "2025-11-05" / "semiconductors_cset"

# Fail early with an actionable message instead of a lower-level error from the catalog reader.
if not GARDEN_DATASET_DIR.exists():
    raise FileNotFoundError(
        f"Garden dataset not found at {GARDEN_DATASET_DIR}. "
        "Set the ETL_DATA_DIR environment variable to your local ETL data directory."
    )

ds_garden = Dataset(str(GARDEN_DATASET_DIR))
# reset_index() moves any index levels into ordinary columns; the result is wrapped in a
# plain pandas DataFrame (read() presumably returns a catalog table type — TODO confirm).
df_garden = pd.DataFrame(ds_garden.read("semiconductors_cset").reset_index())

print(f"Garden dataset shape: {df_garden.shape}")
print(f"\nColumns in garden: {df_garden.columns.tolist()}")
df_garden.head(10)

In [None]:
# Use garden data for analysis (has harmonized country names)
# Work on a copy so the loaded frame stays untouched if a later cell modifies `df`.
df = df_garden.copy()

# Fail fast, with a readable message, if any column used by the analysis below is missing
# (for example after an upstream schema change) instead of hitting a KeyError mid-notebook.
required_columns = ['provider', 'country', 'provided_name', 'share_provided', 'year']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Garden dataset is missing expected columns: {missing_columns}")

df.head(10)

## Basic Statistics

In [None]:
# Headline numbers describing the size and coverage of the dataset.
banner = "=" * 80
share_column = df['share_provided']
year_column = df['year']

# (label, already-formatted value) pairs, printed one per line below.
overview_rows = [
    ("Total records", f"{len(df):,}"),
    ("Unique providers", df['provider'].nunique()),
    ("Unique countries", df['country'].nunique()),
    ("Unique equipment categories", df['provided_name'].nunique()),
    ("Year range", f"{year_column.min()} - {year_column.max()}"),
]

print(banner)
print("DATASET OVERVIEW")
print(banner)
for label, value in overview_rows:
    print(f"{label}: {value}")

# Summary statistics (count, mean, quartiles, ...) of the share column.
print("\nMarket Share Statistics:")
print(share_column.describe())

## Company Analysis

In [None]:
# Company overview: number of rows per provider (value_counts sorts most frequent first).
provider_counts = df['provider'].value_counts()
print("Companies in dataset:", provider_counts, sep="\n")

## Visualization 1: Company Market Dominance

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# observed=True: if `provider` is stored as a categorical column (TODO confirm dtype), only
# providers that actually occur are grouped, so no NaN/zero rows appear for unused
# categories and the result does not depend on the pandas version's default. For plain
# string columns the argument has no effect.
by_provider = df.groupby('provider', observed=True)

# Average market share by company (across categories they participate in)
company_avg_shares = by_provider['share_provided'].mean().sort_values(ascending=False).head(15)
company_avg_shares.plot(kind='barh', ax=ax1, color='steelblue')
ax1.set_title('Top 15 Companies by Average Market Share per Category', fontsize=14, fontweight='bold')
ax1.set_xlabel('Average Market Share (%)')
# barh draws the first row at the bottom; invert so the largest value is on top.
ax1.invert_yaxis()

# Number of equipment categories by company
categories_by_company = by_provider['provided_name'].nunique().sort_values(ascending=False).head(15)
categories_by_company.plot(kind='barh', ax=ax2, color='coral')
ax2.set_title('Number of Equipment Categories by Company', fontsize=14, fontweight='bold')
ax2.set_xlabel('Number of Categories')
ax2.invert_yaxis()

plt.tight_layout()
plt.show()

## Visualization 2: Equipment Categories

In [None]:
# The 20 equipment categories with the most rows in the dataset, most frequent first.
top_categories = df['provided_name'].value_counts().head(20)

fig, ax = plt.subplots(figsize=(12, 8))
top_categories.plot.barh(ax=ax, color='purple')
ax.set_title('Top 20 Equipment Categories by Data Points', fontsize=14, fontweight='bold')
ax.set_xlabel('Number of Data Points')
# barh draws the first row at the bottom; flip the axis so the largest bar sits on top.
ax.invert_yaxis()

fig.tight_layout()
plt.show()

## Visualization 3: Market Concentration

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Compute each summary statistic once; it is used for both the marker line and its legend label.
shares = df['share_provided']
median_share = shares.median()
mean_share = shares.mean()

# Distribution of market shares
ax1.hist(shares, bins=50, color='green', alpha=0.7, edgecolor='black')
ax1.axvline(median_share, color='red', linestyle='--', linewidth=2,
            label=f'Median: {median_share:.1f}%')
ax1.axvline(mean_share, color='blue', linestyle='--', linewidth=2,
            label=f'Mean: {mean_share:.1f}%')
ax1.set_title('Distribution of Market Shares', fontsize=14, fontweight='bold')
ax1.set_xlabel('Market Share (%)')
ax1.set_ylabel('Frequency')
ax1.legend()

# Top companies by number of dominant positions (>50% share)
# observed=True matters here because the frame is filtered first: if `provider` is a
# categorical column (TODO confirm dtype), providers without any >50% row would otherwise
# be counted as 0 and could appear in the "top 15". For string columns it has no effect.
company_dominance = (
    df[shares > 50]
    .groupby('provider', observed=True)
    .size()
    .sort_values(ascending=False)
    .head(15)
)
company_dominance.plot(kind='barh', ax=ax2, color='orange')
ax2.set_title('Top 15 Companies by Number of Dominant Positions (>50%)', fontsize=14, fontweight='bold')
ax2.set_xlabel('Number of Dominant Positions')
# barh draws the first row at the bottom; invert so the largest count is on top.
ax2.invert_yaxis()

plt.tight_layout()
plt.show()

## Monopolistic Positions Analysis

In [None]:
# Find monopolistic positions (>80% market share)
monopolistic = df[df['share_provided'] > 80].copy()

# value_counts() on a categorical column also lists categories that have no rows left after
# the filter (count 0). Drop those so only providers that actually hold such a position are
# shown. For a plain string column no zero counts exist and this is a no-op.
monopoly_counts = monopolistic['provider'].value_counts()
monopoly_counts = monopoly_counts[monopoly_counts > 0]

print(f"Number of monopolistic positions (>80% share): {len(monopolistic)}")
print("\nMonopolistic positions by company:")
print(monopoly_counts)

print("\nTop monopolistic companies:")
# Last expression of the cell: rendered as a rich table, highest shares first.
monopolistic.sort_values('share_provided', ascending=False)[['provider', 'country', 'provided_name', 'share_provided', 'year']].head(20)

In [None]:
# Visualize monopolistic positions by company
# `monopolistic` (rows with >80% share) is created in the preceding cell.
fig, ax = plt.subplots(figsize=(12, 6))

# Drop zero counts before taking the top 15: on a categorical `provider` column
# value_counts() also returns providers with no monopolistic rows, which would otherwise
# show up as empty bars. For a plain string column this filter changes nothing.
monopoly_companies = monopolistic['provider'].value_counts()
monopoly_companies = monopoly_companies[monopoly_companies > 0].head(15)
monopoly_companies.plot(kind='bar', ax=ax, color='crimson')
ax.set_title('Top 15 Companies with Monopolistic Positions (>80% market share)', fontsize=14, fontweight='bold')
ax.set_ylabel('Number of Monopolistic Markets')
ax.set_xlabel('')
# Rotate the tick labels on this axes explicitly rather than via the pyplot "current axes" state.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.show()

## Deep Dive: ASML - The Lithography Giant

In [None]:
# All rows where ASML is the provider, largest share first for the printed table.
is_asml = df['provider'] == 'ASML'
asml = df.loc[is_asml].sort_values('share_provided', ascending=False)

print("ASML Market Positions:")
print(asml[['provided_name', 'share_provided', 'year']])

# Re-sort ascending for the chart: barh draws the first row at the bottom, so the largest
# share ends up at the top. Bars are placed by position rather than by label.
asml_sorted = asml.sort_values('share_provided')
bar_positions = range(len(asml_sorted))

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(bar_positions, asml_sorted['share_provided'], color='darkblue')
ax.set_yticks(bar_positions)
ax.set_yticklabels(asml_sorted['provided_name'])
ax.set_xlabel('Market Share (%)')
ax.set_title('ASML Market Share by Equipment Category', fontsize=14, fontweight='bold')
# Reference line marking a 100% share.
ax.axvline(100, color='red', linestyle='--', alpha=0.5, label='100% monopoly')
ax.legend()

plt.tight_layout()
plt.show()

## Key Insights Summary

In [None]:
print("="*80)
print("KEY INSIGHTS")
print("="*80)

# Shared intermediates, computed once and reused by the sections below.
shares = df['share_provided']
dominant_positions = df[shares > 50]

print("\n1. MARKET CONCENTRATION")
print(f"   - Median market share: {shares.median():.1f}%")
print(f"   - Mean market share: {shares.mean():.1f}%")
print(f"   - Markets with >50% concentration: {len(dominant_positions)}")
# `monopolistic` (rows with >80% share) is created in the monopolistic-positions cell earlier in the notebook.
print(f"   - Markets with >80% concentration: {len(monopolistic)}")

print("\n2. COMPANY DOMINANCE")
print("   - Companies by average market share in categories they compete in:")
# observed=True: if `provider` is a categorical column (TODO confirm dtype), group only the
# providers that actually occur, independent of the pandas version's default. It has no
# effect on plain string columns.
top_3_companies = (
    df.groupby('provider', observed=True)['share_provided']
    .mean()
    .sort_values(ascending=False)
    .head(3)
)
for company, avg_share in top_3_companies.items():
    company_rows = df[df['provider'] == company]
    num_categories = company_rows['provided_name'].nunique()
    # The country is taken from the provider's first row.
    company_country = company_rows['country'].iloc[0]
    print(f"     • {company} ({company_country}): {avg_share:.1f}% average across {num_categories} categories")

print("\n3. TOP COMPANIES BY DOMINANCE")
print("   - Companies with most dominant positions (>50% market share):")
# The frame is filtered before grouping, so without observed=True a categorical `provider`
# column would contribute providers with a count of 0, which could be listed here when
# fewer than three providers hold a dominant position.
top_3_dominance = (
    dominant_positions.groupby('provider', observed=True)
    .size()
    .sort_values(ascending=False)
    .head(3)
)
for company, count in top_3_dominance.items():
    company_country = df[df['provider'] == company]['country'].iloc[0]
    print(f"     • {company} ({company_country}): {count} dominant positions")

print("\n4. CRITICAL DEPENDENCIES")
euv_share = df[(df['provider'] == 'ASML') & (df['provided_name'] == 'EUV lithography tools')]['share_provided'].values
# Only report the EUV figure when a matching row exists; the first match is used.
if len(euv_share) > 0:
    print(f"   - EUV lithography: ASML has {euv_share[0]:.0f}% market share")
print("   - Netherlands-based ASML controls critical lithography technology")
print("   - High concentration in advanced packaging and lithography tools")

## Your Turn: Explore Further

Use the cells below to explore specific questions:
- Which countries dominate specific equipment categories?
- How concentrated is the market for critical technologies?
- What is the competitive landscape for specific tools?

In [None]:
# Example: Explore a specific equipment category
category = 'EUV lithography tools'
category_data = df[df['provided_name'] == category]
print(f"\nMarket share for {category}:")
print(category_data[['provider', 'country', 'share_provided', 'year']])

In [None]:
# Your exploration here
