In [None]:
!pip install datasets


In [None]:
from datasets import load_dataset

# Open the training split of Tahoe-100M in streaming mode
dataset = load_dataset("vevotx/Tahoe-100M", split="train", streaming=True)

# One shared iterator over the stream; later cells keep reading from it
dataset_iter = iter(dataset)

# Pull the first `subset_size` records off the iterator, one at a time
subset_size = 1000
subset = []
while len(subset) < subset_size:
    subset.append(next(dataset_iter))


In [None]:
import pandas as pd

# Build a DataFrame with one row per streamed record
df_subset = pd.DataFrame(subset)

# Display basic information.
# DataFrame.info() writes its summary straight to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df_subset.info()
print(df_subset.head())


In [None]:
# Read the next 100 records from the shared stream iterator and gather every
# distinct entry found in their 'genes' field.
gene_set = set()
rows_left = 100
while rows_left > 0:
    record = next(dataset_iter)
    gene_set |= set(record['genes'])
    rows_left -= 1

# Materialise the set as a list and print the first ten entries
gene_list = [gene for gene in gene_set]
print("Sample gene IDs:", gene_list[:10])


In [None]:
# Load gene metadata table using streaming
gene_meta = load_dataset("vevotx/Tahoe-100M", name="gene_metadata", split="train", streaming=True)
gene_meta_iter = iter(gene_meta)

# Peek at the first item to see what keys are available
first_item = next(gene_meta_iter)
print("Available keys:", first_item.keys())

# Start a fresh iterator so the peeked row is included in the map below
gene_meta_iter = iter(gene_meta)

# Key the map by the numeric token ID. The metadata rows expose
# 'gene_symbol', 'ensembl_id' and 'token_id' (there is no 'id' field).
id_field = 'token_id'
if id_field not in first_item:
    # Fail with an actionable message instead of a bare KeyError inside the loop
    raise KeyError(
        f"Field {id_field!r} not found in gene metadata; available keys: {list(first_item.keys())}"
    )

# Collect up to the first 100 metadata rows. zip() with range() stops cleanly
# after 100 rows, or earlier if the stream is shorter.
gene_id_map = {}
for _, item in zip(range(100), gene_meta_iter):
    gene_id_map[item[id_field]] = item  # Maps numeric ID to metadata

# Show a few mapped gene names
for gid in list(gene_id_map.keys())[:4]:  # Just show the first 4 we collected
    gene_info = gene_id_map[gid]
    print(f"Gene ID {gid}: {gene_info}")

In [None]:
# Stream the gene metadata configuration and obtain an iterator over its rows
gene_meta = load_dataset("vevotx/Tahoe-100M", name="gene_metadata", split="train", streaming=True)
gene_meta_iter = iter(gene_meta)

# Rows expose 'gene_symbol', 'ensembl_id' and 'token_id'.
# Index the first 100 rows by their 'ensembl_id'.
gene_id_map = {}
fetched = 0
while fetched < 100:
    record = next(gene_meta_iter)
    gene_id_map[record['ensembl_id']] = record  # ensembl_id -> full metadata row
    fetched += 1

# Preview the first four entries, in the order they were inserted
keys_list = list(gene_id_map)
for position, key in enumerate(keys_list[:4], start=1):
    meta = gene_id_map[key]
    print(f"Entry {position}:")
    print(f"  Ensembl ID: {key}")
    print(f"  Gene Symbol: {meta['gene_symbol']}")
    print(f"  Token ID: {meta['token_id']}")
    print()

In [None]:
# NOTE: this cell repeats the same steps as the cell before it (re-opens the
# metadata stream and rebuilds the same ensembl_id -> metadata map).
gene_meta = load_dataset("vevotx/Tahoe-100M", name="gene_metadata", split="train", streaming=True)
gene_meta_iter = iter(gene_meta)

# Take the first 100 metadata rows, then key them by 'ensembl_id'
# (the rows also carry 'gene_symbol' and 'token_id').
first_hundred = [next(gene_meta_iter) for _ in range(100)]
gene_id_map = {entry['ensembl_id']: entry for entry in first_hundred}

# Print the first four entries of the map
keys_list = [key for key in gene_id_map]
for number, (key, entry) in enumerate(list(gene_id_map.items())[:4], 1):
    print(f"Entry {number}:")
    print(f"  Ensembl ID: {key}")
    print(f"  Gene Symbol: {entry['gene_symbol']}")
    print(f"  Token ID: {entry['token_id']}")
    print()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Hard-coded metadata for four genes: (Ensembl ID, gene symbol, token ID)
gene_rows = [
    ('ENSG00000000003', 'TSPAN6', 3),
    ('ENSG00000000005', 'TNMD', 4),
    ('ENSG00000000419', 'DPM1', 5),
    ('ENSG00000000457', 'SCYL3', 6),
]
ensembl_ids = [row[0] for row in gene_rows]
gene_symbols = [row[1] for row in gene_rows]
token_ids = [row[2] for row in gene_rows]

fig, ax = plt.subplots(figsize=(10, 6))

# One horizontal bar per gene, labelled on the y axis with its symbol
y_pos = np.arange(len(gene_symbols))
ax.barh(y_pos, token_ids, align='center', alpha=0.7, color='skyblue')
ax.set_yticks(y_pos)
ax.set_yticklabels(gene_symbols)

# Annotate each bar: the Ensembl ID is drawn left of x=0 (data coordinates),
# and the token ID value just past the end of the bar.
for bar_index, (ensembl, token) in enumerate(zip(ensembl_ids, token_ids)):
    ax.text(-0.5, bar_index, ensembl, ha='right', va='center', fontsize=8, color='gray')
    ax.text(token + 0.1, bar_index, str(token), ha='left', va='center')

# Axis label and title
ax.set(xlabel='Token ID', title='Gene Token ID Visualization')

# Fit everything inside the figure and render it
fig.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Replace with your gene of interest
gene_of_interest = 'ENSG00000000419'  # DPM1

# Resolve the gene's token ID and symbol from the metadata map.
# gene_id_map is expected to be keyed by Ensembl ID (as built in the metadata cell),
# so a direct dictionary lookup replaces scanning every entry.
info = gene_id_map.get(gene_of_interest)
gene_token_id = None
if info is not None:
    gene_token_id = str(info['token_id'])
    gene_symbol = info['gene_symbol']

# Compare against None explicitly rather than relying on truthiness of the token.
if gene_token_id is not None:
    # Walk each record's parallel 'genes' / 'expressions' sequences and pick the
    # expression at the position of the first matching gene token.
    # Tokens are compared as strings so the match works whether 'genes' stores
    # integers or strings (NOTE(review): confirm the dtype of 'genes').
    matched_labels = []
    expression_values = []
    for label, genes, expressions in zip(df_subset.index, df_subset['genes'], df_subset['expressions']):
        # zip() stops at the shorter sequence, so a length mismatch cannot raise
        for gene, value in zip(genes, expressions):
            if str(gene) == gene_token_id:
                matched_labels.append(label)
                expression_values.append(value)
                break

    # Rows of df_subset in which the gene was found
    gene_expressions = df_subset.loc[matched_labels]

    if expression_values:
        # Plot the distribution
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.hist(expression_values, bins=30, color='skyblue', edgecolor='black')
        ax.set_title(f'Expression Distribution of {gene_symbol} ({gene_of_interest})')
        ax.set_xlabel('Expression Level')
        ax.set_ylabel('Frequency')
        ax.grid(axis='y', alpha=0.75)
        plt.show()
    else:
        # Say so explicitly instead of drawing an empty histogram
        print(
            f"Gene {gene_of_interest} (token {gene_token_id}) was not found "
            f"in any of the {len(df_subset)} sampled records."
        )
else:
    print(f"Gene {gene_of_interest} not found in metadata.")

In [None]:
import matplotlib.pyplot as plt

# Token ID of the gene to inspect, as it appears in the 'genes' column.
# In the hard-coded example data of the bar-chart cell, token 6 is SCYL3.
gene_of_interest = '6'

# For each record, find the position of the gene token in 'genes' and read the
# value at the same position in 'expressions'. The position must come from
# 'genes'; searching 'expressions' for the token looks in the wrong sequence.
# Tokens are compared as strings so the match works whether 'genes' stores
# integers or strings (NOTE(review): confirm the dtype of 'genes').
token_key = str(gene_of_interest)
matched_labels = []
expression_values = []
for label, genes, expressions in zip(df_subset.index, df_subset['genes'], df_subset['expressions']):
    for gene, value in zip(genes, expressions):
        if str(gene) == token_key:
            matched_labels.append(label)
            expression_values.append(value)
            break

# Rows of df_subset in which the gene token occurs
gene_expressions = df_subset.loc[matched_labels]

if not expression_values:
    print(f"Gene token {gene_of_interest} was not found in any of the {len(df_subset)} sampled records.")

# Plot the distribution
plt.hist(expression_values, bins=30)
plt.title(f'Expression Distribution of {gene_of_interest}')
plt.xlabel('Expression Level')
plt.ylabel('Frequency')
plt.show()
