In [None]:
import sys
from firecloud import fiss
from google.cloud import storage
import firecloud.api as fapi
import os
import io
import json

# Scientific computing in python
import numpy as np
# Data visualizations
import matplotlib.pyplot as plt
# Data analysis tools and data structures like the DataFrame
import pandas as pd
# Statistical data visualization, site: https://seaborn.pydata.org/
import seaborn as sns

# Read the workspace context from environment variables. Indexing os.environ
# directly means a missing variable raises KeyError right here.
project = os.environ['WORKSPACE_NAMESPACE']
workspace = os.environ['WORKSPACE_NAME']
bucket = f"{os.environ['WORKSPACE_BUCKET']}/"  # stored with a trailing slash appended
google_project = os.environ['GOOGLE_PROJECT']

# Echo the captured values so they can be checked by eye
print(f"Terra Billing project: {project}")
print(f"Workspace: {workspace}")
print(f"Workspace storage bucket: {bucket}")
print(f"Google project: {google_project}")

In [None]:
from google.cloud import storage

def list_gcs_files(bucket_name, prefix):
    """Print the name of every object in a GCS bucket under a prefix.

    Args:
        bucket_name: Name of the bucket. A leading ``gs://`` scheme and
            trailing slashes are stripped, so a bucket URL such as
            ``gs://my-bucket/`` can be passed unchanged.
        prefix: Only objects whose names start with this string are listed.

    Returns:
        None. One object name is printed per line.
    """
    # Accept both "my-bucket" and "gs://my-bucket/" spellings.
    if bucket_name.startswith('gs://'):
        bucket_name = bucket_name[len('gs://'):]
    bucket_name = bucket_name.rstrip('/')

    client = storage.Client()

    # List through the client directly instead of client.get_bucket(): that
    # call issues an extra bucket-metadata request which needs the
    # storage.buckets.get permission, so it can fail for accounts that are
    # only allowed to list/read objects.
    for blob in client.list_blobs(bucket_name, prefix=prefix):
        print(blob.name)

# Placeholder values: substitute your own bucket name and directory prefix
bucket_name, prefix = (
    'fc-xxxxxx-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx',
    'working-set-redeposit',
)

list_gcs_files(bucket_name=bucket_name, prefix=prefix)

In [None]:
def create_folder(bucket_name, destination_folder_name):
    """Create an empty "folder" placeholder object in a GCS bucket.

    GCS has a flat object namespace; a folder is represented by a zero-byte
    object whose name ends in "/". A trailing slash is therefore appended
    when the caller omits it, so the result is a folder marker rather than
    an empty file with the folder's name.

    Args:
        bucket_name: Name of the target bucket.
        destination_folder_name: Folder path inside the bucket, with or
            without a trailing "/".

    Raises:
        ValueError: If ``destination_folder_name`` is empty.
    """
    if not destination_folder_name:
        raise ValueError('destination_folder_name must not be empty')

    folder_object_name = destination_folder_name
    if not folder_object_name.endswith('/'):
        folder_object_name += '/'

    storage_client = storage.Client()
    # bucket() builds a local handle without the bucket-metadata request that
    # get_bucket() performs; only the upload below talks to the service.
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(folder_object_name)

    # A zero-byte payload is all a folder placeholder needs.
    blob.upload_from_string('')

    print('Created {} .'.format(
        folder_object_name))

In [None]:
import hail as hl

In [None]:
# Initialise Hail with GRCh38 as the default reference genome.
# NOTE(review): idempotent=True is presumably what makes re-running this cell
# safe once Hail is already initialised - confirm against the Hail docs.
hl.init(
    idempotent=True,
    default_reference='GRCh38',
)

In [None]:
# Convert `plink_relatedness` to a pandas DataFrame and write it to GCS as
# CSV without the index column.
# NOTE(review): `plink_relatedness` is not defined in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel - confirm which
# step is supposed to produce it.
relatedness_table = plink_relatedness.to_pandas()
relatedness_table.to_csv(
    path_or_buf='gs://fc-secure-540f27be-97ea-4ffd-adb7-c195458eb278/relatedness_testing/plink_output_relatedness_table.txt',
    index=False,
)

In [None]:
# List the objects stored under the subset_163k relatedness prefix
bucket_name, prefix = (
    'fc-secure-540f27be-97ea-4ffd-adb7-c195458eb278',
    'relatedness_testing/subset_163k',
)

list_gcs_files(bucket_name=bucket_name, prefix=prefix)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import gcsfs

# Bucket and object path of the PLINK .genome output
bucket_name = 'fc-secure-540f27be-97ea-4ffd-adb7-c195458eb278'
prefix = 'relatedness_testing/subset_163k/relatedness_output.genome'
url = '/'.join((bucket_name, prefix))

# GCS filesystem handle (token='cloud' works in Terra notebooks)
fs = gcsfs.GCSFileSystem(token='cloud')

# Read only the first 1,000,000 rows so the frame stays manageable; the
# regex separator splits on runs of whitespace.
# NOTE(review): every statistic below therefore describes this chunk only,
# not the full .genome file.
with fs.open(url) as genome_handle:
    df = pd.read_csv(
        genome_handle,
        sep=r'\s+',  # PLINK genome files are space-delimited
        nrows=1_000_000,
    )

# Show the parsed header and the first rows as a sanity check
print("Columns:", list(df.columns))
print(df.head(5))

# ------------------------------------------------------------
# Histogram of PI_HAT over the loaded pairs
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['PI_HAT'], bins=50, edgecolor='black')
ax.set_title('Distribution of PI_HAT (Proportion of IBD)')
ax.set_xlabel('PI_HAT')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

# Scatter plots of Z0 against Z1 and then Z2, one figure per pairing
for z_col, z_title in (
    ('Z1', 'Scatter Plot of Z0 vs Z1'),
    ('Z2', 'Scatter Plot of Z0 vs Z2'),
):
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(df['Z0'], df[z_col], alpha=0.5)
    ax.set_title(z_title)
    ax.set_xlabel('Z0')
    ax.set_ylabel(z_col)
    ax.grid(True)
    plt.show()

# ------------------------------------------------------------
# Relationship categorization based on PI_HAT
def categorize_relationship(row):
    """Label a pair with a relationship category based on its PI_HAT value.

    Thresholds (upper bounds inclusive):
        PI_HAT > 0.9        -> 'Twins/Duplicates'
        0.4 < PI_HAT <= 0.9 -> 'First-degree'
        0.2 < PI_HAT <= 0.4 -> 'Second-degree'
        0.1 < PI_HAT <= 0.2 -> 'Third-degree'
        otherwise           -> 'Unrelated' (includes NaN, which fails
                               every comparison)

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a 'PI_HAT' entry.

    Returns:
        The category label as a string.
    """
    pi_hat = row['PI_HAT']
    if pi_hat > 0.9:
        return 'Twins/Duplicates'
    # No separate cut-off at 0.6: a pair with 0.6 < PI_HAT <= 0.9 is highly
    # related and must not fall through to 'Unrelated', so it is grouped with
    # the nearest category below the duplicate threshold.
    if pi_hat > 0.4:
        return 'First-degree'
    if pi_hat > 0.2:
        return 'Second-degree'
    if pi_hat > 0.1:
        return 'Third-degree'
    return 'Unrelated'

# Classify every pair. categorize_relationship only reads 'PI_HAT', so map
# over that single column instead of df.apply(axis=1), which materialises a
# Series object for each row and is very slow on a frame of this size.
df['relationship'] = df['PI_HAT'].map(
    lambda pi_hat: categorize_relationship({'PI_HAT': pi_hat})
)

# Get unique individuals in each category
def unique_individuals_in_category(df, category):
    """Return the set of individual IDs that occur in pairs of one category.

    Args:
        df: DataFrame with 'relationship', 'IID1' and 'IID2' columns.
        category: Value of 'relationship' to select.

    Returns:
        A set holding every distinct ID found in either 'IID1' or 'IID2'
        of the selected rows; empty when no row matches.
    """
    in_category = df['relationship'] == category
    # Flatten both ID columns into one array before de-duplicating.
    id_pairs = df.loc[in_category, ['IID1', 'IID2']].to_numpy()
    return set(pd.unique(id_pairs.ravel('K')))

# Categories that count as "related", in reporting order
categories = ['Twins/Duplicates', 'First-degree', 'Second-degree', 'Third-degree']
unique_individuals_dict = {
    label: unique_individuals_in_category(df, label) for label in categories
}

# Anyone who appears in a pair but in none of the related categories is
# treated as unrelated. An individual can belong to several related
# categories at once, so the per-category counts are not mutually exclusive.
all_related = set()
for members in unique_individuals_dict.values():
    all_related |= members
all_individuals = set(df['IID1']) | set(df['IID2'])
unrelated_individuals = all_individuals - all_related
unique_individuals_dict['Unrelated'] = unrelated_individuals

# Number of distinct individuals per category
unique_counts = {}
for label, members in unique_individuals_dict.items():
    unique_counts[label] = len(members)

# Print summary
for category, count in unique_counts.items():
    print(f'{category}: {count} unique individuals')

# Bar chart of the per-category counts
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(list(unique_counts.keys()), list(unique_counts.values()), edgecolor='black')
ax.set_title('Unique Individuals in Relationship Categories')
ax.set_xlabel('Relationship Type')
ax.set_ylabel('Count')
ax.grid(True)
plt.show()

In [None]:
# Tabulate the per-category counts: one row per category, category name first
counts_df = pd.DataFrame(
    list(unique_counts.items()),
    columns=['Relationship Category', 'Number of Unique Individuals'],
)

# Print the table without the row index
print("\nSummary Table: Unique Individuals per Relationship Category")
print(counts_df.to_string(index=False))