In [1]:
import pandas as pd
import numpy as np
import pickle
import json

from sklearn.manifold import TSNE
import plotly.express as px

# Common Metadata

In [2]:
tcga_oncontree = pd.read_csv('/home/fcarli/francisCelligner/tcga_oncotree_data_correct.csv')
ccle_oncontree = pd.read_csv('/home/fcarli/CellHit/data/metadata/Model.csv')

In [4]:
with open('/home/fcarli/francisCelligner/tissueMap.json','r') as f:
    mainTypeMap = json.load(f)


# Old Celligner

In [13]:
df = pd.read_feather('/home/fcarli/WebCellHit/data/transcriptomics/celligner_CCLE_TCGA_old.feather')

In [14]:
tcga_oncontree['mainType'] = tcga_oncontree['oncotree_code'].map(mainTypeMap)
ccle_oncontree['mainType'] = ccle_oncontree['OncotreeCode'].map(mainTypeMap)
tcga_oncontree.dropna(inplace=True,subset=['mainType'])
ccle_oncontree.dropna(inplace=True,subset=['mainType'])

tcga_mapper = dict(zip(tcga_oncontree['sample_id'],tcga_oncontree['mainType']))
ccle_mapper = dict(zip(ccle_oncontree['ModelID'],ccle_oncontree['mainType']))

In [15]:
tcga = df[df['Source']=='TCGA']
ccle = df[df['Source']=='CCLE']

In [16]:
#get all columns except the first two
genes = tcga.columns[2:-1]
genes

Index(['KLK13', 'RNF40', 'LRRC25', 'PITPNM2', 'SERF1A', 'CYP26C1', 'POLR3GL',
       'NMNAT3', 'FBXO22', 'PRB2',
       ...
       'EIF4G2', 'GTPBP2', 'GTPBP3', 'ZNF746', 'IQGAP1', 'PDILT', 'EPHA1',
       'RAB27B', 'HECA', 'C12orf71'],
      dtype='object', length=18173)

In [17]:
tcga['oncotree_code'] = tcga['index'].map(tcga_mapper)
ccle['oncotree_code'] = ccle['index'].map(ccle_mapper)
tcga.dropna(inplace=True,subset=['oncotree_code'])   
ccle.dropna(inplace=True,subset=['oncotree_code'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tcga['oncotree_code'] = tcga['index'].map(tcga_mapper)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ccle['oncotree_code'] = ccle['index'].map(ccle_mapper)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tcga.dropna(inplace=True,subset=['oncotree_code'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentati

In [18]:
overall_expression = pd.concat([tcga,ccle])[genes]
overall_expression['oncotree_code'] = list(tcga['oncotree_code'])+list(ccle['oncotree_code'])
overall_expression['Source'] = list(tcga['Source'])+list(ccle['Source'])

# Perform t-SNE
tsne = TSNE(n_components=2, random_state=42,n_jobs=32)
tsne_embedding = tsne.fit_transform(overall_expression[genes])

# Create a DataFrame with t-SNE results
tsne_df = pd.DataFrame(tsne_embedding, columns=['TSNE1', 'TSNE2'])
tsne_df['oncotree_code'] = overall_expression['oncotree_code'].values
tsne_df['Source'] = overall_expression['Source'].values


# Optionally, define a symbol map if you want specific markers for each Source
# For example:
symbol_map = {
    'TCGA': 'cross',
    'CCLE': 'circle',
    #'OtherSource': 'diamond'
}

# Create scatter plot with Plotly, adding symbols based on 'Source'
fig = px.scatter(
    tsne_df,
    x='TSNE1',
    y='TSNE2',
    color='oncotree_code',
    symbol='Source',  # Assign different markers based on 'Source'
    symbol_map=symbol_map,  # Uncomment if you defined a symbol_map
    title='t-SNE of Combined TCGA and CCLE Gene Expression Data',
    hover_data=['oncotree_code', 'Source']  # Optionally include in hover
)

# Customize the layout
fig.update_layout(
    legend_title_text='Oncotree Code and Source',  # Adjusted for multiple legends
    legend=dict(
        # You can customize legend layout here if needed
    ),
    width=1000,
    height=800,
)

# Optionally, customize the marker symbols and sizes further
fig.update_traces(marker=dict(size=6, line=dict(width=0.2, color='DarkSlateGrey')))

# Show the plot
fig.show()


# New Celligner

In [19]:
df = pd.read_feather('/home/fcarli/WebCellHit/data/transcriptomics/celligner_CCLE_TCGA_optimized_revised.feather')

In [20]:
tcga_oncontree['mainType'] = tcga_oncontree['oncotree_code'].map(mainTypeMap)
ccle_oncontree['mainType'] = ccle_oncontree['OncotreeCode'].map(mainTypeMap)
tcga_oncontree.dropna(inplace=True,subset=['mainType'])
ccle_oncontree.dropna(inplace=True,subset=['mainType'])

tcga_mapper = dict(zip(tcga_oncontree['sample_id'],tcga_oncontree['mainType']))
ccle_mapper = dict(zip(ccle_oncontree['ModelID'],ccle_oncontree['mainType']))

In [21]:
tcga = df[df['Source']=='TCGA']
ccle = df[df['Source']=='CCLE']

In [22]:
#get all columns except the first two
genes = tcga.columns[2:]
genes

Index(['GPR18', 'GK2', 'NXF5', 'AHCYL1', 'TMOD1', 'MEGF6', 'P2RY14', 'RCC2',
       'XRN1', 'SIGMAR1',
       ...
       'BMP4', 'RBMY1J', 'FXYD5', 'STK38', 'LSM8', 'ISCA2', 'IFNL2', 'INS',
       'DUX4', 'MKRN3'],
      dtype='object', length=18174)

In [23]:
tcga['oncotree_code'] = tcga['index'].map(tcga_mapper)
ccle['oncotree_code'] = ccle['index'].map(ccle_mapper)
tcga.dropna(inplace=True,subset=['oncotree_code'])   
ccle.dropna(inplace=True,subset=['oncotree_code'])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [24]:
overall_expression = pd.concat([tcga,ccle])[genes]
overall_expression['oncotree_code'] = list(tcga['oncotree_code'])+list(ccle['oncotree_code'])
overall_expression['Source'] = list(tcga['Source'])+list(ccle['Source'])

# Perform t-SNE
tsne = TSNE(n_components=2, random_state=42,n_jobs=32)
tsne_embedding = tsne.fit_transform(overall_expression[genes])

# Create a DataFrame with t-SNE results
tsne_df = pd.DataFrame(tsne_embedding, columns=['TSNE1', 'TSNE2'])
tsne_df['oncotree_code'] = overall_expression['oncotree_code'].values
tsne_df['Source'] = overall_expression['Source'].values


# Optionally, define a symbol map if you want specific markers for each Source
# For example:
symbol_map = {
    'TCGA': 'cross',
    'CCLE': 'circle',
    #'OtherSource': 'diamond'
}

# Create scatter plot with Plotly, adding symbols based on 'Source'
fig = px.scatter(
    tsne_df,
    x='TSNE1',
    y='TSNE2',
    color='oncotree_code',
    symbol='Source',  # Assign different markers based on 'Source'
    symbol_map=symbol_map,  # Uncomment if you defined a symbol_map
    title='t-SNE of Combined TCGA and CCLE Gene Expression Data',
    hover_data=['oncotree_code', 'Source']  # Optionally include in hover
)

# Customize the layout
fig.update_layout(
    legend_title_text='Oncotree Code and Source',  # Adjusted for multiple legends
    legend=dict(
        # You can customize legend layout here if needed
    ),
    width=1000,
    height=800,
)

# Optionally, customize the marker symbols and sizes further
fig.update_traces(marker=dict(size=6, line=dict(width=0.2, color='DarkSlateGrey')))

# Show the plot
fig.show()
