In [1]:
import pandas as pd
import random
import itertools
import numpy as np
import time 
import pickle
from scipy.stats import ttest_ind, chi2_contingency, fisher_exact, ranksums

1. Split single tumor cells from the naive dataset into 2 groups: high basal signature (top 25% of tumor cells) and low basal signature (bottom 25% of tumor cells). Perform a two-sided Wilcoxon rank-sum test between those two cell groups to determine in which one TACSTD2 has higher expression.
2. Mean expression value of TACSTD2 in naive tumor cells, naive ductal cells, treated tumor cells, treated ductal cells. 
3. Number of cells that have non-zero expression of TACSTD2 in naive tumor cells, treated tumor cells, naive ductal cells, and treated ductal cells. If you happen to know the total number of cells for naive tumor/treated tumor/naive ductal/treated ductal, so that I can calculate a proportion, that would be awesome.

These analyses give us further evidence on whether TACSTD2 is a big target in pancreatic cancer, and whether it is highly enriched in the basal-like group.

In [2]:
# Gene symbol analysed throughout the notebook; must match a name in the gene/column lists.
gene = "TACSTD2"

In [9]:
# Column names for the naive expression matrix, taken from the first (only) column
# of a CSV that has no header row.
columns_naive = list(pd.read_csv('../data/columns for X.csv', header=None)[0])

# Row labels for the naive matrix; the first column is lower-cased so that later
# comparisons against 'tumor' / 'ductal' match.
rows_naive = pd.read_csv('../data/rows for X.csv', header=None)
cell_type_naive = [label.lower() for label in rows_naive[0]]

In [4]:
# Column names for the treated expression matrix, taken from the first (only) column
# of a CSV that has no header row.
columns_treated = list(pd.read_csv('../data/treated snRNA gene list.csv', header=None)[0])

# Row labels for the treated matrix; the first column is lower-cased so that later
# comparisons against 'tumor' / 'ductal' match.
rows_treated = pd.read_csv('../data/treated snRNA cell types list.csv', header=None)
cell_type_treated = [label.lower() for label in rows_treated[0]]

### Config - choose one dataset (naive or treated)

In [53]:
# Dataset selector: set to 'naive' or 'treated'.
# A single switch in a single cell means Restart & Run All analyses exactly the
# dataset named here, instead of whichever alternative config cell ran last.
name = 'naive'

# Inputs for each dataset: the expression-matrix CSV, the basal-score CSV, and the
# row labels / column names loaded in the cells above.
dataset_configs = {
    'treated': {
        'single_cell_file': '../data/X treated.csv',
        'basal_file': 'outputs/basal_gene_scores_treated.csv',
        'cell_type': cell_type_treated,
        'columns': columns_treated,
    },
    'naive': {
        'single_cell_file': '../data/X.csv',
        'basal_file': 'outputs/basal_gene_scores_naive.csv',
        'cell_type': cell_type_naive,
        'columns': columns_naive,
    },
}

# Fail loudly on a typo rather than carrying on with stale variables from a previous run.
if name not in dataset_configs:
    raise ValueError('name must be one of %s, got %r' % (sorted(dataset_configs), name))

# Expose the selected settings under the names the rest of the notebook reads.
selected_config = dataset_configs[name]
single_cell_file = selected_config['single_cell_file']
basal_file = selected_config['basal_file']
cell_type = selected_config['cell_type']
columns = selected_config['columns']

## Read data

In [None]:
# Read only the column for `gene` from the expression matrix; `columns` supplies
# the column names for the file.
df_gene = pd.read_csv(single_cell_file, usecols=[gene], names=columns)

# Basal-score table with the gene's expression attached as raw values (positional,
# no index alignment) - assumes both files list rows in the same order; TODO confirm.
df_basal = pd.read_csv(basal_file)
df_basal[gene] = df_gene[gene].values

# Keep score and expression for the rows whose cell_type is 'tumor'.
tumor_rows = df_basal['cell_type'] == 'tumor'
df_basal_tumor_gene = df_basal.loc[tumor_rows, ['score', gene]]

In [None]:
# Attach the lower-cased cell-type label of each row to the expression frame.
# Re-running this cell is harmless: it rebuilds the same column from `cell_type`.
df_gene = df_gene.assign(cell_type=cell_type)

In [None]:
# Number of rows per distinct cell-type label (sanity check of the labels just attached).
df_gene['cell_type'].value_counts()

## Statistical Testing

In [None]:
# Split tumor cells by basal score into the top quartile (score >= 75th percentile)
# and the bottom quartile (score <= 25th percentile), keeping each cell's expression
# of `gene`. Both thresholds come from one percentile call on the same scores.
tumor_scores = df_basal_tumor_gene['score']
score_q25, score_q75 = np.percentile(tumor_scores, [25, 75])
high_values = df_basal_tumor_gene.loc[tumor_scores >= score_q75, gene].values
low_values = df_basal_tumor_gene.loc[tumor_scores <= score_q25, gene].values

# Wilcoxon rank-sum test, low-basal vs high-basal group (two-sided by default).
# The statistic is kept under a real name instead of `_`: its sign gives the direction
# (negative when the first sample, low_values, tends to rank lower), and assigning to
# `_` would shadow IPython's last-output variable.
ranksums_stat, ranksums_pval = ranksums(low_values, high_values)

In [None]:
# Display the p-value of the rank-sum test between the low- and high-score groups.
ranksums_pval

In [None]:
# Mean expression of `gene` in the high-basal-score tumor group.
np.mean(high_values)

In [None]:
# Mean expression of `gene` in the low-basal-score tumor group.
np.mean(low_values)

## Mean values and cell counts by cell type

In [None]:
# Mean expression of `gene` over rows labelled 'ductal', rounded to 4 decimals.
ductal_mask = df_gene['cell_type'] == 'ductal'
np.round(df_gene.loc[ductal_mask, gene].mean(), 4)

In [None]:
# Expression of `gene` for rows labelled 'ductal', filtered once and reused below.
ductal_expression = df_gene.loc[df_gene['cell_type'] == 'ductal', gene]

# Cells with exactly zero expression, and the total number of ductal cells.
zero_count_ductal = np.sum(ductal_expression == 0)
rows_ductal = ductal_expression.shape[0]

# The analysis goal asks for cells with NON-zero expression and their proportion, so
# report the complement as well. Any missing value would land in the non-zero count;
# presumably the matrix has none - verify if that matters.
nonzero_count_ductal = rows_ductal - zero_count_ductal

print('ductal cells with zero expression:', zero_count_ductal)
print('ductal cells total:', rows_ductal)
print('ductal zero-expression fraction:', np.round(zero_count_ductal/rows_ductal,4))
print('ductal cells with non-zero expression:', nonzero_count_ductal)
print('ductal non-zero-expression fraction:', np.round(nonzero_count_ductal/rows_ductal,4))

In [None]:
# Mean expression of `gene` over rows labelled 'tumor', rounded to 4 decimals.
tumor_mask = df_gene['cell_type'] == 'tumor'
np.round(df_gene.loc[tumor_mask, gene].mean(), 4)

In [None]:
# Expression of `gene` for rows labelled 'tumor', filtered once and reused below.
tumor_expression = df_gene.loc[df_gene['cell_type'] == 'tumor', gene]

# Cells with exactly zero expression, and the total number of tumor cells.
zero_count_tumor = np.sum(tumor_expression == 0)
rows_tumor = tumor_expression.shape[0]

# The analysis goal asks for cells with NON-zero expression and their proportion, so
# report the complement as well. Any missing value would land in the non-zero count;
# presumably the matrix has none - verify if that matters.
nonzero_count_tumor = rows_tumor - zero_count_tumor

print('tumor cells with zero expression:', zero_count_tumor)
print('tumor cells total:', rows_tumor)
print('tumor zero-expression fraction:', np.round(zero_count_tumor/rows_tumor,4))
print('tumor cells with non-zero expression:', nonzero_count_tumor)
print('tumor non-zero-expression fraction:', np.round(nonzero_count_tumor/rows_tumor,4))