In [1]:
import itertools
import datetime
import pandas as pd
import numpy as np
import scipy.stats as sc
from sys import argv
import random
import math

In [2]:
def rankdata(a, method='average'):
    arr = np.ravel(np.asarray(a))
    algo = 'mergesort' if method == 'ordinal' else 'quicksort'
    sorter = np.argsort(arr, kind=algo)
    inv = np.empty(sorter.size, dtype=np.intp)
    inv[sorter] = np.arange(sorter.size, dtype=np.intp)
    arr = arr[sorter]
    obs = np.r_[True, arr[1:] != arr[:-1]]
    dense = obs.cumsum()[inv]
    # cumulative counts of each unique value
    count = np.r_[np.nonzero(obs)[0], len(obs)]
    # average method
    return .5 * (count[dense] + count[dense - 1] + 1)

def mannwhitneyu(x, y):
    x = np.asarray(x)
    y = np.asarray(y)
    n1 = len(x)
    n2 = len(y)
    ranked = rankdata(np.concatenate((x, y)))
    rankx = ranked[0:n1]  # get the x-ranks
    u1 = n1*n2 + (n1*(n1+1))/2.0 - np.sum(rankx, axis=0)  # calc U for x
    u2 = n1*n2 - u1  # remainder is U for y
    return(u2)

def variance_pi(x1,x2):
    n1 = len(x1)
    n2 = len(x2)
    wc_stat = mannwhitneyu(x1,x2)
    pi = wc_stat/(n1*n2)
    return(pi)

In [3]:
# Read in count data and metadata

datafile = '/mnt/c/Users/Zarko/Desktop/specs_tests/gt5samps/count_tpm/tpm_output.txt'
metafile = '/mnt/c/Users/Zarko/Desktop/specs_tests/gt5samps/control_gt5samples_lumpHEKs.tsv'

with open(datafile) as f:
    countdata = []
    for line in f:
        data = tuple(line.strip().split("\t"))
        countdata.append(data)

with open(metafile) as f:
    metadata = []
    for line in f:
        meta = tuple(line.strip().split("\t"))
        metadata.append(meta)

countdata_pd = pd.DataFrame(countdata[1:], columns=countdata[0])
countdata_pd.index = countdata_pd.loc[:,"gene_id"]
countdata_pd = countdata_pd.iloc[:,:-2]
countdata_pd = countdata_pd.astype('float')

In [4]:
# Determine unique cell types
celltypes = []
for i in range(len(metadata[0])):
    if metadata[0][i] == 'cell_type':
        index = i

for meta in metadata[1:]:
    celltypes.append(meta[index])

celltypes = np.array(celltypes)
celltypes_unique = np.unique(celltypes)

In [5]:
#calculate weight factor, in this case equally weighted
weight = 1/(len(celltypes_unique)-1)

nsamps = len(celltypes_unique)
nrna = len(countdata) - 1

#read names transcripts
mw_RNA = np.zeros((nrna, nsamps))
s_RNA = np.zeros((nrna, nsamps))
mwtest_RNA = pd.DataFrame(columns=celltypes_unique)

In [7]:
datetime.datetime.now()
i=0
#calculation of score for each transcript
for transcript in countdata[1:]:
    tmp_RNA = transcript[:-2]
    tmp_RNA = np.array([float(value) for value in tmp_RNA])

    j=0
    #run through next loop for each cancer type (d)
    for celltype in celltypes_unique:
        samp_indeces = np.where(celltypes == celltype)[0]      
        tmp_RNA_selected = tmp_RNA[samp_indeces]
        celltypes_non = celltypes_unique[celltypes_unique != celltype]

        ptot_dis = 0
        var_list = []
        p_list = []
        #compare with each cancer type not d (k)
        cov_matrix = np.zeros(shape=(len(celltypes_non),len(celltypes_non)))
        k=0
        for comp_celltype in celltypes_non:
            l=0
            comp_indeces = np.where(celltypes == comp_celltype)[0]
            tmp_RNA_comp = tmp_RNA[comp_indeces]
            wc_stat_kd = variance_pi(tmp_RNA_selected,tmp_RNA_comp)
            ptot_dis = ptot_dis + wc_stat_kd * weight
            p_list.append(wc_stat_kd)
        mw_RNA[i,j] = ptot_dis
        j += 1
    i += 1
datetime.datetime.now()

datetime.datetime(2022, 4, 4, 19, 17, 57, 108006)

In [6]:
datetime.datetime.now()
#run through next loop for each cancer type (d)
for celltype in celltypes_unique:
    celltypes_non = celltypes_unique[celltypes_unique != celltype]
    samp_indeces = np.where(celltypes == celltype)[0]        
    countdata_d = countdata_pd.iloc[:, samp_indeces]
    countdata_d.insert(len(countdata_d.columns), "I_kidj", [0]*len(countdata_d), True)
    #countdata_d.insert(len(countdata_d.columns), "I_kidj_temp", [0]*len(countdata_d), True)
    countdata_d.insert(len(countdata_d.columns), "ptot_dis", [0]*len(countdata_d), True)
    countdata_d.insert(len(countdata_d.columns), "I_kidj_norm", [0]*len(countdata_d), True)

    #compare with each cancer type not d (k)
    for comp_celltype in celltypes_non:
        comp_indeces = np.where(celltypes == comp_celltype)[0]
        countdata_k = countdata_pd.iloc[:, comp_indeces]
        countdata_d.loc[:,'I_kidj'] = [0]*len(countdata_d)
        for colk in countdata_k.columns:
            for cold in countdata_d.columns[:-3]:
#                comp_df = pd.DataFrame(colk: )
                countdata_d.loc[:,'I_kidj'] = np.where((countdata_k.loc[:,colk] < countdata_d.loc[:,cold]), (countdata_d.loc[:,'I_kidj'] + 1), countdata_d.loc[:,'I_kidj'])
    #            countdata_d.loc[:,'I_kidj'] = countdata_d.loc[:,'I_kidj_temp']
        countdata_d.loc[:,'I_kidj_norm'] = countdata_d.loc[:,'I_kidj']/(len(countdata_k.columns)*len(countdata_d.columns[:-3]))
        countdata_d.loc[:,'ptot_dis'] = countdata_d.loc[:,'ptot_dis'] + countdata_d.loc[:,'I_kidj_norm'] * weight
    mwtest_RNA[celltype] = countdata_d['ptot_dis']
datetime.datetime.now()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


datetime.datetime(2022, 4, 4, 18, 13, 40, 572715)

In [8]:
#writing output
rnas = []
for entry in countdata[1:]:
    rnas.append(entry[-2])

In [9]:
mw_RNA = pd.DataFrame(mw_RNA, index=rnas, columns=celltypes_unique)
mw_RNA.to_csv('/mnt/c/Users/Zarko/Desktop/specs_tests/gt5samps/specs_results_pall.txt', sep='\t', index=True, header=True)

mw_max_type = pd.concat([mw_RNA.max(axis=1),mw_RNA.idxmax(axis=1)], axis=1)
mw_max_type = mw_max_type.sort_values(by=[0,1], axis=0, ascending=False)
mw_max_type.to_csv('/mnt/c/Users/Zarko/Desktop/specs_tests/gt5samps/specs_results_onco_out.txt', sep='\t', index=True, header=False)

mw_min_type = pd.concat([mw_RNA.min(axis=1),mw_RNA.idxmin(axis=1)], axis=1)
mw_min_type = mw_min_type.sort_values(by=[0,1], axis=0, ascending=False)
mw_min_type.to_csv('/mnt/c/Users/Zarko/Desktop/specs_tests/gt5samps/specs_results_tumsup_out.txt', sep='\t', index=True, header=False)

In [10]:
mwtest_RNA.to_csv('/mnt/c/Users/Zarko/Desktop/specs_tests/gt5samps/specs_results_otheriter_pall.txt', sep='\t', index=True, header=True)

mw_max_type = pd.concat([mwtest_RNA.max(axis=1),mwtest_RNA.idxmax(axis=1)], axis=1)
mw_max_type = mw_max_type.sort_values(by=[0,1], axis=0, ascending=False)
mw_max_type.to_csv('/mnt/c/Users/Zarko/Desktop/specs_tests/gt5samps/specs_results_otheriter_onco_out.txt', sep='\t', index=True, header=False)

mw_min_type = pd.concat([mwtest_RNA.min(axis=1),mwtest_RNA.idxmin(axis=1)], axis=1)
mw_min_type = mw_min_type.sort_values(by=[0,1], axis=0, ascending=False)
mw_min_type.to_csv('/mnt/c/Users/Zarko/Desktop/specs_tests/gt5samps/specs_results_otheriter_tumsup_out.txt', sep='\t', index=True, header=False)