In [119]:
import numpy as np 
import pandas as pd
from synapseclient import Synapse
import numpy.ma as ma

In [120]:
def cutMissingValues(data, threshold):
    ''' Drop the rows, and then the columns, of a DataFrame that have more
    missing values than the threshold allows.

    Rows are filtered first. Columns are then judged only on the rows that
    survived, and their allowance is computed from that surviving row count,
    so a column that is mostly empty in the remaining data is not kept just
    because many rows were removed before it was inspected.

    Inputs:
        data: pandas DataFrame to be cut. It is not modified.
        threshold: fraction (between 0 and 1) of values that must be present
            for a row/column to be kept. A row/column is cut only when it has
            MORE than (1 - threshold) * size missing values, so threshold=1.0
            keeps exactly the rows/columns with no missing values.

    Returns: a new DataFrame holding the surviving rows and columns, with the
    original row/column labels, label names and column dtypes.

    Raises: ValueError if threshold is outside the range [0, 1].

    Side effect: prints the shape of the data before and after cutting. '''

    # An out-of-range threshold (e.g. 90 instead of 0.9) would otherwise
    # silently drop everything or nothing.
    if not 0 <= threshold <= 1:
        raise ValueError(
            "threshold must be a fraction between 0 and 1, got %r" % (threshold,)
        )

    print(data.shape)

    max_missing_fraction = 1 - threshold

    # isna() also understands None / NaT in non-numeric columns, where an
    # element-wise np.isnan call would raise a TypeError.
    is_missing = data.isna().to_numpy(dtype=bool)

    # cut along genes (rows)
    n_cols = is_missing.shape[1]
    keep_rows = is_missing.sum(axis=1) <= max_missing_fraction * n_cols

    # cut along cell lines (columns), counted over the surviving rows only
    missing_in_kept_rows = is_missing[keep_rows]
    n_kept_rows = missing_in_kept_rows.shape[0]
    keep_cols = missing_in_kept_rows.sum(axis=0) <= max_missing_fraction * n_kept_rows

    # Positional boolean selection returns a new frame, works with duplicate
    # labels and carries the original index/column labels along.
    df_labeled = data.iloc[keep_rows, keep_cols]
    print(df_labeled.shape)

    return df_labeled
    

In [121]:
# Credentials are deliberately NOT written in the notebook: login() without
# arguments uses the Synapse credentials stored on this machine (for example
# the ~/.synapseConfig file). Any password that was previously committed in
# this cell should be treated as compromised and changed.
syn = Synapse()
syn.login()

# Each Synapse entity is downloaded by syn.get() and read from its local path;
# the first CSV column is used as the row index.
meth_data = pd.read_csv(syn.get('syn21303732').path, index_col = 0)
copy_data = pd.read_csv(syn.get('syn21303730').path, index_col = 0)
gene_data = pd.read_csv(syn.get('syn21303731').path, index_col = 0)

Welcome, NilayShah!



In [122]:
# Cut meth_data with a 0.9 threshold; the returned frame is the cell output.
cutMissingValues(data=meth_data, threshold=0.9)

(12937, 807)
(11402, 765)


Unnamed: 0,22RV1_PROSTATE,2313287_STOMACH,42MGBA_CENTRAL_NERVOUS_SYSTEM,5637_URINARY_TRACT,639V_URINARY_TRACT,647V_URINARY_TRACT,697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,769P_KIDNEY,786O_KIDNEY,8305C_THYROID,...,WSUDLCL2_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,YAPC_PANCREAS,YD10B_UPPER_AERODIGESTIVE_TRACT,YD15_SALIVARY_GLAND,YD38_UPPER_AERODIGESTIVE_TRACT,YD8_UPPER_AERODIGESTIVE_TRACT,YH13_CENTRAL_NERVOUS_SYSTEM,YKG1_CENTRAL_NERVOUS_SYSTEM,ZR751_BREAST,ZR7530_BREAST
1,0.788070,0.117720,0.91525,0.023920,0.769500,0.16228,0.931820,0.378840,0.33363,0.673350,...,0.98150,0.60960,0.647700,0.393850,0.07185,0.651590,0.913310,0.46315,0.82355,0.74650
14,0.023930,0.014320,0.00128,0.000170,0.008980,0.01691,0.041700,0.000690,0.02311,0.004000,...,0.00000,0.00350,0.004410,0.002830,0.00059,0.000760,0.004280,0.00779,0.00953,0.00534
15,0.252365,0.414615,0.38853,0.418785,0.491585,0.32605,0.518945,0.436735,0.48101,0.211985,...,0.37815,0.45394,0.441405,0.460635,0.40334,0.430215,0.348515,0.27263,0.48338,0.49779
16,0.000000,0.000000,0.08046,0.056800,0.037170,0.09474,0.520020,0.033330,0.18332,0.012350,...,0.00000,0.02186,0.036970,0.028170,0.00000,0.034550,0.000000,0.02469,0.00565,0.00000
18,0.059850,0.229000,0.16486,0.156600,0.105550,0.00000,0.398150,0.117540,0.30590,0.175500,...,0.05157,0.03464,0.180720,0.232680,0.07590,0.217230,0.028910,0.06284,0.01527,0.05626
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100874051,0.021610,0.004800,0.90980,0.719400,0.131730,0.01513,0.832500,0.584970,0.31818,0.004270,...,0.31442,0.52803,0.505860,0.034700,0.40250,0.656690,0.121230,0.55464,0.12693,0.01110
100874054,0.023700,0.169900,0.27832,0.263620,0.340860,0.35266,0.344720,0.336490,0.35780,0.291440,...,0.25550,0.25478,0.352320,0.319050,0.24735,0.308000,0.150190,0.20859,0.26786,0.25072
100885848,0.018390,0.036670,0.00000,0.000090,0.033540,0.00000,0.146670,0.006580,0.00935,0.006070,...,0.00175,0.00689,0.002690,0.000000,0.00450,0.001230,0.013860,0.01076,0.00761,0.00186
100885850,0.000000,0.015600,0.01379,0.000420,0.015650,0.00000,0.096360,0.005270,0.00571,0.005870,...,0.00000,0.00708,0.007970,0.001840,0.00949,0.008790,0.006420,0.00897,0.00702,0.00340


In [123]:
# Cut copy_data with a 0.9 threshold; the returned frame is the cell output.
cutMissingValues(data=copy_data, threshold=0.9)

(12937, 807)
(12937, 807)


Unnamed: 0,22RV1_PROSTATE,2313287_STOMACH,42MGBA_CENTRAL_NERVOUS_SYSTEM,5637_URINARY_TRACT,639V_URINARY_TRACT,647V_URINARY_TRACT,697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,769P_KIDNEY,786O_KIDNEY,8305C_THYROID,...,WSUDLCL2_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,YAPC_PANCREAS,YD10B_UPPER_AERODIGESTIVE_TRACT,YD15_SALIVARY_GLAND,YD38_UPPER_AERODIGESTIVE_TRACT,YD8_UPPER_AERODIGESTIVE_TRACT,YH13_CENTRAL_NERVOUS_SYSTEM,YKG1_CENTRAL_NERVOUS_SYSTEM,ZR751_BREAST,ZR7530_BREAST
1,-0.0762,-0.0331,0.0346,-0.0324,0.0797,0.2934,0.0661,0.0463,-0.2432,0.4662,...,0.0057,-0.5882,-0.5956,-0.5522,-0.0281,-0.1666,0.5718,0.2424,0.1895,0.4302
9,0.5510,-0.0018,-0.0394,-0.5158,-0.3367,-0.2273,0.0762,0.0753,-0.7908,-0.4753,...,0.5566,-0.5760,-0.6030,-0.5393,-0.0504,-0.7045,0.0954,0.2484,-0.1390,-0.8947
14,-0.0040,-0.0362,0.2935,0.0219,-0.3293,-0.2808,-0.0031,0.0312,0.1389,-0.0477,...,-0.0352,0.0161,-0.0924,0.0457,0.0945,-0.1581,-0.3405,-0.0276,-0.0308,-0.2358
15,-0.0478,-0.0029,0.1513,0.6126,-0.0969,0.0690,0.0218,0.1109,0.1658,-0.0358,...,0.0051,0.4052,-0.0552,0.7502,0.0468,-0.1307,0.1537,0.2615,0.7059,-0.2805
16,-0.0739,-0.0411,-0.0844,-0.5211,-0.1711,0.2547,0.0135,0.0345,0.1361,0.0225,...,0.0062,-0.1226,-0.0431,0.0295,0.0627,0.2281,0.0695,0.1103,0.8304,-0.7924
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100874051,0.5510,-0.0018,-0.1097,-0.5158,-0.3367,-0.2273,0.0762,0.0753,-0.7908,-0.6080,...,0.5566,-0.5760,-0.6030,-0.5393,-0.0504,-0.7045,0.0954,0.2484,-0.1390,-0.8947
100874054,0.5268,0.5451,-0.3028,0.0783,-0.2570,0.2858,-0.0088,0.0551,0.1829,-0.1625,...,0.0134,0.0538,0.2786,0.1021,0.4363,-0.0662,0.0958,0.2692,1.0386,0.7997
100885848,-0.0404,-0.0381,-0.2949,0.5999,-0.0399,0.1065,0.0232,0.1124,-0.1453,-0.0162,...,0.0354,0.4212,-0.0336,0.0377,-0.4544,-0.1240,0.1274,0.2265,-0.0682,-0.8426
100885850,-0.0404,-0.0381,-0.2949,0.5999,-0.0399,0.1065,0.0232,0.1124,-0.1453,-0.0162,...,0.0354,0.4212,-0.0336,0.0377,-0.4544,-0.1240,0.1274,0.2265,-0.0682,-0.8426


In [124]:
# Cut gene_data with a 0.9 threshold; the returned frame is the cell output.
cutMissingValues(data=gene_data, threshold=0.9)

(12937, 807)
(12937, 807)


Unnamed: 0,22RV1_PROSTATE,2313287_STOMACH,42MGBA_CENTRAL_NERVOUS_SYSTEM,5637_URINARY_TRACT,639V_URINARY_TRACT,647V_URINARY_TRACT,697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,769P_KIDNEY,786O_KIDNEY,8305C_THYROID,...,WSUDLCL2_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,YAPC_PANCREAS,YD10B_UPPER_AERODIGESTIVE_TRACT,YD15_SALIVARY_GLAND,YD38_UPPER_AERODIGESTIVE_TRACT,YD8_UPPER_AERODIGESTIVE_TRACT,YH13_CENTRAL_NERVOUS_SYSTEM,YKG1_CENTRAL_NERVOUS_SYSTEM,ZR751_BREAST,ZR7530_BREAST
1,0.270077,0.012558,3.553860,0.016376,2.626260,0.110825,4.553720,0.008083,0.000000,5.322780,...,7.193730,0.017141,0.015510,0.100211,0.025264,0.176669,1.789680,1.738210,4.587100,3.219190
9,10.156100,5.014450,1.278110,1.110990,1.131650,1.720320,1.795090,1.413660,0.947946,0.824009,...,2.959090,2.062800,0.929703,1.282190,1.566750,1.214470,1.170670,1.224780,22.618300,3.073650
14,54.813000,41.643300,47.722600,49.475200,86.888700,77.752700,53.083300,57.776500,42.340600,55.718800,...,29.046400,76.661900,44.900500,62.869900,59.161200,38.332400,31.314200,58.665100,51.351600,57.549900
15,0.090211,0.016779,0.018262,0.075014,0.088340,0.063458,0.033174,0.093591,0.194073,0.065371,...,0.342984,0.389333,0.151970,0.276147,0.725726,0.021458,0.099286,0.142916,0.126442,0.011709
16,113.312000,171.691000,79.601600,24.907300,55.091300,84.870100,105.361000,88.410800,93.776100,70.009900,...,260.385000,61.556400,72.572600,72.534800,143.359000,99.557700,72.884700,67.631400,61.729100,41.425800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100874051,0.000000,0.000000,0.008967,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.588470,0.014057,0.000000,0.000000,0.000000,0.000000,0.020314,0.009747,0.000000,0.000000
100874054,0.993447,0.243456,0.056529,0.193498,0.683623,0.422869,0.410747,0.041784,0.187730,0.066562,...,0.030161,0.000000,0.053456,0.000000,0.750983,0.124542,0.032014,0.875548,2.215950,0.679572
100885848,0.645949,0.375564,0.415258,0.159909,0.492137,0.220450,0.645891,0.276245,0.657695,0.224920,...,0.299107,1.025240,0.098170,0.000000,0.000000,0.747145,0.293960,0.338509,0.158553,0.160855
100885850,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.431137,0.000000,0.000000,0.000000,0.000000
