In [2]:
import os
import numpy as np
from sklearn.naive_bayes import GaussianNB

In [3]:
import pandas as pd
#directory that holds the input files (Sample_Columns.txt, Sample_to_Tissue.txt and the GTEx .gct file)
#set the GTEX_DATA_DIR environment variable to run on another machine;
#the original hard-coded location is kept as the default so existing runs behave the same
ROOT_PATH = os.environ.get("GTEX_DATA_DIR", "C:\\Users\\micha\\Downloads\\")

In [4]:
#num_of_genes: number of data rows read from the tpm file (one feature per row)
#num_of_samples: number of sample columns, counted from the 3rd column, split between train and test
num_of_genes, num_of_samples = 500, 2000

In [5]:
#order of samples in this file defines sample id
sample_columns = os.path.join(ROOT_PATH, "Sample_Columns.txt")
sample_names = []
#the with-block closes the file even if reading fails part-way
with open(sample_columns, 'r') as sample_file:
    for line in sample_file:
        #every tab-separated field of every line becomes one entry;
        #rstrip removes the trailing newline/whitespace from the field
        sample_names.extend(part.rstrip() for part in line.split('\t'))

#remove Name and Description entries (the first two entries) from sample_names
#slice deletion does not raise when the file yields fewer than two entries
del sample_names[:2]

#keep track of sample ids in dictionary for later reference
#local sample ids are sequential starting with 1; if a name repeats, the last id wins
sample_to_id = {name: sample_id for sample_id, name in enumerate(sample_names, start=1)}

In [7]:
#connect sample id to tissue id
#sample_to_tissue: sample id -> tissue id
#tissue_to_id:     tissue name -> tissue id
#tissues:          tissue names in order of first appearance (tissues[k] has tissue id k + 1)
sample_to_tissue = {}
tissue_to_id = {}
tissues = []
sample_tissue_path = os.path.join(ROOT_PATH, "Sample_to_Tissue.txt")
tissue_id = 1
#the with-block closes the file even if a row cannot be parsed
with open(sample_tissue_path, 'r') as sample_tissue_file:
    for line in sample_tissue_file:
        #first tab-separated field is the sample name, second is the tissue name
        parts = line.split('\t')
        sample_name = parts[0].rstrip()
        #rows for samples that are not in sample_to_id are ignored
        if sample_name in sample_to_id:
            sample_id = sample_to_id[sample_name]
            #commas in the tissue name are replaced with spaces
            tissue = parts[1].rstrip().replace(',', ' ')
            #tissue ids are based on order discovered in Sample_to_Tissue.txt
            if tissue not in tissue_to_id:
                tissue_to_id[tissue] = tissue_id
                tissues.append(tissue)
                tissue_id = tissue_id + 1
            sample_to_tissue[sample_id] = tissue_to_id[tissue]

In [8]:
#the main data file (original author's note: approximately 56K transcripts and 15K genes)
tpm_file_name = "GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm.gct"
tpm_data = os.path.join(ROOT_PATH, tpm_file_name)
#the first two lines of the file are skipped, so the 3rd line supplies the column names;
#only the first num_of_genes data rows are read; sample values start in the 3rd column
reader = pd.read_table(
    tpm_data,
    sep="\t",
    skiprows=2,
    nrows=num_of_genes,
)

In [9]:
#tpm values start in 3rd column of reader for sample 1 (column position 2),
#so the sample id of a column is its position minus 1
#training data set consists of samples with odd-numbered sample ids,
#i.e. the even column positions 2, 4, 6, ... below 2 + num_of_samples
train_columns = list(range(2, 2 + num_of_samples, 2))
#tissue id (class label) of every training sample, in the same order as the rows of train_data;
#a sample id that is missing from sample_to_tissue raises KeyError
train_target_list = [sample_to_tissue[column - 1] for column in train_columns]
#train_data is an ndarray with one row per sample and one column per gene row of reader
#iloc selects all training columns by position in a single step (no per-sample iterrows pass,
#no repeated vstack); the transpose turns sample columns into sample rows
#values are rounded to whole numbers, as floats
train_data = np.round(reader.iloc[:, train_columns].to_numpy(dtype=float).T, 0)

In [10]:
#put training targets/classes (tissue ids) into ndarray, one entry per row of train_data
train_targets = np.asarray(train_target_list)

In [11]:
#test data set consists of samples with even-numbered ids starting with sample id = 2
#sample 1 sits at column position 2 of reader, so the sample id of a column is its position minus 1
#and the test samples are the odd column positions 3, 5, 7, ... below 2 + num_of_samples
test_columns = list(range(3, 2 + num_of_samples, 2))
#tissue id (class label) of every test sample, in the same order as the rows of test_data;
#a sample id that is missing from sample_to_tissue raises KeyError
test_target_list = [sample_to_tissue[column - 1] for column in test_columns]
#test_data is an ndarray with one row per sample and one column per gene row of reader
#iloc selects all test columns by position in a single step (no per-sample iterrows pass,
#no repeated vstack); the transpose turns sample columns into sample rows
#values are rounded to whole numbers, as floats
test_data = np.round(reader.iloc[:, test_columns].to_numpy(dtype=float).T, 0)

In [12]:
#put test targets/classes (tissue ids) into ndarray, one entry per row of test_data
test_targets = np.asarray(test_target_list)

In [13]:
#Gaussian naive Bayes classifier, constructed with no arguments (library defaults)
gnb = GaussianNB()

In [14]:
#fit the classifier on the training samples and their tissue ids
gnb.fit(train_data, train_targets)
#predicted tissue id for every row of test_data
y_pred = gnb.predict(test_data)

In [15]:
#number of test samples (rows of test_data)
total_points = test_data.shape[0]
#number of test samples whose predicted tissue id differs from the true tissue id
mislabeled_points = (test_targets != y_pred).sum()
print("Number of mislabeled points out of a total of %d points : %d" % (total_points, mislabeled_points))

Number of mislabeled points out of a total of 1000 points : 118


In [None]:
#TODO: get the distribution of tissue types in the train and test data sets
#TODO: get the distribution of tissue types for the mislabeled points