Let's test and see if our expect value approach of interpreting the similarity scores makes sense and works on a real data set.
In this real data set, we know the genes that were involved in anatomical changes in certain species and also know what the anatomical changes are.

## 1. First, we have to simulate a random data set based on the real data set that we have.

In [1]:
import pandas as pd
import numpy as np
from random import randint

In [13]:
genes = pd.read_csv('../../data/real_data_experiment/input_files/Annotations_KBGenes2016.txt',\
                    sep='\t', header=None, \
                    names=['geneID', 'uberon_val', 'pato_val'])
taxon = pd.read_csv('../../data/real_data_experiment/input_files/Swartz_phenotypic_profiles.txt',\
                           sep='\t', header=0)

In [12]:
print(len(genes.gene.unique()))

14706


For both gene and taxon data we will have to generate random.
For gene data, generate random data for uberon and pato columns.
For taxon data, generate random data for Entity ID, Quality ID and
Related Entity ID columns.

In [14]:
uberon = genes['uberon_val'].copy(deep=True)
pato = genes['pato_val'].copy(deep=True)
entity_id = taxon['Entity ID'].copy(deep=True)
quality_id = taxon['Quality ID'].copy(deep=True)
related_entity_id = taxon['Related Entity ID'].copy(deep=True)

In [15]:
uberon.dropna(axis=0, inplace=True)
uberon.reset_index(inplace=True, drop=True)
pato.dropna(axis=0, inplace=True)
pato.reset_index(inplace=True, drop=True)
entity_id.dropna(axis=0, inplace=True)
entity_id.reset_index(inplace=True, drop=True)
quality_id.dropna(axis=0, inplace=True)
quality_id.reset_index(inplace=True, drop=True)
related_entity_id.dropna(axis=0, inplace=True)
related_entity_id.reset_index(inplace=True, drop=True)

In [16]:
iter = 0
for i in genes.index:
    if pd.isnull(genes.at[i, 'uberon_val']) == False:        
        j = randint(0, len(uberon)-1)
        genes.at[i, 'uberon_val'] = uberon.at[j]
    if pd.isnull(genes.at[i, 'pato_val']) == False:        
        t = randint(0, len(pato)-1)
        genes.at[i, 'pato_val'] = pato.at[t]
    iter+=1

In [17]:
for i in taxon.index:
    if pd.isnull(taxon.at[i, 'Entity ID']) == False:
        j = randint(0, len(entity_id)-1)
        taxon.at[i, 'Entity ID'] = entity_id.at[j]
    if pd.isnull(taxon.at[i, 'Quality ID']) == False:
        t = randint(0, len(quality_id)-1)
        taxon.at[i, 'Quality ID'] = quality_id.at[t]
    if pd.isnull(taxon.at[i, 'Related Entity ID']) == False:
        z = randint(0, len(related_entity_id)-1)
        taxon.at[i, 'Related Entity ID'] = related_entity_id.at[z]

In [18]:
print(len(genes))
print(iter)
print(len(genes.gene.unique()))
# print(genes.head())
# print(taxon.head())

1056681
1056681


AttributeError: 'DataFrame' object has no attribute 'gene'

In [19]:
genes.to_csv("../../data/real_data_experiment/input_files/Annotations_KBGenes2016_random.txt", index=False, 
              sep="\t")
taxon.to_csv("../../data/real_data_experiment/input_files/Swartz_phenotypic_profiles_random.txt", index=False, 
              sep="\t")

## 2. Obtain the profile sizes for genes and phenotypes. Then create a file with gene ids, profile sizes of genes, taxons, profile sizes of taxons, sim score, evidence.

In [34]:
phenotype_pr_sizes = []
gene_pr_sizes = []
phenotype_pr_sizes_dict = dict()
gene_pr_sizes_dict = dict()
taxon_random = pd.read_csv('../../data/real_data_experiment/input_files/Swartz_phenotypic_profiles_random.txt',\
                           sep='\t', header=0)
genes_random = pd.read_csv('../../data/real_data_experiment/input_files/Annotations_KBGenes2016_random.txt',\
                    sep='\t', header=0)
genes_random.dropna(axis=0, inplace=True, subset=['uberon_val'])
scores = pd.read_csv('../../data/real_data_experiment/experiment/results/E_bp_sym_pic_resnik_Results_random.tsv',\
                           sep='\t', header=0)
for i in scores.index:
    taxon = scores.at[i, 'Character Number']
    gene = scores.at[i, 'GeneID']
#     print(taxon)    
    if taxon not in phenotype_pr_sizes_dict:
        taxon_size = taxon_random[taxon_random['Character']==taxon].Character.count()
        phenotype_pr_sizes_dict[taxon] = taxon_size
    else:
        taxon_size = phenotype_pr_sizes_dict[taxon]
    if gene not in gene_pr_sizes_dict:
        gene_size = genes_random[genes_random['geneID']==gene].geneID.count()
        gene_pr_sizes_dict[gene] = gene_size
    else:
        gene_size = gene_pr_sizes_dict[gene]    
#     print(taxon_size)
    phenotype_pr_sizes.append(taxon_size)
    gene_pr_sizes.append(gene_size)

In [33]:
print(phenotype_pr_sizes)

[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,

In [37]:
results = scores.copy(deep=True)
results['taxon_size'] = phenotype_pr_sizes
results['gene_size'] = gene_pr_sizes
print(len(results))

88110


In [39]:
results.to_csv("../../data/real_data_experiment/experiment/results/random_scores_sizes.tsv", index=False, 
              sep="\t")

## 3. Eliminate all the genes that have profile size zero.
This is needed in order to apply later on box cox transformation, which accepts only positive, non zero inputs.

In [2]:
input = pd.read_csv('../../data/real_data_experiment/experiment/results/random_scores_sizes.tsv',
                   header=0, sep='\t')

input_filtered = input[input.gene_size != 0]
input_filtered = input_filtered[input_filtered['Median nIC'] != 0]
input_filtered.to_csv("../../data/real_data_experiment/experiment/results/random_scores_sizes_filtered.tsv", index=False, 
              sep="\t")