In [1]:
import GEOparse
import pandas as pd

def download_experiment(experiment):
    GEOparse.get_GEO(geo=experiment, destdir="data/")

def process_experiment(experiment):
    return GEOparse.get_GEO(filepath="data/" + experiment + "_family.soft.gz")

def aggregate_samples(experiment, download=True):  
    '''
    Wrapper function to download and process a given experiment ID,
    aggregate the samples and write them to a csv file as a matrix.
    '''
    if download:
        download_experiment(experiment)
    print('process experiment...')
    gse = process_experiment(experiment)
    aggregated_samples = pd.DataFrame()
    genes = pd.DataFrame()
    print('aggregating samples...')
    for sample in gse.gsms:
        name = gse.gsms[sample].metadata['title'][0]
        sample_df = gse.gsms[sample].table
        sample_df.columns = ['gene', name]
        genes = sample_df['gene']
        aggregated_samples = pd.concat([aggregated_samples, sample_df[name]], axis=1)    
    aggregated_samples = pd.concat([genes, aggregated_samples], axis=1)
    aggregated_samples.to_csv('data/' + experiment + '_full_matrix.csv') 
    print('aggregated samples written to csv file')

In [2]:
from collections import Counter, defaultdict
from scipy.stats import multivariate_normal
from sklearn.preprocessing import normalize
import numpy as np
import random

print('reading data matrix...')
df = pd.read_csv('data/eichenberger_full_matrix.csv').fillna(0)
gene_data = normalize(df.values[:4177, 2:].astype(float), axis=0, norm='l1')
sample_data = normalize(df.values[:, 2:].astype(float), axis=0, norm='l1').transpose()

i2s = list(df)[2:] # index to sample name
s2i = {sample: i for i, sample in enumerate(i2s)} # sample name to index

i2g = df.values[:4177, 1] # index to gene name
g2i = {gene: i for i, gene in enumerate(i2g)} # gene name to index

def distribution_parameters(gene, genes):
    gene_differences = np.array([g-gene for g in genes])
    covariance = np.cov(gene_differences, rowvar=False)
    mean = np.mean(gene_differences, axis=0)
    return mean, covariance

def score_multivariate(x, mean, covariance):
    return 1-multivariate_normal.pdf(x, mean=mean, cov=covariance)

def score_cosine(x, y):
    return (np.dot(x, np.transpose(y))/(np.linalg.norm(x)*np.linalg.norm(y)))

# def sample_similarities(similarity_function=gene_similarity_cosine):
    
def sequential_similarities(data, similarity_function=score_cosine):
    print("calculating sequential similarity...")
    similarities = [similarity_function(data[i], data[i+1]) for i in range(0, len(data)-1)]
    return similarities

def similarities(data, similarity_function=score_cosine):
    print("calculating all similarities...")
    similarities = []
    for x in data:       
        similarities.append([similarity_function(x, y) for y in data])
    return similarities

gene_similarities = sequential_similarities(gene_data)
# multivariate_similarities = sequential_similarities[]
sample_similarities_cosine = similarities(sample_data)

def most_similar_samples(sample, sample_similarities, n=5):
    print('Finding most similar samples for', sample + '...')
    s = sample_similarities[s2i[sample]]
    best = reversed(np.argsort(s)[-n:])
    return [i2s[b] for b in best]

random_sample = i2s[random.randint(0, len(i2s))]
print(most_similar_samples(random_sample, sample_similarities_cosine))

def threshold_operons_sequential(similarities, threshold=0.6):
    '''threshold a sequential list of similarities. 
    This means the spatial location of the genes is used
    '''
    cluster = 0
    threshold_operons = defaultdict(list)
    print('thresholding similarity into operons...')
    for i, sim in enumerate(similarities):
        if i == 0:
            threshold_operons[cluster].append(i2g[i])
            continue
        if sim > threshold:
            threshold_operons[cluster].append(i2g[i])
        else:
            cluster += 1
            threshold_operons[cluster].append(i2g[i])
    labels = [label for label, operon in threshold_operons.items() for _ in operon]
    print('done.')
    return labels
    
predicted_operons = threshold_operons_sequential(gene_similarities)

reading data matrix...
calculating sequential similarity...
calculating all similarities...
Finding most similar samples for Vanco1_T5...
['Vanco1_T5', 'SigD2_T0', 'RemA1_T15', 'RemA1_T5', 'Com2_T0']
thresholding similarity into operons...
done.


In [3]:
import csv
import json
from sklearn.metrics import adjusted_rand_score

def gold_operons(path):
    operons_per_id = defaultdict(list)
    with open(path, 'r') as infile:
        reader = csv.reader(infile, delimiter='\t')
        header = next(reader)
        for line in reader:
            operon_id = line[0]
            gene_id = line[2]
            operons_per_id[operon_id].append(gene_id)
    labels = [label for label, operon in enumerate(operons_per_id.values()) for _ in operon]
    return labels


gold_operons = gold_operons('data/1240.opr')

benchmark_prediction = [i for i, gene in enumerate(i2g)][:-1]


print('The adjusted Rand score of the benchmark is:',
      adjusted_rand_score(gold_operons, benchmark_prediction))

print('The adjusted Rand score of the predicted operons is:',
      adjusted_rand_score(gold_operons, predicted_operons))
    



The adjusted Rand score of the benchmark is: 0.0
The adjusted Rand score of the predicted operons is: 0.19767824120251337
