In [1]:
import GEOparse
import pandas as pd

def download_experiment(experiment):
    GEOparse.get_GEO(geo=experiment, destdir="data/")

def read_experiment(experiment):
    return GEOparse.get_GEO(filepath="data/" + experiment + "_family.soft.gz")

def aggregate_samples(experiment):
    # download_experiment(experiment)
    print('reading experiment...')
    gse = read_experiment(experiment)
    aggregated_samples = pd.DataFrame()
    genes = pd.DataFrame()
    print('aggregating samples...')
    for sample in gse.gsms:
        name = gse.gsms[sample].metadata['title'][0]
        sample_df = gse.gsms[sample].table
        sample_df.columns = ['gene', name]
        genes = sample_df['gene']
        aggregated_samples = pd.concat([aggregated_samples, sample_df[name]], axis=1)    
    aggregated_samples = pd.concat([genes, aggregated_samples], axis=1)
    aggregated_samples.to_csv('data/' + experiment + '_full_matrix.csv') 
    print('aggregated samples written to csv file')

eichenberger = 'GSE67023'
# aggregate_samples(eichenberger)


In [27]:
from collections import Counter, defaultdict
from scipy.stats import multivariate_normal
from scipy import spatial
from sklearn.preprocessing import normalize
import numpy as np

print('reading data...')
df = pd.read_csv('data/eichenberger_full_matrix.csv').fillna(0)
gene_data = normalize(df.values[:, 2:].astype(float), axis=0, norm='l1')
sample_data = normalize(df.values[:, 2:].astype(float), axis=0, norm='l1').transpose
conditions = df.columns
i2g = df.values[:, 1]
g2i = {gene: i for i, gene in enumerate(i2g)}

def distribution_parameters(gene, genes):
    gene_differences = np.array([g-gene for g in genes])
    covariance = np.cov(gene_differences, rowvar=False)
    mean = np.mean(gene_differences, axis=0)
#     distribution = np.random.multivariate_normal(mean, covariance)
    return mean, covariance

def gene_similarity_multivariate(gene, mean, covariance):
    return multivariate_normal.pdf(gene, mean=mean, cov=covariance)

def gene_similarity_cosine(gene1, gene2):
    return spatial.distance.cosine(gene1, gene2)

# def similarities(similarity_function=gene_similarity_cosine):
    
def sequential_similarities(similarity_function=gene_similarity_cosine):
    print("calculating sequential similarity...")
    similarities = []
    for i in range(0, len(i2g)-1):       
        test_gene = cluster_data[i]
        test_gene2 = cluster_data[i+1]
        similarity = similarity_function(test_gene, test_gene2)
        similarities.append(similarity)
    return np.array(similarities)/np.max(similarities)

cosine_similarities = sequential_similarities()
multivariate_similarities = sequential_similarities[]


def threshold_operons_sequential(similarities, threshold=0.5):
    '''threshold a sequential list of similarities. 
    This means the spatial location of the genes is used
    '''
    cluster = 0
    threshold_operons = defaultdict(list)
    print('thresholding similarity into operons...')
    for i, sim in enumerate(similarities):
        if sim > threshold:
            threshold_operons[cluster].append(i2g[i])
        else:
            cluster += 1
            threshold_operons[cluster].append(i2g[i])
    print('done.')
    return threshold_operons
    
operons = threshold_operons_sequential(cosine_similarities)

reading data...
calculating sequential similarity...
thresholding similarity into operons...
done.


In [3]:
from sklearn.cluster import KMeans, AffinityPropagation
# AP = AffinityPropagation(damping = 0.5, max_iter = 250, affinity = 'euclidean').fit(cluster_data)
# clusters = AP.fit_predict(cluster_data)
Counter(clusters)
names_per_cluster = defaultdict(list)
for i, label in enumerate(clusters):
    names_per_cluster[label].append(i2g[i])
# names_per_cluster = {label: names.append(i2g[i]) for i, label in enumerate(clusters)}
# names_per_cluster
names_per_cluster





In [29]:
import csv
import json

def read_operons(path):
    operons_per_id = defaultdict(list)
    with open(path, 'r') as infile:
        reader = csv.reader(infile, delimiter='\t')
        header = next(reader)
        for line in reader:
            operon_id = line[0]
            gene_id = line[2]
            operons_per_id[operon_id].append(gene_id)
    
    operons_per_gene = defaultdict(list)
    for operon_list in operons_per_id.values():
        for gene in operon_list:
            operons_per_gene[gene] = operon_list
    return operons_per_gene

true_operons = json.load(open('data/operons.json'))

def validate_operon(gene, predicted_operon):
    '''
    Validates the operon (set) of a gene (id string)
    on recall and precision of the operon members
    using the gold standard operon dict
    '''
    try:
        true_operon = set(true_operons[gene])
    except:
#         print("Gene not found in gold standard set")
        return
#     print("This gene is part of the following (true) operon:", true_operon)
#     print("The predicted operon for this gene is:", set(predicted_operon))
    recall = len(true_operon.intersection(predicted_operon))/len(true_operon)
#     print("The recall of the predicted operon is:", recall)
    precision = len(true_operon.intersection(predicted_operon))/len(predicted_operon)
#     print("The precision of the predicted operon is:", precision)
    correct = int(set(predicted_operon) == true_operon)
    return recall, precision, correct
       
    
def validate_operons(dictionary):
    recall_scores = []
    precision_scores = []
    predictions = []
    print('validating predicted operons...')
    for cluster in dictionary.values():
        for gene in cluster:
            score = validate_operon(gene, cluster)
            if score:
                recall_scores.append(score[0])
                precision_scores.append(score[1])
                predictions.append(score[2])
            else:
                continue
    mean_recall = np.mean(recall_scores)
    mean_precision = np.mean(precision_scores)
    accuracy = np.mean(predictions)
    return mean_recall, mean_precision, accuracy

benchmark = {gene: [gene] for gene in g2i}

r, p, a = validate_operons(benchmark)
print("The mean recall of the benchmark is:", r)
print("The mean precision of the benchmark is:", p)
print("The accuracy of the benchmark is:", a)
r, p, a = validate_operons(operons)

print("The mean recall of the assigned operons is:", r)
print("The mean precision of the assigned operons is:", p)
print("The accuracy of the assigned operons is:", a)


# def get_operon(gene):
    



validating predicted operons...
The mean recall of the benchmark is: 0.6034474017743979
The mean precision of the benchmark is: 1.0
The accuracy of the benchmark is: 0.40811153358681873
validating predicted operons...
The mean recall of the assigned operons is: 0.6938243744707624
The mean precision of the assigned operons is: 0.7299915504858471
The accuracy of the assigned operons is: 0.17465145754119138
