In [1]:
import GEOparse
import pandas as pd
import csv, json

def download_experiment(experiment):
    '''
    Ask GEOparse for the GEO record `experiment`, using "data/" as the
    destination directory. The object returned by GEOparse is discarded;
    the function itself returns None.
    '''
    target_dir = "data/"
    GEOparse.get_GEO(geo=experiment, destdir=target_dir)

def process_experiment(experiment):
    '''
    Parse "data/<experiment>_family.soft.gz" with GEOparse and return
    whatever GEOparse.get_GEO produces for that file.
    '''
    soft_path = "".join(["data/", experiment, "_family.soft.gz"])
    return GEOparse.get_GEO(filepath=soft_path)

def aggregate_samples(experiment, download=True):
    '''
    Wrapper function to download and process a given experiment ID,
    aggregate the samples and write them to a csv file as a matrix.

    Parameters
    ----------
    experiment : str
        Experiment ID; used to build the input and output file names.
    download : bool, default True
        When True, call download_experiment first; otherwise the SOFT file
        is expected to be present under "data/" already.

    Side effects
    ------------
    Writes "data/<experiment>_full_matrix.csv": a 'gene' column followed by
    one column per sample, each named after the sample's first 'title'
    metadata entry. Returns None.
    '''
    if download:
        download_experiment(experiment)
    print('process experiment...')
    gse = process_experiment(experiment)
    print('aggregating samples...')
    # Stays an empty frame when the experiment has no samples, so the final
    # concat still has something to work with.
    genes = pd.DataFrame()
    sample_columns = []
    for sample in gse.gsms:
        gsm = gse.gsms[sample]
        name = gsm.metadata['title'][0]
        sample_df = gsm.table
        # Renames the table's columns in place; raises if the table does not
        # have exactly two columns.
        sample_df.columns = ['gene', name]
        # The gene column of the last sample is the one that ends up in the
        # output (assumes every sample lists the same genes in the same
        # order -- TODO confirm).
        genes = sample_df['gene']
        sample_columns.append(sample_df[name])
    # Concatenate once instead of growing the frame inside the loop, which
    # copied every previously added column on each iteration.
    aggregated_samples = pd.concat([genes] + sample_columns, axis=1)
    aggregated_samples.to_csv('data/' + experiment + '_full_matrix.csv')
    print('aggregated samples written to csv file')

In [2]:
from collections import Counter, defaultdict
from scipy.stats import multivariate_normal
from sklearn.metrics import adjusted_rand_score
import numpy as np
import random

def distribution_parameters(gene, genes):
    '''
    Subtract `gene` from every entry of `genes` and summarise the resulting
    difference vectors.

    Returns a (mean, covariance) pair: the mean over the difference vectors
    (axis 0) and their covariance with each difference vector treated as
    one observation (rowvar=False).
    '''
    offsets = np.array([other - gene for other in genes])
    centre = np.mean(offsets, axis=0)
    spread = np.cov(offsets, rowvar=False)
    return centre, spread

def score_multivariate(x, mean, covariance):
    '''
    Return one minus the multivariate normal density at `x` for the
    distribution described by `mean` and `covariance`.
    '''
    density = multivariate_normal.pdf(x, mean=mean, cov=covariance)
    return 1 - density

def score_cosine(x, y):
    '''
    Cosine similarity of `x` and `y`: their dot product divided by the
    product of their norms.

    When either input has zero norm the similarity is undefined; 0.0 is
    returned in that case instead of dividing by zero (which used to yield
    NaN plus a RuntimeWarning, and NaN sorts last in np.argsort, i.e. it
    would rank as the *most* similar entry).
    '''
    denominator = np.linalg.norm(x) * np.linalg.norm(y)
    if denominator == 0:
        return 0.0
    return np.dot(x, np.transpose(y)) / denominator
    
def sequential_similarities(data, similarity_function=score_cosine):
    '''
    Score every entry of `data` against the entry directly before it.

    The returned list has one value per entry; the first value is the
    constant 1 because the first entry has no predecessor.
    '''
    print("calculating sequential similarity...")
    scores = [1]
    for position in range(1, len(data)):
        scores.append(similarity_function(data[position - 1], data[position]))
    return scores

def similarities(data, similarity_function=score_cosine):
    '''
    Build the full pairwise score table for `data`: entry [a][b] of the
    returned list of lists is similarity_function(data[a], data[b]).
    '''
    print("calculating all similarities...")
    return [[similarity_function(row, other) for other in data] for row in data]

def most_similar_samples(sample, sample_similarities, n=5):
    '''
    Return the names of the `n` samples with the highest scores in the row
    of `sample_similarities` belonging to `sample`, best first.

    Relies on the notebook-level lookups `s2i` (sample name -> index) and
    `i2s` (index -> sample name).
    '''
    print('Finding most similar samples for', sample + '...')
    row = sample_similarities[s2i[sample]]
    ascending = np.argsort(row)
    # Last n positions of the ascending order, flipped so the top score leads.
    top_indices = ascending[-n:][::-1]
    return [i2s[index] for index in top_indices]

def threshold_operons_sequential(similarities, threshold=0.6):
    '''threshold a sequential list of similarities.
    This means the spatial location of the genes is used

    Walks the list in order and assigns each position a cluster label.
    Position 0 always gets label 0; every later position keeps the current
    label when its similarity is strictly greater than `threshold` and
    starts a new label (previous + 1) otherwise.

    Returns a list of integer labels with the same length as
    `similarities`. The labels depend only on `similarities`, so no
    notebook-level gene lookup is needed.
    '''
    print('thresholding similarity into operons...')
    labels = []
    cluster = 0
    for position, sim in enumerate(similarities):
        # `not sim > threshold` (rather than `sim <= threshold`) so that NaN
        # similarities also open a new cluster.
        if position > 0 and not sim > threshold:
            cluster += 1
        labels.append(cluster)
    print('done.')
    return labels

def read_gold_operons(path, genes=None):
    '''
    Read a tab-separated operon file and return one operon label per gene.

    The first row of the file is skipped as a header. For every following
    row, column 0 is taken as the operon id and column 2 as the gene id.
    Operons are numbered 0, 1, 2, ... in order of first appearance. A gene
    listed under several operons keeps the label of the last one.

    Parameters
    ----------
    path : str
        Location of the tab-separated file.
    genes : sequence of gene ids, optional
        Order in which labels are returned. Defaults to the notebook-level
        `i2g` lookup.

    Returns
    -------
    list of int, aligned with `genes`.

    Raises
    ------
    KeyError
        If a requested gene does not occur in the file.
    '''
    if genes is None:
        genes = i2g
    operons_per_id = defaultdict(list)
    # newline='' is what the csv module expects for files it reads.
    with open(path, 'r', newline='') as infile:
        reader = csv.reader(infile, delimiter='\t')
        next(reader, None)  # drop the header row; tolerate an empty file
        for line in reader:
            if len(line) < 3:
                # Blank or truncated row: there is no gene column to read.
                continue
            operons_per_id[line[0]].append(line[2])
    labels_dict = {gene: label
                   for label, operon in enumerate(operons_per_id.values())
                   for gene in operon}
    return [labels_dict[gene] for gene in genes]

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression
from sklearn.model_selection import KFold
                     
def is_operon_pair(gold_labels, gene1, gene2):
    '''
    Return 1 when the entries of `gold_labels` at positions `gene1` and
    `gene2` compare equal, otherwise 0.
    '''
    same_operon = gold_labels[gene1] == gold_labels[gene2]
    return int(same_operon)

def select_features(X, y, f=50):
    '''
    Keep the `f` best-scoring columns of `X` according to SelectKBest with
    the f_regression score function.

    Returns a pair: `X` reduced to the selected columns, and the names of
    those columns looked up in the notebook-level `i2s` list.
    '''
    selector = SelectKBest(f_regression, k=f)
    selector.fit(X, y)
    mask = selector.get_support()
    chosen_features = [i2s[column] for column, keep in enumerate(mask) if keep]
    reduced = selector.transform(X)
    return reduced, chosen_features

def k_fold(X, y, k=5, random_state=None):
    '''
    Estimate the accuracy of a default LogisticRegression with shuffled
    k-fold cross-validation.

    Parameters
    ----------
    X, y : arrays indexable by an integer index array
        Feature matrix and target vector.
    k : int, default 5
        Number of folds.
    random_state : int or None, default None
        Seed forwarded to KFold. None keeps the previous behaviour (a
        different shuffle on every call); pass an int for reproducible folds.

    Returns
    -------
    list of float: the test accuracy of each fold, in fold order.
    '''
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    results = []
    for train_index, test_index in kf.split(X):
        logreg = LogisticRegression()
        logreg.fit(X[train_index], y[train_index])
        predictions = logreg.predict(X[test_index])
        # Fraction of test rows whose prediction equals the true label.
        results.append(np.mean(predictions == y[test_index]))
    return results

### Reading the data

In [4]:
# aggregate_samples('GSE67023', download=False)
# Number of leading columns that are not sample measurements (presumably the
# saved row index plus the 'gene' column -- TODO confirm against the CSV).
offset = 2
df = pd.read_csv('data/eichenberger_full_matrix.csv').fillna(0)
# Use a context manager so the JSON file handle is closed right away.
with open('data/gold_gene_names.json') as gold_file:
    gold_gene_names = json.load(gold_file)
# Keep only the rows whose gene appears in the gold-standard name list.
df = df[df['gene'].isin(set(gold_gene_names))]
gene_data = df.values[:, offset:].astype(float)
sample_data = df.values[:, offset:].astype(float).transpose()

i2s = list(df)[offset:] # index to sample name
s2i = {sample: i for i, sample in enumerate(i2s)} # sample name to index

i2g = df.values[:, 1] # index to gene name
g2i = {gene: i for i, gene in enumerate(i2g)} # gene name to index

gold_operons = read_gold_operons('data/1240.opr')

### Rand index for similarities

In [None]:


## Similarities
sequential_gene_similarities_cosine = sequential_similarities(gene_data)
sample_similarities_cosine = similarities(sample_data)

## Similar samples
# randrange excludes its upper bound; random.randint(0, len(i2s)) could return
# len(i2s) itself and index one past the end of the list.
random_sample = i2s[random.randrange(len(i2s))]
print(most_similar_samples(random_sample, sample_similarities_cosine))

## Predictions
# Benchmark: every gene is placed in its own single-gene operon.
benchmark_operons = [i for i, gene in enumerate(i2g)]
predicted_operons = threshold_operons_sequential(sequential_gene_similarities_cosine)

## Validation
print('The adjusted Rand score of the benchmark is:',
      adjusted_rand_score(gold_operons, benchmark_operons))

print('The adjusted Rand score of the predicted operons is:',
      adjusted_rand_score(gold_operons, predicted_operons))


calculating sequential similarity...
calculating all similarities...
Finding most similar samples for Germ1_T90min...
['Germ1_T90min', 'Com1_T+0.5', 'Vanco1_T15', 'SigMT30_2', 'RemA1_T30']
thresholding similarity into operons...
done.
The adjusted Rand score of the benchmark is: 0.0
The adjusted Rand score of the predicted operons is: 0.7016422789112293


### Pairwise logistic regression

In [None]:
from sklearn.preprocessing import normalize
import json

# One row per pair of neighbouring genes (i, i+1). A sequence of N genes has
# N-1 such pairs; the previous bound of N-2 silently dropped the last pair.
n_pairs = len(i2g) - 1
sequential_target_names = [i2g[i]+'-'+i2g[i+1] for i in range(n_pairs)]
# Target: 1 when both genes of the pair carry the same gold operon label.
sequential_target_values = np.array([is_operon_pair(gold_operons, i, i+1)
                                     for i in range(n_pairs)])
# Features: per-sample squared difference between the two genes of the pair.
sequential_data = np.array([np.square(gene_data[i]-gene_data[i+1])
                            for i in range(n_pairs)])
# NOTE(review): features are selected on the full data set before the
# cross-validation split, so the "selected" score may be optimistic.
X, features = select_features(sequential_data, sequential_target_values, f=50)
results_all = k_fold(sequential_data, sequential_target_values, k=10)
results_selected = k_fold(X, sequential_target_values, k=10)
print("mean_accuracy for using all data:", np.mean(results_all))
print("results for selected data:", np.mean(results_selected))