In [1]:
import os
import urllib.request
import zipfile

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler
#import glob

In [2]:
# Directory holding the local copy of the E. coli compendium (user-specific location).
path = '~/Documents/uni/bioinfo/data/coli/'

In [3]:
# File name of the expression table inside the data directory.
fname = (
    'ecoli_exp_data_COLOMBOS.txt'
)

In [6]:
# Load the tab-separated expression table; '#' marks comments in the file.
df = pd.read_csv(
    path + fname,
    sep='\t',
    comment='#',
)

In [7]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,locustag,gene name,geneid/contrast_id,1,2,3,4,5,6,7,...,4441,4442,4443,4444,4445,4446,4447,4448,4449,4450
0,b0001,thrl,1,-0.47295,0.69807,0.092701,0.18179,0.026924,0.29329,0.14804,...,-0.24699,-0.083284,0.014055,-0.1668,-0.21535,0.029365,-0.12322,0.11169,0.20203,-0.032485
1,b0002,thra,2,-0.2655,1.5794,-0.038102,0.2357,-0.57493,-0.16614,0.027998,...,0.85379,1.5932,-0.77626,0.63566,1.6599,-12.508,0.22097,-12.942,-0.019066,0.7788
2,b0003,thrb,3,-0.2918,0.79956,-0.10811,-0.15724,-0.54296,-0.27575,0.32801,...,0.72517,0.68578,-0.76453,0.45752,1.4724,1.6704,0.003126,0.17728,-0.029748,0.69778
3,b0004,thrc,4,-0.031248,1.1768,-0.018437,0.45874,-0.12135,0.1148,-0.12736,...,0.82861,0.81249,-0.48505,0.56613,1.5155,1.1156,0.36996,0.11058,-0.073762,0.62392
4,b0005,yaax,5,0.10575,0.067098,0.19617,0.20015,-0.002578,0.23109,0.17282,...,0.033629,-0.44867,0.243,0.054078,-0.25889,-0.48283,-1.535,-1.826,0.10041,-0.33746


In [8]:
# Split the table: the first three columns are kept aside as annotation,
# everything after them is treated as the numeric matrix to denoise.
annot = df.iloc[:, :3]
data = df.iloc[:, 3:]

# Fill missing values with the per-column median, then rescale each column.
# RobustScaler must be imported in the imports cell for this to run on a
# fresh kernel.
preprocess = make_pipeline(
    SimpleImputer(strategy='median'),
    RobustScaler(),
)

scaled_data = preprocess.fit_transform(data)

# A float n_components asks PCA to keep enough components to explain that
# fraction of the variance. Fit once: fit_transform both fits the model and
# returns the projection, so a separate .fit() call is redundant work.
pca = PCA(0.8, random_state=42)
projected = pca.fit_transform(scaled_data)

# Map the truncated projection back to feature space. The scaler is not
# inverted, so the reconstruction stays in the scaled units.
reconstructed = pca.inverse_transform(projected)

# Reuse the original row index so a later column-wise concat with `annot`
# aligns row for row even if `df` does not have a default RangeIndex.
reconstructed_df = pd.DataFrame(
    reconstructed,
    index=data.index,
    columns=data.columns.to_list(),
)



In [16]:
# (rows, number of principal components kept by the PCA)
projected.shape

(4321, 269)

In [11]:
# Put the annotation columns back in front of the reconstructed values.
denoised_df = pd.concat((annot, reconstructed_df), axis='columns')

In [14]:
# Inspect the last rows of the combined annotation + reconstructed table.
denoised_df.tail()

Unnamed: 0,locustag,gene name,geneid/contrast_id,1,2,3,4,5,6,7,...,4441,4442,4443,4444,4445,4446,4447,4448,4449,4450
4316,b4702,mgtl,4701,-1.049055,-1.219444,-0.416038,-0.9045,-0.768559,-0.659163,0.571854,...,-0.565974,-0.780168,-0.073926,-0.49147,-0.575127,-0.143072,-0.409605,-0.05206,-0.356543,-0.354376
4317,b4703,pmrr,4699,0.407114,-0.06672,0.224369,-0.12527,0.017482,-0.100264,0.396176,...,0.146184,-0.403824,0.078008,0.142763,-0.225789,0.003277,-0.186449,-0.031627,0.045876,0.011985
4318,b4705,mnts,4670,-0.222992,0.514954,0.594634,0.763673,1.127944,1.01074,-0.575159,...,-0.730955,-0.570064,0.34531,-0.62553,-0.746653,-0.337484,-0.429035,-0.156321,-0.596378,0.014697
4319,b4706,irok,4687,0.047669,-0.038933,0.0266,-0.022646,0.032914,0.068793,0.007129,...,-0.0584,-0.032816,-0.022615,-0.037593,-0.017685,-0.084602,-0.086268,-0.036376,0.172799,-0.082507
4320,b4708,insi1,4665,-0.025681,-0.099536,-0.046278,-0.086917,-0.1017,-0.027586,-0.035073,...,-0.143663,-0.260362,0.166239,-0.119195,-0.131541,-0.326699,-0.424602,-0.382323,0.225034,-0.264579


In [15]:
# Write the denoised table to disk, leaving out the row index.
denoised_df.to_csv(
    '~/Downloads/denoised_coli.csv',
    index=False,
)

In [13]:
def download_and_preprocess_data(org, data_dir = None, variance_ratio = 0.8,
                                output_path = '~/Downloads/'):

    """
    Load a COLOMBOS expression compendium, denoise it with PCA and export it.

    The table is median-imputed, standardized, projected onto the principal
    components that explain `variance_ratio` of the variance and mapped back
    to feature space. The result (annotation columns + reconstructed values,
    still in standardized units because the scaler is not inverted) is
    written to `<output_path>/denoised_<org>.csv` and returned.

    Downloading uses only the standard library (urllib / zipfile), so no
    shell tools such as wget or unzip are required.

    Params
    -------

    org (str) : Organism to work with. Available datasets are E. coli (ecoli),
                B.subtilis (bsubt), P. aeruginosa (paeru), M. tb (mtube), etc.
                Source: http://colombos.net/cws_data/compendium_data/
                Used to build the download URL and the output file name.

    data_dir (str): Path to the expression file itself (directory + filename;
                    despite the name it is passed straight to pd.read_csv).
                    If None, the compendium is downloaded and unzipped into
                    the current working directory. The first 6 lines of the
                    file are skipped in both cases.

    variance_ratio (float): Fraction of the variance the retained principal
                            components must explain.

    output_path (str): Directory the denoised CSV is written to.

    Returns
    --------

    denoised_df (pd.DataFrame): annotation columns followed by the
                                reconstructed expression values.

    Raises
    ------

    FileNotFoundError: if the downloaded archive holds no expression-data file.

    """
    # Download the dataset only when no local file was supplied
    if data_dir is None:

        archive_name = org + '_compendium_data.zip'
        url = 'http://colombos.net/cws_data/compendium_data/' + archive_name

        urllib.request.urlretrieve(url, archive_name)

        with zipfile.ZipFile(archive_name) as archive:
            archive.extractall()
            # Find the expression table by name instead of hard-coding the
            # E. coli file, so other organisms work too.
            # NOTE(review): assumes every archive names it '*exprdata*.txt',
            # as the E. coli one does -- confirm for other organisms.
            expr_files = [member for member in archive.namelist()
                          if 'exprdata' in member and member.endswith('.txt')]

        if not expr_files:
            raise FileNotFoundError(
                'No expression data file found in ' + archive_name)

        df = pd.read_csv(expr_files[0], sep = '\t', skiprows = np.arange(6))

        # Lower case gene names (.str.lower leaves missing values untouched)
        df['Gene name'] = df['Gene name'].str.lower()

    else:

        df = pd.read_csv(data_dir, sep = '\t', skiprows = np.arange(6))

    # First three columns are annotation, the rest is the expression matrix
    annot = df.iloc[:, :3]
    data = df.iloc[:, 3:]

    preprocess = make_pipeline(SimpleImputer(strategy = 'median'),
                               StandardScaler())

    scaled_data = preprocess.fit_transform(data)

    # Fit the PCA once and project in the same step; a separate .fit() before
    # .fit_transform() would only repeat the decomposition.
    pca = PCA(variance_ratio, random_state = 42)
    projected = pca.fit_transform(scaled_data)

    # Reconstruct the dataset from the retained components only
    reconstructed = pca.inverse_transform(projected)

    # Save into a dataframe, keeping the row index so the concat below aligns
    reconstructed_df = pd.DataFrame(reconstructed, index = data.index,
                                    columns = data.columns.to_list())

    # Concatenate with annotation data
    denoised_df = pd.concat([annot, reconstructed_df], axis = 1)

    # Export dataset; os.path.join copes with output_path given with or
    # without a trailing separator.
    out_file = os.path.join(os.path.expanduser(output_path),
                            'denoised_' + org + '.csv')
    denoised_df.to_csv(out_file, index = False)

    return denoised_df

In [14]:
# Run the full pipeline on the local E. coli file.
# NOTE(review): this same file was loaded earlier with comment='#', while the
# function skips the first 6 lines instead -- confirm the file layout matches.
# The trailing semicolon keeps the cell from echoing any return value.
download_and_preprocess_data(org='ecoli', data_dir=path + fname);