# Import libraries

In [1]:
import pandas as pd
import numpy as np
import os

# Feature Correspondence

In [2]:
fcorr = pd.read_csv('../1_preprocessing/featureCorrespondence.csv')

fcorr.rename(columns={"Unnamed: 0":'Features'}, inplace=True)

features = fcorr[['Features']]

fcorr['Features'] = features

fcorr.set_index('Features', inplace=True)

print(fcorr.shape)

fcorr.head()

(2213, 4)


Unnamed: 0_level_0,Control-T1,Control-TF,Inoculum-T1,Inoculum-TF
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FT0001,184778.727451,528966.910945,1142983.0,2431455.0
FT0002,137987.282315,272723.765382,55816.81,420089.2
FT0003,3513.988896,7862.624275,9350.261,50049.5
FT0004,68978.419273,82935.024448,50220.64,624460.3
FT0005,175146.487757,10430.641971,201464.8,5875561.0


# Only MS2

`featureCorrespondence.csv` all the features (== MS1), but all of them had an MS2.

Normally, I'll do this step after the MS2 annotation, but for this tutorial, we'll only work with the features that have MS2.

In [3]:
list_ms2 = open('../1_preprocessing/ms2spectra_consensus.mgf', 'r').read().split('\n')
list_ms2 = [x.split('=')[-1] for x in list_ms2 if x.startswith('TITLE')]
print(len(list_ms2))
list_ms2[:5]

186


['FT0002', 'FT0005', 'FT0044', 'FT0072', 'FT0076']

## Filter 

In [9]:
fcorr_ms2 = fcorr.copy()

fcorr_ms2 = fcorr_ms2.reset_index()

fcorr_ms2 = fcorr_ms2[fcorr_ms2['Features'].isin(list_ms2)]

fcorr_ms2 = fcorr_ms2.set_index('Features')

fcorr_ms2 = fcorr_ms2.replace(np.nan, 0)

## log transform
fcorr_ms2 = np.log10(fcorr_ms2+1)

print(fcorr_ms2.shape)

## filter rows that have variance == 0; will cause issues in the PCA
fcorr_ms2 = fcorr_ms2[fcorr_ms2.var(axis=1).astype(int) > 0.0]

print(fcorr_ms2.shape)

fcorr_ms2.to_csv('featureCorrespondence_MS2.csv')

fcorr_ms2.head()

(186, 4)
(88, 4)


Unnamed: 0_level_0,Control-T1,Control-TF,Inoculum-T1,Inoculum-TF
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FT0005,5.243404,4.018353,5.304201,6.769049
FT0072,3.36421,0.0,0.0,5.757312
FT0544,3.998755,3.555098,6.93044,3.779091
FT0772,5.995259,4.04393,5.790801,3.521604
FT0822,4.674981,4.608105,4.823483,6.937902


In [10]:
fcorr_ms2.loc['FT2069', :]

Control-T1     0.000000
Control-TF     0.000000
Inoculum-T1    6.425827
Inoculum-TF    6.256760
Name: FT2069, dtype: float64