In [85]:
import pandas as pd
import os
import numpy as np

In [107]:
def write_input_files(txt_file, index_col, gene_col, out_dir=""):
    print ("--------------------------------------------------")
    print (f"Formatting txt file: {txt_file}")
    # read df
    protein_df_annot = pd.read_csv(txt_file, sep="\t")

    # Identify annotation cols
    annotation_cols = [col for col in protein_df_annot.columns if ":" in col]


    # Get protein-gene map
    protgroup_gene_map = protein_df_annot.set_index(index_col)[gene_col].to_dict()

    # format protein df
    print (f"Using the following column as index:")
    print (f"\t{index_col}")
    print ()

    drop_cols = [col for col in annotation_cols if col != index_col]
    print (f"Dropping the following cols:")
    for col in drop_cols:
        print (f"\t{col}")
    print ()

    protein_df = protein_df_annot.set_index(index_col).drop(columns =drop_cols).dropna()
    # COunt and report nans
    nan_count = protein_df.isna().sum().sum()
    print(f"Number of NaNs in protein_df: {nan_count}")

    # Log transform protein df
    protein_df = np.log2(protein_df + 1)

    # Proteins as columns
    protein_df =protein_df.rename_axis("Sample").T

    # COunt and report nans
    nan_count = protein_df.isna().sum().sum()
    print(f"Number of NaNs in protein_df: {nan_count}")

    if out_dir != "":
        os.makedirs(out_dir, exist_ok=True)

    # save protein_df
    base_name = txt_file.split(".")[0]
    out_file = f"{out_dir}/protein_df_{base_name}.csv"
    protein_df.to_csv(out_file)
    print (f"df saved to {out_file}")

    # save protgroup gene map
    out_file = f"{out_dir}/protgroup_gene_map_{base_name}.json"
    import json
    with open(out_file, 'w') as f:
        json.dump(protgroup_gene_map, f, indent=2)
    print (f"map saved to {out_file}")
    print ()


In [108]:
txt_files = [
 'Imputed_MatrixExport_NP_A.txt',
 'Imputed_MatrixExport_NP_B.txt',
]

index_col = "T: T: Protein.Group"
gene_col = "T: T: Genes"
out_dir = "input_files"

for txt_file in txt_files:
    write_input_files(txt_file, index_col, gene_col, out_dir)

--------------------------------------------------
Formatting txt file: Imputed_MatrixExport_NP_A.txt
Using the following column as index:
	T: T: Protein.Group

Dropping the following cols:
	N: Number Of Imputations
	T: T: Protein.Ids
	T: T: Protein.Names
	T: T: Genes
	T: T: First.Protein.Description

Number of NaNs in protein_df: 0


  result = func(self.values, **kwargs)


Number of NaNs in protein_df: 21257
df saved to input_files/protein_df_Imputed_MatrixExport_NP_A.csv
map saved to input_files/protgroup_gene_map_Imputed_MatrixExport_NP_A.json

--------------------------------------------------
Formatting txt file: Imputed_MatrixExport_NP_B.txt
Using the following column as index:
	T: T: Protein.Group

Dropping the following cols:
	N: Number Of Imputations
	T: T: Protein.Ids
	T: T: Protein.Names
	T: T: Genes
	T: T: First.Protein.Description

Number of NaNs in protein_df: 0
Number of NaNs in protein_df: 68241


  result = func(self.values, **kwargs)


df saved to input_files/protein_df_Imputed_MatrixExport_NP_B.csv
map saved to input_files/protgroup_gene_map_Imputed_MatrixExport_NP_B.json



In [109]:
txt_files = [
 'MatrixExport_NP_A.txt',
 'MatrixExport_NP_B.txt',]

index_col = "T: Protein.Group"
gene_col = "T: Genes"
out_dir = "input_files"

for txt_file in txt_files:
    write_input_files(txt_file, index_col, gene_col, out_dir)

--------------------------------------------------
Formatting txt file: MatrixExport_NP_A.txt
Using the following column as index:
	T: Protein.Group

Dropping the following cols:
	T: Protein.Ids
	T: Protein.Names
	T: Genes
	T: First.Protein.Description

Number of NaNs in protein_df: 0
Number of NaNs in protein_df: 0
df saved to input_files/protein_df_MatrixExport_NP_A.csv
map saved to input_files/protgroup_gene_map_MatrixExport_NP_A.json

--------------------------------------------------
Formatting txt file: MatrixExport_NP_B.txt
Using the following column as index:
	T: Protein.Group

Dropping the following cols:
	T: Protein.Ids
	T: Protein.Names
	T: Genes
	T: First.Protein.Description

Number of NaNs in protein_df: 0
Number of NaNs in protein_df: 0
df saved to input_files/protein_df_MatrixExport_NP_B.csv
map saved to input_files/protgroup_gene_map_MatrixExport_NP_B.json



## Debug

In [112]:
txt_file = "Imputed_MatrixExport_NP_A.txt"
index_col = "T: T: Protein.Group"
gene_col = "T: T: Genes"

In [113]:
    print ("--------------------------------------------------")
    print (f"Formatting txt file: {txt_file}")
    # read df
    protein_df_annot = pd.read_csv(txt_file, sep="\t")

    # Identify annotation cols
    annotation_cols = [col for col in protein_df_annot.columns if ":" in col]


    # Get protein-gene map
    protgroup_gene_map = protein_df_annot.set_index(index_col)[gene_col].to_dict()

    # format protein df
    print (f"Using the following column as index:")
    print (f"\t{index_col}")
    print ()

    drop_cols = [col for col in annotation_cols if col != index_col]
    print (f"Dropping the following cols:")
    for col in drop_cols:
        print (f"\t{col}")
    print ()

    protein_df = protein_df_annot.set_index(index_col).drop(columns =drop_cols).dropna()

--------------------------------------------------
Formatting txt file: Imputed_MatrixExport_NP_A.txt
Using the following column as index:
	T: T: Protein.Group

Dropping the following cols:
	N: Number Of Imputations
	T: T: Protein.Ids
	T: T: Protein.Names
	T: T: Genes
	T: T: First.Protein.Description



In [120]:
protein_df.isna().sum().sum()

0

In [121]:

np.log2(protein_df + 1).isna().sum().sum()

  result = func(self.values, **kwargs)


21257

In [123]:
protein_df[protein_df < 0].stack()



T: T: Protein.Group         
A0A024R4E5           3591528   -19686000.0
                     3448110   -14836200.0
                     3478950   -17954000.0
                     1648785   -25321800.0
                     3549432   -22303100.0
                                   ...    
P49662               3330416   -18291800.0
P49746               3581023   -19071800.0
P49754               3338280    -9035460.0
                     3763493   -20228500.0
                     3291923   -21375100.0
Length: 21257, dtype: float64

In [97]:
protein_df = protein_df.rename_axis("Sample").T
protein_df = np.log2(protein_df + 1)

In [98]:
protein_df

Sample,A0A067XG54;A0A804HIW2,A0A075B6H7,A0A075B6I9,A0A075B6K5,A0A075B6P5,A0A087WSY6,A0A0A0MRZ8;P04433,A0A0A0MS15,A0A0B4J1U7,A0A0B4J1V0,...,Q9Y678,Q9Y696,Q9Y6C2,Q9Y6E0;Q9Y6E0-2,Q9Y6E2,Q9Y6R7,Q9Y6W5,Q9Y6Y8,Q9Y6Z7,V9GYJ8
3716389,15.406570,18.692094,16.865407,15.662426,18.085426,16.113056,17.570767,17.397858,19.260279,16.789674,...,16.715198,15.045337,17.791201,19.260734,17.567080,14.517399,14.715351,14.429701,23.949491,18.915112
3351739,16.252808,17.698345,18.876010,16.227879,18.336703,15.052105,17.101596,15.269864,17.047731,16.090241,...,14.590827,13.838396,19.370189,17.801298,15.709165,14.009969,14.053790,15.678927,23.516531,17.195113
46627359,15.952980,18.605288,16.622767,15.001773,19.130226,16.311596,18.113849,17.182423,17.483186,17.460800,...,16.841417,15.385428,19.248173,19.835979,18.115354,12.960543,16.166459,15.456011,23.697103,17.639856
1681613,16.372039,18.521390,20.722341,15.960407,17.819568,15.859511,17.941758,17.469364,17.475995,15.957277,...,15.351029,13.999727,18.725536,18.815280,16.987830,14.719234,15.006900,15.484694,23.377825,18.337374
3542234,15.760195,17.638994,18.235953,18.553434,17.523914,15.229848,17.329025,17.115338,16.396078,15.751816,...,15.970569,13.907501,16.696479,18.845959,16.888375,13.956957,14.627956,15.295403,23.654035,17.863581
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3581894,15.561097,20.467622,18.326803,17.126049,20.645265,18.091389,20.902423,18.502331,17.748593,16.653825,...,15.226239,14.137648,16.925531,18.619575,16.950750,17.577967,14.365564,14.408768,23.540378,18.330510
46975089,15.489933,17.943253,16.198967,15.505929,17.362355,14.693961,16.895279,15.878671,17.440066,14.745375,...,18.836525,16.680016,18.377494,21.253497,18.919364,14.306282,17.561765,15.852693,22.039257,15.127797
3819910,16.009098,18.377952,16.199299,15.742652,18.571538,15.744663,18.629296,17.557927,18.221115,15.610930,...,14.783505,13.586418,17.242244,18.280562,16.480501,14.123830,14.216852,15.888384,24.125340,18.267226
473627422593,15.985528,18.410398,15.644783,15.028325,17.531717,15.305470,17.274078,16.628161,16.451657,15.385070,...,15.223195,14.279915,18.719903,17.995403,15.960260,15.307055,14.177381,16.958360,23.586921,17.965542


In [83]:
# Log transform protein df
import numpy as np
protein_df_log = np.log2(protein_df + 1)
print("Log2 transformed protein data:")
protein_df_log


Log2 transformed protein data:


Unnamed: 0_level_0,3716389,3351739,46627359,1681613,3542234,4010630,3812449,46521003,3285004,3895058,...,3657877,3901749,3383268,3631319,46994949,3581894,46975089,3819910,473627422593,3497568
T: Protein.Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0A067XG54;A0A804HIW2,15.406570,16.252808,15.952980,16.372039,15.760195,15.434185,16.626165,16.298945,16.243369,15.531284,...,15.972435,16.200996,14.947473,15.446205,15.190681,15.561097,15.489933,16.009098,15.985528,14.965631
A0A075B6H7,18.692094,17.698345,18.605288,18.521390,17.638994,18.083786,18.826083,17.828124,18.094253,20.182531,...,17.741013,18.578480,21.101494,18.302773,18.373961,20.467622,17.943253,18.377952,18.410398,17.496557
A0A075B6I9,16.865407,18.876010,16.622767,20.722341,18.235953,18.126921,17.978213,15.739106,15.750411,16.673903,...,16.548566,18.797066,16.877835,16.525324,17.020687,18.326803,16.198967,16.199299,15.644783,16.181028
A0A075B6K5,15.662426,16.227879,15.001773,15.960407,18.553434,14.851895,15.197155,14.157142,14.755414,15.651668,...,15.491862,16.521558,15.428789,15.135713,15.735516,17.126049,15.505929,15.742652,15.028325,14.118398
A0A075B6P5,18.085426,18.336703,19.130226,17.819568,17.523914,18.609413,18.356353,17.006425,18.960027,18.629001,...,19.885078,18.425490,18.100365,18.245659,18.525402,20.645265,17.362355,18.571538,17.531717,17.543372
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9Y6R7,14.517399,14.009969,12.960543,14.719234,13.956957,16.278141,13.516451,13.926454,14.081550,14.061945,...,13.759909,15.124739,13.720073,14.326633,14.925809,17.577967,14.306282,14.123830,15.307055,13.063714
Q9Y6W5,14.715351,14.053790,16.166459,15.006900,14.627956,14.156360,13.737670,13.803102,13.642311,17.134135,...,14.250891,15.470395,14.813330,13.514677,14.536624,14.365564,17.561765,14.216852,14.177381,17.421268
Q9Y6Y8,14.429701,15.678927,15.456011,15.484694,15.295403,16.153273,16.116476,14.391606,14.202308,14.596260,...,15.613531,17.512386,16.090584,14.688256,16.011166,14.408768,15.852693,15.888384,16.958360,15.655687
Q9Y6Z7,23.949491,23.516531,23.697103,23.377825,23.654035,23.885765,23.759388,23.442531,23.819094,23.296141,...,24.069072,23.504458,24.060852,24.133203,24.391000,23.540378,22.039257,24.125340,23.586921,22.247715
