In [1]:
import os as os
import numpy as np
import pandas as pd
import glob as glob

### Navigate to working directory

In [2]:
os.chdir('/Users/npatin3/Desktop/GTPostdoc/Noro_Assemblies')

### Import the "master" KO htext file and reformat it

In [4]:
ko_master = pd.read_csv('KO_Orthology_ko00001.txt', sep='\t', header=None,
                       names=['Group','Subgroup1','Subgroup2','KO'])
ko_master.head()

Unnamed: 0,Group,Subgroup1,Subgroup2,KO
0,09100 Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis [PATH:ko00010],K00844 HK; hexokinase [EC:2.7.1.1]
1,09100 Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis [PATH:ko00010],K12407 GCK; glucokinase [EC:2.7.1.2]
2,09100 Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis [PATH:ko00010],K00845 glk; glucokinase [EC:2.7.1.2]
3,09100 Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis [PATH:ko00010],"K01810 GPI, pgi; glucose-6-phosphate isomeras..."
4,09100 Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis [PATH:ko00010],"K06859 pgi1; glucose-6-phosphate isomerase, a..."


In [5]:
# Split the function from the KO
ko_master[['KO', 'Function']] = ko_master['KO'].str.split(" ", n=1, expand=True)
#kos = ko_master.drop('Function', axis=1)
ko_master.head()

Unnamed: 0,Group,Subgroup1,Subgroup2,KO,Function
0,09100 Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis [PATH:ko00010],K00844,HK; hexokinase [EC:2.7.1.1]
1,09100 Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis [PATH:ko00010],K12407,GCK; glucokinase [EC:2.7.1.2]
2,09100 Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis [PATH:ko00010],K00845,glk; glucokinase [EC:2.7.1.2]
3,09100 Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis [PATH:ko00010],K01810,"GPI, pgi; glucose-6-phosphate isomerase [EC:5..."
4,09100 Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis [PATH:ko00010],K06859,"pgi1; glucose-6-phosphate isomerase, archaeal..."


### Import the KO list and make a pandas list from it

In [30]:
kos = pd.read_csv("Asymp_vs_symp/KOs_SD_functions.csv", header=0, sep=',')
ko_list = list(kos['KO'])

In [31]:
ko_list

['K01803',
 'K15633',
 'K01785',
 'K01223',
 'K01676',
 'K01677',
 'K00036',
 'K01815',
 'K00963',
 'K00847',
 'K01809',
 'K02768',
 'K02769',
 'K01190',
 'K05349',
 'K00975',
 'K21574',
 'K00705',
 'K01207',
 'K00820',
 'K12452',
 'K00656',
 'K00925',
 'K01649',
 'K01966',
 'K01091',
 'K01915',
 'K01580',
 'K01652',
 'K00343',
 'K00297',
 'K03385',
 'K06881',
 'K01738',
 'K02078',
 'K09458',
 'K16363',
 'K01897',
 'K01130',
 'K00764',
 'K03787',
 'K03783',
 'K02428',
 'K01951',
 'K21636',
 'K00951',
 'K01939',
 'K01591',
 'K01937',
 'K00876',
 'K00757',
 'K01520',
 'K11358',
 'K01744',
 'K00133',
 'K14155',
 'K00549',
 'K07173',
 'K01740',
 'K00821',
 'K00620',
 'K00145',
 'K01585',
 'K01480',
 'K00817',
 'K04517',
 'K00384',
 'K01874',
 'K01776',
 'K01270',
 'K05515',
 'K15923',
 'K04487',
 'K03517',
 'K02227',
 'K04720',
 'K00806',
 'K00973',
 'K02996',
 'K02959',
 'K02939',
 'K02871',
 'K01872',
 'K01868',
 'K01892',
 'K03100',
 'K04079',
 'K11927',
 'K12573',
 'K03654',
 'K03628',

## Option 1: Use the Master KO file to assign a function to each of the KOs in ko_list

In [32]:
ko_fxns = ko_master[['KO','Function']]
ko_fxns.head()

In [18]:
merged = pd.merge(kos, ko_fxns, on='KO', how='inner')
merged.head()

In [31]:
merged.shape

(504, 2)

#### Because KOs fall into multiple categories, there will be multiple entries. Keep only one.

In [35]:
merged.drop_duplicates(keep='first', inplace=True)
merged.shape

In [37]:
merged.to_csv("KO_fxns.csv", index=None)

## Option 2: Extract category and function assignments of the master file using the KO list

In [33]:
# only KOs that are in the ko_list
kos_funct = ko_master.loc[ko_master['KO'].isin(ko_list), :]
kos_funct.head()

Unnamed: 0,Group,Subgroup1,Subgroup2,KO,Function
23,09100 Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis [PATH:ko00010],K01803,"TPI, tpiA; triosephosphate isomerase (TIM) [E..."
29,09100 Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis [PATH:ko00010],K15633,"gpmI; 2,3-bisphosphoglycerate-independent pho..."
75,09100 Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis [PATH:ko00010],K01785,"galM, GALM; aldose 1-epimerase [EC:5.1.3.3]"
93,09100 Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis [PATH:ko00010],K01223,"E3.2.1.86B, bglA; 6-phospho-beta-glucosidase ..."
140,09100 Metabolism,09101 Carbohydrate metabolism,00020 Citrate cycle (TCA cycle) [PATH:ko00020],K01676,"E4.2.1.2A, fumA, fumB; fumarate hydratase, cl..."


In [34]:
# Split out the number label for groups and subgroups
kos_funct[['Subgroup2', 'PATH']] = kos_funct['Subgroup2'].str.split("[", n=1, expand=True)
kos_funct = kos_funct.drop('PATH', axis=1)
kos_funct.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,Group,Subgroup1,Subgroup2,KO,Function
23,09100 Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis,K01803,"TPI, tpiA; triosephosphate isomerase (TIM) [E..."
29,09100 Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis,K15633,"gpmI; 2,3-bisphosphoglycerate-independent pho..."
75,09100 Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis,K01785,"galM, GALM; aldose 1-epimerase [EC:5.1.3.3]"
93,09100 Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis,K01223,"E3.2.1.86B, bglA; 6-phospho-beta-glucosidase ..."
140,09100 Metabolism,09101 Carbohydrate metabolism,00020 Citrate cycle (TCA cycle),K01676,"E4.2.1.2A, fumA, fumB; fumarate hydratase, cl..."


In [35]:
# Split out the number label for groups and subgroups
kos_funct[['GroupNum', 'Group']] = kos_funct['Group'].str.split(" ", n=1, expand=True)
kos_funct = kos_funct.drop('GroupNum', axis=1)
kos_funct.head()

Unnamed: 0,Group,Subgroup1,Subgroup2,KO,Function
23,Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis,K01803,"TPI, tpiA; triosephosphate isomerase (TIM) [E..."
29,Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis,K15633,"gpmI; 2,3-bisphosphoglycerate-independent pho..."
75,Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis,K01785,"galM, GALM; aldose 1-epimerase [EC:5.1.3.3]"
93,Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis,K01223,"E3.2.1.86B, bglA; 6-phospho-beta-glucosidase ..."
140,Metabolism,09101 Carbohydrate metabolism,00020 Citrate cycle (TCA cycle),K01676,"E4.2.1.2A, fumA, fumB; fumarate hydratase, cl..."


In [36]:
# Split out the number label for groups and subgroups
kos_funct[['GroupNum', 'Subgroup1']] = kos_funct['Subgroup1'].str.split(" ", n=1, expand=True)
kos_funct = kos_funct.drop('GroupNum', axis=1)
kos_funct.head()

Unnamed: 0,Group,Subgroup1,Subgroup2,KO,Function
23,Metabolism,Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis,K01803,"TPI, tpiA; triosephosphate isomerase (TIM) [E..."
29,Metabolism,Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis,K15633,"gpmI; 2,3-bisphosphoglycerate-independent pho..."
75,Metabolism,Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis,K01785,"galM, GALM; aldose 1-epimerase [EC:5.1.3.3]"
93,Metabolism,Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis,K01223,"E3.2.1.86B, bglA; 6-phospho-beta-glucosidase ..."
140,Metabolism,Carbohydrate metabolism,00020 Citrate cycle (TCA cycle),K01676,"E4.2.1.2A, fumA, fumB; fumarate hydratase, cl..."


In [37]:
# Split out the number label for groups and subgroups
kos_funct[['GroupNum', 'Subgroup2']] = kos_funct['Subgroup2'].str.split(" ", n=1, expand=True)
kos_funct = kos_funct.drop('GroupNum', axis=1)
kos_funct.head()

Unnamed: 0,Group,Subgroup1,Subgroup2,KO,Function
23,Metabolism,Carbohydrate metabolism,Glycolysis / Gluconeogenesis,K01803,"TPI, tpiA; triosephosphate isomerase (TIM) [E..."
29,Metabolism,Carbohydrate metabolism,Glycolysis / Gluconeogenesis,K15633,"gpmI; 2,3-bisphosphoglycerate-independent pho..."
75,Metabolism,Carbohydrate metabolism,Glycolysis / Gluconeogenesis,K01785,"galM, GALM; aldose 1-epimerase [EC:5.1.3.3]"
93,Metabolism,Carbohydrate metabolism,Glycolysis / Gluconeogenesis,K01223,"E3.2.1.86B, bglA; 6-phospho-beta-glucosidase ..."
140,Metabolism,Carbohydrate metabolism,Citrate cycle (TCA cycle),K01676,"E4.2.1.2A, fumA, fumB; fumarate hydratase, cl..."


In [38]:
kos_funct.shape

(420, 5)

#### Exclude irrelevant categories

In [39]:
searchfor = ['Human Diseases','Brite Hierarchies','Organismal Systems',
                                    'Cellular community - eukaryotes']
kos_funct2 = kos_funct.loc[~kos_funct['Group'].str.contains('|'.join(searchfor))]

In [40]:
uniq_kos = kos_funct2.KO.unique()
len(uniq_kos)

164

In [28]:
kos_funct2.shape

(63, 5)

In [29]:
kos_funct2.to_csv("MAG_SDKOs_p05_CORRECTED_funct.csv", index=None)

### Groupby various levels and get counts for each level

In [67]:
kos_funct2.groupby(['Group']).count()

Unnamed: 0_level_0,Subgroup1,Subgroup2,KO,Function
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Cellular Processes,19,19,19,19
Environmental Information Processing,34,34,34,34
Genetic Information Processing,21,21,21,21
Metabolism,281,281,281,281
Not Included in Pathway or Brite,76,76,76,76


In [69]:
subgroups1 = kos_funct2.groupby(['Subgroup1']).count()

In [71]:
subgroups1.to_csv("MAG_SDKOs_p02_subgroups1.csv")