# What is this about?

Simple baseline to start with: convert multi-label to multi-target + embeddings + Ridge

    Features - precalculated embeddings for protein sequences. Thanks to Grandmaster Sergei Fironov for sharing protein embeddings calculated by the T5 protein language model from the Rost Lab.
    
    Targets - the multi-label task is converted to a multi-target (binary classification) task, i.e. for each sample we predict the probability that a given label is assigned to that sample. In total there can be about 40 000 labels - that is too many, so we choose only the N most frequent ones.
    
    After that - use any ML model you like to make predictions. Start with Ridge as the simplest and fastest one.
    

Thanks to all authors of the public notebooks and datasets, which are quite helpful (please upvote them), and especially these:

LEONID KULYK: https://www.kaggle.com/code/leonidkulyk/eda-cafa5-pfp-interactive-dags-plotly

MARÍLIA PRATA: https://www.kaggle.com/code/mpwolke/cafa-5-protein-prediction

DAREK KŁECZEK:  https://www.kaggle.com/code/thedrcat/cafa-eda

D_KHATRI:  https://www.kaggle.com/code/dhruvkhatri/naive-submission-afa

* Pretrained T5 protein embeddings: 
    * https://www.kaggle.com/datasets/danofer/uniprotkbswiss-prot-protein-embeddings

Grandmaster Sergei Fironov shared protein embeddings calculated by the T5 protein language model from the Rost Lab:  https://www.kaggle.com/datasets/sergeifironov/t5embeds



# Key param(s)



In [None]:
# Number of most frequent terms (ranked by row count in the train annotations) used as prediction targets.
n_labels_to_consider = 1499 # We will choose only top frequent labels (in train) and predict only them. 
# NOTE(review): not referenced in the visible cells - presumably meant as a cap on predictions kept per protein; confirm.
n_max_preds = 1499

In [None]:
import time
t0start = time.time() 

import numpy as np
import pandas as pd 
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge,RidgeCV
from sklearn.neural_network import MLPClassifier

# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# Prepare multi-target Y (transition from a multi-label task to a multi-target task - binary classification).

## Load train labels and select the most frequent ones

In [None]:
%%time
# Read the term annotations of the train proteins (tab-separated file).
trainTerms = pd.read_csv(
    "/kaggle/input/cafa-5-protein-function-prediction/Train/train_terms.tsv",
    sep="\t",
)
print(trainTerms.shape)
display(trainTerms.head(2))

# Rows per term; value_counts() returns them sorted from most to least frequent.
vec_freqCount = trainTerms['term'].value_counts()
print(vec_freqCount)

In [None]:
# Drop very rare terms: keep only those with at least 30 rows in the annotations.
is_frequent_enough = vec_freqCount >= 30
vec_freqCount = vec_freqCount[is_frequent_enough]
print(len(vec_freqCount))
vec_freqCount.describe().round()

In [None]:
# Number of terms in vec_freqCount whose count is strictly greater than 200.
vec_freqCount[vec_freqCount>200].shape[0]

In [None]:
print()
# vec_freqCount keeps the value_counts() ordering, so the head of its index
# holds the most frequent terms.
labels_to_consider = vec_freqCount.index[:n_labels_to_consider].tolist()
print('n_labels_to_consider:', len(labels_to_consider), 'First 10:', labels_to_consider[:10])

## Load protein Ids in train

In [None]:
%%time
# Load the array of train protein ids shipped with the T5 embeddings dataset.
# NOTE(review): presumably row i of train_embeds.npy belongs to id i - confirm against the dataset description.
fn = '/kaggle/input/t5embeds/train_ids.npy'
vec_train_protein_ids = np.load(fn)
print(vec_train_protein_ids.shape)
vec_train_protein_ids

## Prepare Y 

In [None]:
%%time 
# Build the binary multi-target matrix Y:
#   Y[r, c] == 1.0 iff the r-th id of vec_train_protein_ids is annotated with
#   labels_to_consider[c] in trainTerms, else 0.0.

# Derive both dimensions from the loaded data instead of hard-coding them:
# a hard-coded row count breaks on any other id file, and fewer than
# n_labels_to_consider terms may survive the frequency filter.
train_size = len(vec_train_protein_ids)
n_targets = len(labels_to_consider)
Y = np.zeros((train_size, n_targets))
print(Y.shape)

series_train_protein_ids = pd.Series(vec_train_protein_ids)

# Keep only annotation rows of the selected terms to speed up the grouping.
trainTerms_smaller = trainTerms[trainTerms['term'].isin(labels_to_consider)]
print(trainTerms_smaller.shape)

# Group the annotations once instead of re-scanning the whole table per label.
entry_ids_by_term = trainTerms_smaller.groupby('term')['EntryID']
for i, term in enumerate(labels_to_consider):
    term_entry_ids = entry_ids_by_term.get_group(term)
    Y[:, i] = series_train_protein_ids.isin(set(term_entry_ids)).astype(float)
    if (i % 10) == 0:
        # Progress: column index and number of annotation rows for this term.
        print(i, len(term_entry_ids))
Y

In [None]:
%%time 
# Persist Y for possible future reuse; np.save appends the ".npy" extension.
fn4saveY = f'Y_{Y.shape[1]}'
print(fn4saveY)
np.save(fn4saveY, Y)

In [None]:
%%time
# Store the column labels of Y next to the matrix itself.
fn4save_labels = f'Y_{Y.shape[1]}_labels'
np.save(fn4save_labels, labels_to_consider)

In [None]:
# print( list(np.load(fn4save_labels +'.npy' ))[:10] )

In [None]:
%%time 
# Same targets as a DataFrame (one column per selected term), for those who
# prefer it; also written to CSV.
df_Y = pd.DataFrame(Y, columns=labels_to_consider)
display(df_Y.head(2))
print('memory_usage:', df_Y.memory_usage(index=True).sum())
display(df_Y.describe())
fn4save = f'df_Y_{Y.shape[1]}.csv'
df_Y.to_csv(fn4save)

# Load train features - precalculated embeddings for the proteins

In [None]:
%%time

# Load the train feature matrix X from precalculated protein embeddings.
# Alternative sources that were tried are kept commented out.
# fn = '/kaggle/input/protein-embeddings-1/reduced_embeddings_file.npy'
# fn = '/kaggle/input/protein-embeddings-1/embed_protbert_train_clip_1200_first_70000_prot.csv'
fn = '/kaggle/input/t5embeds/train_embeds.npy'
# fn = '/kaggle/input/t5embeds/test_embeds.npy'

print(fn)
if '.csv' in fn:
    # CSV variant: the first column is used as the index, the rest are features.
    df = pd.read_csv(fn, index_col=0)
    X = df.values
elif '.npy' in fn:
    X = np.load(fn)
else:
    # Fail loudly: otherwise X would be undefined, or would silently keep a
    # stale value from an earlier run of this cell.
    raise ValueError(f"Unsupported feature file format: {fn}")
print(X.shape)
X

## Load protein Ids 

In [None]:
%%time
# Load the train protein ids again (same file as in the "Load protein Ids in train" cell),
# so this section can be run without the earlier cell.
fn = '/kaggle/input/t5embeds/train_ids.npy'
vec_train_protein_ids = np.load(fn)
print(vec_train_protein_ids.shape)
vec_train_protein_ids

## Sanity check 

Ids from the train data are the same as from the train labels data

In [None]:
# Ids present both in the embeddings id file and in the annotation table.
ids_in_features = set(vec_train_protein_ids)
ids_in_labels = set(trainTerms['EntryID'])
s = ids_in_features & ids_in_labels
# The size of the overlap is expected to equal the number of feature rows.
print(len(s), len(X))

# Prepare Train-Test split 

In [None]:
# Split row indices (not the data itself): 10% of the rows go to the train
# part, the remainder to the test part; the seed is fixed for reproducibility.
IX = np.arange(len(X))
IX_train, IX_test = train_test_split(IX, train_size=0.1, random_state=42)
print(len(IX_train), len(IX_test), IX_train[:10], IX_test[:10])

# Modeling

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Pick the estimator used for all targets at once. Alternatives that were tried
# are kept commented out; the trailing numbers are the author's recorded scores.
# model = Ridge(alpha=1.0) # 0.805
# NOTE(review): the split cell uses train_size=0.1, so the "15% train" score was
# presumably recorded with a different setting - confirm.
model = RidgeCV() # 0.8127 auc, with 15% train
# model = RandomForestClassifier(n_estimators=200,  max_depth=14, min_samples_split=3, min_samples_leaf=1,n_jobs=-1) ## much slower... 
# model =MLPClassifier(hidden_layer_sizes=(512,256), early_stopping=True,
#                      validation_fraction=0.05,learning_rate="adaptive",learning_rate_init=0.005) # 0.59 rocauc , and slower
# Row label under which this model's results are stored in df_models_stat.
str_model_id = 'Ridge1'

# Results table, filled by the evaluation cell (one row per model id).
df_models_stat = pd.DataFrame()
model

In [None]:
%%time 
import time
from sklearn.metrics import roc_auc_score

# Fit on the train rows and predict the held-out rows; both steps are timed together.
t0 = time.time()
model.fit(X[IX_train, :], Y[IX_train, :])
Y_pred_test = model.predict(X[IX_test, :])
tt = time.time() - t0
print(str_model_id, tt)

# Per-target ROC AUC on the held-out rows, collected in `l`.
l = []
for i in range(Y.shape[1]):
    y_true = Y[IX_test, i]
    # A target with a single class in the held-out rows is scored as 0.5
    # instead of calling roc_auc_score on it.
    s = roc_auc_score(y_true, Y_pred_test[:, i]) if np.unique(y_true).size > 1 else 0.5
    l.append(s)
    if i % 10 == 0:
        print(i, s)

# Record the summary row for this model.
summary = {
    'RocAuc Mean Test': np.mean(l),
    'Time': np.round(tt, 1),
    'Test Size': len(IX_test),
}
for column, value in summary.items():
    df_models_stat.loc[str_model_id, column] = value
df_models_stat

In [None]:
# Show the parameters the estimator was configured with.
model.get_params()

## Scores statistics over targets 

In [None]:
import matplotlib.pyplot as plt
# Histogram and summary statistics of the per-target scores collected in `l`
# by the evaluation cell.
plt.hist(l)
plt.show()
pd.Series(l).describe()

# Retrain model on the full sample

In [None]:
%%time
# Refit the same estimator on all rows of X and Y (no hold-out) before predicting for the submission.
model.fit(X,Y)

# Submission preparations Step 1 - load features and calculate predictions 

## Load features for submission

In [None]:
%%time
# Load the feature matrix for the proteins to be predicted (test embeddings).
# Other candidate files are kept commented out.
# fn = '/kaggle/input/protein-embeddings-1/reduced_embeddings_file.npy'
# fn = '/kaggle/input/protein-embeddings-1/embed_protbert_train_clip_1200_first_70000_prot.csv'
# fn = '/kaggle/input/t5embeds/train_embeds.npy'
fn = '/kaggle/input/t5embeds/test_embeds.npy'
print(fn)
X_submit = np.load(fn)
print(X_submit.shape)
# X_submit

## Calculate prediction for submission

In [None]:
%%time
# Predict all targets for every submission row; columns follow the column order of Y used in fit.
Y_submit =  model.predict(X_submit)
print(Y_submit.shape)

# Submission preparations Step 2 - prepare submission in desired format

In [None]:
%%time 
# Empty long-format submission table; its three columns are filled by the following cells.
df_finalSubmission = pd.DataFrame(columns = ['Protein Id', 'GO Term Id','Prediction'])

## Load protein ids for the submission

In [None]:
%%time
# Load the protein ids for the submission rows.
# NOTE(review): presumably row i of test_embeds.npy belongs to id i - confirm against the dataset description.
fn = '/kaggle/input/t5embeds/test_ids.npy'
vec_test_protein_ids = np.load(fn)
print(vec_test_protein_ids.shape)
vec_test_protein_ids

## "Melt" protein ids 

In [None]:
%%time 
# Repeat every protein id once per predicted term, keeping protein order, so
# the column lines up with the row-by-row flattening of Y_submit.
n_terms = Y_submit.shape[1]
l = []
for protein_id in vec_test_protein_ids:
    l.extend([protein_id] * n_terms)
print(len(l), l[:20])

df_finalSubmission['Protein Id'] = l

In [None]:
# %%time 
# df_finalSubmission.head(3)

## "Melt" Labels (Gene ontology terms )

In [None]:
# Repeat the whole label list once per protein (list repetition), so that each
# protein's block of rows lists the terms in the column order of Y_submit.
df_finalSubmission['GO Term Id'] = labels_to_consider * Y_submit.shape[0]
# df_finalSubmission.head(3)

## Assign predictions 

In [None]:
# ravel() flattens row by row (all terms of the first protein, then the second, ...),
# which matches how the 'Protein Id' and 'GO Term Id' columns were built.
df_finalSubmission['Prediction'] = Y_submit.ravel()

In [None]:
# Show the full long-format table before any filtering.
display(df_finalSubmission)

### drop 0 preds and negatives
* opt: sort by score, keep top K per Protein
* warning : will be slooow with this many rows!

In [None]:
%%time
# Ridge is an unbounded regressor, so raw outputs can exceed 1 or be negative.
# Cap the scores at 1.0 so that they stay valid as probabilities
# (NOTE(review): the competition format is understood to require scores in (0, 1] - confirm),
# round to 3 decimals, then drop zero and negative scores.
df_finalSubmission['Prediction'] = df_finalSubmission['Prediction'].clip(upper=1.0).round(3)
df_finalSubmission = df_finalSubmission[df_finalSubmission['Prediction'] > 0]
# Number of rows left after filtering.
df_finalSubmission.shape[0]

In [None]:
# Show the table after rounding and filtering.
df_finalSubmission

## Save 

In [None]:
%%time 
# Write the submission as a tab-separated file without header row and without index column.
df_finalSubmission.to_csv("submission.tsv",header=False, index=False, sep="\t")

## Show some info 

In [None]:
# %%time 
# df_finalSubmission.info()

In [None]:
%%time 
# Summary statistics of the numeric column(s) of the final table.
df_finalSubmission.describe()

In [None]:
%%time
# Distribution of the kept prediction scores, drawn wide with fine bins.
plt.figure(figsize = (15,4))
plt.hist(df_finalSubmission['Prediction'].values, bins = 300 )
plt.show()

In [None]:
# Final look at the table that was written to submission.tsv.
df_finalSubmission

In [None]:
# Number of distinct values in the first two columns ('Protein Id' and 'GO Term Id').
df_finalSubmission.iloc[:,0:2].nunique()

In [None]:
# Average number of kept predictions per protein. The protein count is taken
# from the loaded id array instead of a hard-coded number, so the ratio stays
# correct if the input files change.
df_finalSubmission.shape[0] / len(vec_test_protein_ids)

In [None]:
# Kept rows divided by the configured number of labels (average kept predictions per label).
df_finalSubmission.shape[0]/n_labels_to_consider

In [None]:
# (empty cell - a bare `nan` here is an undefined name and raises NameError when run)