# Summary

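This baseline looks only at `gene_id`: for every gene, it submits the mean of that gene's training target values (for Multiome, the mean is taken over a 10,000-row slice of the training targets rather than the full table).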

In this case, cell_id is not used.

# Data preparation

The data loading follows [@peterholderrieth](https://www.kaggle.com/peterholderrieth)'s notebook (https://www.kaggle.com/code/peterholderrieth/getting-started-data-loading).

In [1]:
!pip install --quiet tables

[0m

In [2]:
import os
import pandas as pd

In [3]:
# Show which files are available in the competition's input directory.
os.listdir("/kaggle/input/open-problems-multimodal/")

['sample_submission.csv',
 'train_cite_targets.h5',
 'test_multi_inputs.h5',
 'evaluation_ids.csv',
 'train_cite_inputs.h5',
 'train_multi_targets.h5',
 'train_multi_inputs.h5',
 'metadata.csv',
 'test_cite_inputs.h5']

In [4]:
# Root directory of the competition data; every path below is built from it.
DATA_DIR = "/kaggle/input/open-problems-multimodal/"

# Metadata table.
FP_CELL_METADATA = os.path.join(DATA_DIR, "metadata.csv")

# Citeseq inputs/targets.
FP_CITE_TRAIN_INPUTS = os.path.join(DATA_DIR, "train_cite_inputs.h5")
FP_CITE_TRAIN_TARGETS = os.path.join(DATA_DIR, "train_cite_targets.h5")
FP_CITE_TEST_INPUTS = os.path.join(DATA_DIR, "test_cite_inputs.h5")

# Multiome inputs/targets.
FP_MULTIOME_TRAIN_INPUTS = os.path.join(DATA_DIR, "train_multi_inputs.h5")
FP_MULTIOME_TRAIN_TARGETS = os.path.join(DATA_DIR, "train_multi_targets.h5")
FP_MULTIOME_TEST_INPUTS = os.path.join(DATA_DIR, "test_multi_inputs.h5")

# Sample submission and the table of evaluation ids.
FP_SUBMISSION = os.path.join(DATA_DIR, "sample_submission.csv")
FP_EVALUATION_IDS = os.path.join(DATA_DIR, "evaluation_ids.csv")

# Un-prefixed names pointing at the same two files as the FP_ constants above.
# NOTE: "SUBMISSON" is a misspelling; the name is kept so that any cell using it
# keeps working.
SUBMISSON = FP_SUBMISSION
EVALUATION_IDS = FP_EVALUATION_IDS

## Citeseq

In [5]:
# Load the Citeseq training targets. The path comes from the FP_CITE_TRAIN_TARGETS
# constant (an absolute path) instead of a second, hard-coded relative path, so the
# cell does not depend on the notebook's working directory.
df_cite_train_y = pd.read_hdf(FP_CITE_TRAIN_TARGETS)
df_cite_train_y.head()

gene_id,CD86,CD274,CD270,CD155,CD112,CD47,CD48,CD40,CD154,CD52,...,CD94,CD162,CD85j,CD23,CD328,HLA-E,CD82,CD101,CD88,CD224
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45006fe3e4c8,1.167804,0.62253,0.106959,0.324989,3.331674,6.426002,1.480766,-0.728392,-0.468851,-0.073285,...,-0.44839,3.220174,-0.533004,0.674956,-0.006187,0.682148,1.398105,0.414292,1.780314,0.54807
d02759a80ba2,0.81897,0.506009,1.078682,6.848758,3.524885,5.279456,4.930438,2.069372,0.333652,-0.468088,...,0.323613,8.407108,0.131301,0.047607,-0.243628,0.547864,1.832587,0.982308,2.736507,2.184063
c016c6b0efa5,-0.356703,-0.422261,-0.824493,1.137495,0.518924,7.221962,-0.375034,1.738071,0.142919,-0.97146,...,1.348692,4.888579,-0.279483,-0.131097,-0.177604,-0.689188,9.013709,-1.182975,3.958148,2.8686
ba7f733a4f75,-1.201507,0.149115,2.022468,6.021595,7.25867,2.792436,21.708519,-0.137913,1.649969,-0.75468,...,1.504426,12.391979,0.511394,0.587863,-0.752638,1.714851,3.893782,1.799661,1.537249,4.407671
fbcf2443ffb2,-0.100404,0.697461,0.625836,-0.298404,1.369898,3.254521,-1.65938,0.643531,0.90271,1.291877,...,0.777023,6.496499,0.279898,-0.84195,-0.869419,0.675092,5.259685,-0.835379,9.631781,1.765445


In [6]:
# Column-wise mean: one average target value per gene_id column, taken over all rows.
cite_gene_id_mean = df_cite_train_y.mean(axis=0)
cite_gene_id_mean

gene_id
CD86     0.454768
CD274    0.466369
CD270    0.918049
CD155    5.610967
CD112    5.307707
           ...   
HLA-E    0.777902
CD82     5.375080
CD101    0.629204
CD88     3.608978
CD224    3.418748
Length: 140, dtype: float32

## Multiome

In [7]:
# Row window [START, STOP) passed to pd.read_hdf when loading the Multiome targets:
# 10,000 rows beginning at row 10,000.
START = 10 ** 4
STOP = START + 10 ** 4

In [8]:
# Read only the rows between START and STOP of the Multiome training targets
# rather than the whole table, then report the size of what was loaded.
df_multi_train_y = pd.read_hdf(
    FP_MULTIOME_TRAIN_TARGETS,
    start=START,
    stop=STOP,
)
df_multi_train_y.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, ba6695d0c309 to 4f5343ecd720
Columns: 23418 entries, ENSG00000121410 to ENSG00000074755
dtypes: float32(23418)
memory usage: 893.4+ MB


In [9]:
# Column-wise mean over the loaded Multiome rows: one average target per gene_id column.
multi_gene_id_mean = df_multi_train_y.mean(axis=0)
multi_gene_id_mean

gene_id
ENSG00000121410    0.525649
ENSG00000268895    0.352912
ENSG00000175899    0.172957
ENSG00000245105    0.051324
ENSG00000166535    0.030551
                     ...   
ENSG00000070476    2.755820
ENSG00000203995    0.017515
ENSG00000162378    1.199799
ENSG00000159840    1.044870
ENSG00000074755    2.094545
Length: 23418, dtype: float32

## Convert gene_id to int (to save memory)

In [10]:
# Inspect the gene_id labels of the Citeseq means before encoding them as integers.
cite_gene_id_mean.index

Index(['CD86', 'CD274', 'CD270', 'CD155', 'CD112', 'CD47', 'CD48', 'CD40',
       'CD154', 'CD52',
       ...
       'CD94', 'CD162', 'CD85j', 'CD23', 'CD328', 'HLA-E', 'CD82', 'CD101',
       'CD88', 'CD224'],
      dtype='object', name='gene_id', length=140)

In [11]:
# Inspect the gene_id labels of the Multiome means before encoding them as integers.
multi_gene_id_mean.index

Index(['ENSG00000121410', 'ENSG00000268895', 'ENSG00000175899',
       'ENSG00000245105', 'ENSG00000166535', 'ENSG00000256661',
       'ENSG00000184389', 'ENSG00000128274', 'ENSG00000094914',
       'ENSG00000081760',
       ...
       'ENSG00000086827', 'ENSG00000174442', 'ENSG00000122952',
       'ENSG00000198205', 'ENSG00000198455', 'ENSG00000070476',
       'ENSG00000203995', 'ENSG00000162378', 'ENSG00000159840',
       'ENSG00000074755'],
      dtype='object', name='gene_id', length=23418)

In [12]:
# Collect every gene_id label (Citeseq first, then Multiome) into one single-column frame.
# The intermediate list has a descriptive name instead of `_`, which IPython uses
# for the most recent cell output.
all_gene_ids = list(cite_gene_id_mean.index) + list(multi_gene_id_mean.index)
gene_id = pd.DataFrame(all_gene_ids, columns=['gene_id'])
gene_id

Unnamed: 0,gene_id
0,CD86
1,CD274
2,CD270
3,CD155
4,CD112
...,...
23553,ENSG00000070476
23554,ENSG00000203995
23555,ENSG00000162378
23556,ENSG00000159840


In [13]:
# Encode each gene_id string as an integer: remove '-' and '.', keep the last
# 8 characters, and parse them as a base-34 number.
gene_id['gene_id_int'] = gene_id['gene_id'].apply(
    lambda name: int(name.replace('-', '').replace('.', '')[-8:], 34)
).astype(int)

# Truncating to 8 characters could map two different ids to the same integer,
# which would corrupt the later merge on gene_id_int. Fail loudly here instead
# of relying on reading the value_counts() output below.
assert gene_id['gene_id_int'].is_unique, "gene_id_int collision: two gene_ids map to the same integer"
gene_id['gene_id_int'].value_counts()

486954      1
9361326     1
49488671    1
56361970    1
56291592    1
           ..
49688382    1
45636815    1
45553405    1
53613037    1
9519835     1
Name: gene_id_int, Length: 23558, dtype: int64

# Create the submission file

In [14]:
# Load only the row_id column of the sample submission. FP_SUBMISSION points to
# the same file as the misspelled SUBMISSON constant and follows the FP_ naming
# used for the other paths.
df_sample_submission = pd.read_csv(FP_SUBMISSION, usecols=['row_id'])
df_sample_submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65744180 entries, 0 to 65744179
Data columns (total 1 columns):
 #   Column  Dtype
---  ------  -----
 0   row_id  int64
dtypes: int64(1)
memory usage: 501.6 MB


In [15]:
# Load the row_id -> gene_id table. FP_EVALUATION_IDS points to the same file as
# EVALUATION_IDS and follows the FP_ naming used for the other paths.
df_evaluation = pd.read_csv(FP_EVALUATION_IDS, usecols=['row_id', 'gene_id'])

# Encode each distinct gene_id once (remove '-' and '.', keep the last 8 characters,
# parse as base 34) and map the codes back onto the rows. This gives the same values
# as applying the encoder to every row, without one Python call per row.
gene_id_codes = {
    name: int(name.replace('-', '').replace('.', '')[-8:], 34)
    for name in df_evaluation['gene_id'].unique()
}
df_evaluation['gene_id_int'] = df_evaluation['gene_id'].map(gene_id_codes).astype(int)

# The string column is no longer needed once the integer code exists.
df_evaluation.drop(columns=['gene_id'], inplace=True)
df_evaluation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65744180 entries, 0 to 65744179
Data columns (total 2 columns):
 #   Column       Dtype
---  ------       -----
 0   row_id       int64
 1   gene_id_int  int64
dtypes: int64(2)
memory usage: 1003.2 MB


In [16]:
# Attach the integer gene code to every submission row by joining on row_id.
df_sample_submission = pd.merge(
    df_sample_submission,
    df_evaluation,
    how='left',
    on='row_id',
)
df_sample_submission.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 65744180 entries, 0 to 65744179
Data columns (total 2 columns):
 #   Column       Dtype
---  ------       -----
 0   row_id       int64
 1   gene_id_int  int64
dtypes: int64(2)
memory usage: 1.5 GB


In [17]:
# Turn the Citeseq per-gene means into a two-column frame (target, gene_id_int):
# the gene_id index becomes a column, is encoded as an integer (remove '-' and '.',
# keep the last 8 characters, parse as base 34), and the string column is dropped.
cite_gene_id_mean = (
    pd.DataFrame(cite_gene_id_mean, columns=['target'])
    .reset_index()
    .assign(
        gene_id_int=lambda frame: frame['gene_id'].apply(
            lambda name: int(name.replace('-', '').replace('.', '')[-8:], 34)
        ).astype(int)
    )
    .drop(columns=['gene_id'])
)

In [18]:
# Turn the Multiome per-gene means into a two-column frame (target, gene_id_int):
# the gene_id index becomes a column, is encoded as an integer (remove '-' and '.',
# keep the last 8 characters, parse as base 34), and the string column is dropped.
multi_gene_id_mean = (
    pd.DataFrame(multi_gene_id_mean, columns=['target'])
    .reset_index()
    .assign(
        gene_id_int=lambda frame: frame['gene_id'].apply(
            lambda name: int(name.replace('-', '').replace('.', '')[-8:], 34)
        ).astype(int)
    )
    .drop(columns=['gene_id'])
)

In [19]:
# Stack the Citeseq and Multiome lookup tables row-wise into one
# gene_id_int -> target table.
cite_multi_gene_id_mean = pd.concat(
    [cite_gene_id_mean, multi_gene_id_mean],
    axis=0,
)
cite_multi_gene_id_mean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23558 entries, 0 to 23417
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   target       23558 non-null  float32
 1   gene_id_int  23558 non-null  int64  
dtypes: float32(1), int64(1)
memory usage: 460.1 KB


In [20]:
# Look up the mean target for every submission row via its integer gene code.
# validate='many_to_one' makes pandas raise a MergeError if gene_id_int is not
# unique in the lookup table. Without it, a duplicated code would silently
# multiply the matching submission rows.
df_sample_submission = df_sample_submission.merge(
    cite_multi_gene_id_mean,
    how='left',
    on='gene_id_int',
    validate='many_to_one',
)
df_sample_submission.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 65744180 entries, 0 to 65744179
Data columns (total 3 columns):
 #   Column       Dtype  
---  ------       -----  
 0   row_id       int64  
 1   gene_id_int  int64  
 2   target       float32
dtypes: float32(1), int64(2)
memory usage: 1.7 GB


In [21]:
# Write only the row_id and target columns, without the DataFrame index.
df_sample_submission.to_csv(
    'submission.csv',
    columns=['row_id', 'target'],
    index=False,
)