In [7]:
import pandas as pd
import numpy as np
import pickle
import scipy
import gc
from sklearn.decomposition import PCA, TruncatedSVD,KernelPCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler,MinMaxScaler,LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
from umap import UMAP
from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import numba # for speed up

## Dimension Reduction

In [3]:
train = pd.read_hdf("../../../input/open-problems-raw-counts/train_cite_inputs_raw.h5")
train_columns = train.columns
train_indexes = train.index
train.shape

(70988, 22085)

In [4]:
all_zeros_features = train.columns[train.sum()==0].to_list()
none_zeros_features = [i for i in train.columns if i not in all_zeros_features]
len(all_zeros_features)

484

In [5]:
test = pd.read_hdf("../../../input/open-problems-raw-counts/test_cite_inputs_raw.h5")
test_indexes = test.index
test.shape

(48203, 22085)

In [6]:
train = train[none_zeros_features]
test = test[none_zeros_features]
train = scipy.sparse.csr_matrix(train)
test = scipy.sparse.csr_matrix(test)
all = scipy.sparse.vstack([train,test])
del train,test
gc.collect()
all.shape

(119191, 21601)

In [22]:
test_ori = pd.read_hdf("../../../input/open-problems-multimodal/test_cite_inputs.h5")
test_indexes_ori = test_ori.index
del test_ori
gc.collect()

1528

In [8]:
all_log = np.log1p(all)
all_indexes = train_indexes.to_list()+test_indexes.to_list()

### Tsvd

In [8]:
%%time
pure_tsvd = TruncatedSVD(n_components=128, random_state=42)
train_tsvd = pure_tsvd.fit_transform(all_log)
print(pure_tsvd.explained_variance_ratio_.sum())

0.32744268
CPU times: total: 1min 34s
Wall time: 3min 12s


In [9]:
train_tsvd = pd.DataFrame(train_tsvd,index = all_indexes)
test = train_tsvd.iloc[70988:]
test = test.drop_duplicates()
test = test.reindex(test_indexes_ori)
test = test.fillna(0)
test.shape

(48663, 128)

In [10]:
np.savez("cite_train_tsvd.npz", train_tsvd.iloc[:70988])
np.savez("cite_test_tsvd.npz",test)

In [11]:
del train_tsvd,pure_tsvd,test
gc.collect()

0

### UMAP

In [8]:
%%time
umap = UMAP(n_neighbors = 16,n_components=128, random_state=42,verbose = True,low_memory = True,n_jobs = -1)
train_umap = umap.fit_transform(all_log.toarray())

UMAP(n_components=128, n_neighbors=16, random_state=42, verbose=True)
Thu Nov 17 19:02:07 2022 Construct fuzzy simplicial set
Thu Nov 17 19:02:16 2022 Finding Nearest Neighbors
Thu Nov 17 19:02:19 2022 Building RP forest with 22 trees
Thu Nov 17 19:05:03 2022 NN descent for 17 iterations
	 1  /  17
	 2  /  17
	 3  /  17
	 4  /  17
	 5  /  17
	 6  /  17
	Stopping threshold met -- exiting after 6 iterations
Thu Nov 17 19:10:56 2022 Finished Nearest Neighbor Search
Thu Nov 17 19:11:39 2022 Construct embedding


Epochs completed:   0%|            0/200 [00:00]

Thu Nov 17 19:19:33 2022 Finished embedding
CPU times: total: 36min 16s
Wall time: 17min 56s


In [9]:
train_umap = pd.DataFrame(train_umap,index = all_indexes)
test = train_umap.iloc[70988:]
test = test.drop_duplicates()
test = test.reindex(test_indexes_ori)
test = test.fillna(0)
test.shape

(48663, 128)

In [10]:
np.savez("cite_train_umap.npz", train_umap.iloc[:70988])
np.savez("cite_test_umap.npz",test)

In [None]:
del train_umap,umap,test
gc.collect()

### Novel's method [link](https://github.com/openproblems-bio/neurips2021_multimodal_topmethods/blob/dc7bd58dacbe804dcc7be047531d795b1b04741e/src/predict_modality/methods/novel/resources/helper_functions.py)

In [8]:
def tfidf(X):
    idf = X.shape[0] / X.sum(axis=0)
    if scipy.sparse.issparse(X):
        tf = X.multiply(1 / X.sum(axis=1))
        return tf.multiply(idf)
    else:
        tf = X / X.sum(axis=1, keepdims=True)
        return tf * idf

In [9]:
all_novel = tfidf(all)
all_novel = all_novel.tocsr()
all_novel = np.log1p(all_novel * 1e4)

In [10]:
%%time
tsvd = TruncatedSVD(n_components=128, random_state=42)
train_novel = tsvd.fit_transform(all_novel)
print(tsvd.explained_variance_ratio_.sum())

0.08021713892446569
CPU times: total: 3min 52s
Wall time: 10min 11s


In [11]:
train_novel = pd.DataFrame(train_novel,index = all_indexes)
test = train_novel.iloc[70988:]
test = test.drop_duplicates()
test = test.reindex(test_indexes_ori)
test = test.fillna(0)
test.shape

(48663, 128)

In [13]:
np.savez("cite_train_novel.npz", train_novel.iloc[:70988])
np.savez("cite_test_novel.npz",test)

In [14]:
del train_novel,tsvd,test
gc.collect()

520

## Featurn impostance

### name importance

In [3]:
target = pd.read_hdf("../../../input/open-problems-multimodal/train_cite_targets.h5")
target_columns = target.columns
target.shape

(70988, 140)

In [15]:
import mygene
mg = mygene.MyGeneInfo()

In [18]:
train_columns = [(i,i.split("_")) for i in train_columns]
train_columns = [(i,j[0],j[1]) for i,j in train_columns]

In [20]:
gene_quary =[i for _,i,_ in train_columns]
gene_quary[:5]

['ENSG00000121410',
 'ENSG00000268895',
 'ENSG00000175899',
 'ENSG00000245105',
 'ENSG00000166535']

In [21]:
gene_res = mg.getgenes(gene_quary,as_dataframe = True)

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-20000...done.
querying 20001-21000...done.
querying 21001-22000...done.
querying 22001-22085...done.


In [22]:
train_columns = pd.DataFrame(train_columns)
train_columns.columns = ["colums_id","query","aias"]
train_columns.head(1)

Unnamed: 0,colums_id,query,aias
0,ENSG00000121410_A1BG,ENSG00000121410,A1BG


In [23]:
train_columns_new = pd.merge(train_columns,gene_res.reset_index()[["query","alias"]],on = "query",how = "inner")
train_columns_new.head(1)

Unnamed: 0,colums_id,query,aias,alias
0,ENSG00000121410_A1BG,ENSG00000121410,A1BG,"[A1B, ABG, GAB, HYST2477]"


In [25]:
import warnings 
warnings.filterwarnings("ignore")
gene_res_df = pd.DataFrame()
no_alias = pd.DataFrame()
for i in enumerate(tqdm(train_columns_new.iterrows())):
    _,i = i
    _,i = i

    if not i["alias"] is np.nan:
        if isinstance(i["alias"],str):
            temp_df = pd.DataFrame([[i["colums_id"],i["alias"]]],columns= ["id","name"])
            gene_res_df = gene_res_df.append(temp_df)
        else:
            for j in i["alias"]:
                temp_df = pd.DataFrame([[i["colums_id"],j]],columns= ["id","name"])
                gene_res_df = gene_res_df.append(temp_df)
    else:
        no_alias = no_alias.append(pd.DataFrame([[id,i["query"]]]))
    temp_df = pd.DataFrame([[i["colums_id"],i["aias"]]],columns= ["id","name"])
    gene_res_df = gene_res_df.append(temp_df)

0it [00:00, ?it/s]

In [31]:
gene_res_df_match = gene_res_df[gene_res_df.name.isin(target_columns)]
gene_res_df_match.head(1)

Unnamed: 0,id,name
0,ENSG00000166825_ANPEP,CD13


Adding online important features from [this notebok](https://www.kaggle.com/code/alekse1pakhomov/mmscel-magic-features-load-example-cell-cycle) 

In [28]:
cite_cols_important={
            'CD86': ['ENSG00000114013_CD86'],
             'CD274': ['ENSG00000120217_CD274'],
             'CD270': ['ENSG00000157873_TNFRSF14'],
             'CD155': ['ENSG00000073008_PVR'],
             'CD112': ['ENSG00000130202_NECTIN2'],
             'CD47': ['ENSG00000196776_CD47'],
             'CD48': ['ENSG00000117091_CD48'],
             'CD40': ['ENSG00000101017_CD40'],
             'CD154': ['ENSG00000102245_CD40LG'],
             'CD52': ['ENSG00000169442_CD52'],
             'CD3': ['ENSG00000167286_CD3D'],
             'CD8': [],
             'CD56': ['ENSG00000149294_NCAM1'],
             'CD19': ['ENSG00000177455_CD19'],
             'CD33': ['ENSG00000105383_CD33'],
             'CD11c': ['ENSG00000140678_ITGAX'],
             'HLA-A-B-C': ['ENSG00000204525_HLA-C',
              'ENSG00000206503_HLA-A',
              'ENSG00000234745_HLA-B'],
             'CD45RA': ['ENSG00000081237_PTPRC'],
             'CD123': ['ENSG00000185291_IL3RA'],
             'CD7': ['ENSG00000173762_CD7'],
             'CD105': ['ENSG00000106991_ENG'],
             'CD49f': ['ENSG00000091409_ITGA6'],
             'CD194': ['ENSG00000183813_CCR4'],
             'CD4': ['ENSG00000010610_CD4'],
             'CD44': ['ENSG00000026508_CD44'],
             'CD14': ['ENSG00000170458_CD14'],
             'CD16': [],
             'CD25': ['ENSG00000134460_IL2RA'],
             'CD45RO': ['ENSG00000081237_PTPRC'],
             'CD279': [],
             'TIGIT': [],
             'Mouse-IgG1': [],
             'Mouse-IgG2a': [],
             'Mouse-IgG2b': [],
             'Rat-IgG2b': [],
             'CD20': ['ENSG00000156738_MS4A1'],
             'CD335': ['ENSG00000189430_NCR1'],
             'CD31': ['ENSG00000261371_PECAM1'],
             'Podoplanin': [],
             'CD146': ['ENSG00000076706_MCAM'],
             'IgM': ['ENSG00000211899_IGHM'],
             'CD5': [],
             'CD195': ['ENSG00000160791_CCR5'],
             'CD32': ['ENSG00000143226_FCGR2A'],
             'CD196': [],
             'CD185': ['ENSG00000160683_CXCR5'],
             'CD103': ['ENSG00000083457_ITGAE'],
             'CD69': ['ENSG00000110848_CD69'],
             'CD62L': ['ENSG00000188404_SELL'],
             'CD161': ['ENSG00000111796_KLRB1'],
             'CD152': [],
             'CD223': ['ENSG00000089692_LAG3'],
             'KLRG1': ['ENSG00000139187_KLRG1'],
             'CD27': ['ENSG00000139193_CD27'],
             'CD107a': ['ENSG00000185896_LAMP1'],
             'CD95': ['ENSG00000026103_FAS'],
             'CD134': ['ENSG00000186827_TNFRSF4'],
             'HLA-DR': ['ENSG00000204287_HLA-DRA'],
             'CD1c': ['ENSG00000158481_CD1C'],
             'CD11b': ['ENSG00000169896_ITGAM'],
             'CD64': ['ENSG00000150337_FCGR1A'],
             'CD141': ['ENSG00000178726_THBD'],
             'CD1d': ['ENSG00000158473_CD1D'],
             'CD314': [],
             'CD35': ['ENSG00000203710_CR1'],
             'CD57': [],
             'CD272': [],
             'CD278': ['ENSG00000163600_ICOS'],
             'CD58': ['ENSG00000116815_CD58'],
             'CD39': ['ENSG00000138185_ENTPD1'],
             'CX3CR1': ['ENSG00000168329_CX3CR1'],
             'CD24': ['ENSG00000272398_CD24'],
             'CD21': ['ENSG00000117322_CR2'],
             'CD11a': ['ENSG00000005844_ITGAL'],
             'CD79b': ['ENSG00000007312_CD79B'],
             'CD244': ['ENSG00000122223_CD244'],
             'CD169': [],
             'integrinB7': ['ENSG00000139626_ITGB7'],
             'CD268': ['ENSG00000159958_TNFRSF13C'],
             'CD42b': ['ENSG00000185245_GP1BA'],
             'CD54': ['ENSG00000090339_ICAM1'],
             'CD62P': ['ENSG00000174175_SELP'],
             'CD119': ['ENSG00000027697_IFNGR1'],
             'TCR': [],
             'Rat-IgG1': [],
             'Rat-IgG2a': [],
             'CD192': ['ENSG00000121807_CCR2'],
             'CD122': ['ENSG00000100385_IL2RB'],
             'FceRIa': ['ENSG00000179639_FCER1A'],
             'CD41': ['ENSG00000005961_ITGA2B'],
             'CD137': ['ENSG00000049249_TNFRSF9'],
             'CD163': ['ENSG00000177575_CD163'],
             'CD83': ['ENSG00000112149_CD83'],
             'CD124': ['ENSG00000077238_IL4R'],
             'CD13': ['ENSG00000166825_ANPEP'],
             'CD2': ['ENSG00000116824_CD2'],
             'CD226': ['ENSG00000150637_CD226'],
             'CD29': ['ENSG00000150093_ITGB1'],
             'CD303': ['ENSG00000198178_CLEC4C'],
             'CD49b': ['ENSG00000164171_ITGA2'],
             'CD81': ['ENSG00000110651_CD81'],
             'IgD': ['ENSG00000211898_IGHD'],
             'CD18': ['ENSG00000160255_ITGB2'],
             'CD28': [],
             'CD38': ['ENSG00000004468_CD38'],
             'CD127': ['ENSG00000168685_IL7R'],
             'CD45': ['ENSG00000081237_PTPRC'],
             'CD22': ['ENSG00000012124_CD22'],
             'CD71': ['ENSG00000072274_TFRC'],
             'CD26': ['ENSG00000197635_DPP4'],
             'CD115': ['ENSG00000182578_CSF1R'],
             'CD63': ['ENSG00000135404_CD63'],
             'CD304': ['ENSG00000099250_NRP1'],
             'CD36': ['ENSG00000135218_CD36'],
             'CD172a': ['ENSG00000198053_SIRPA'],
             'CD72': ['ENSG00000137101_CD72'],
             'CD158': [],
             'CD93': ['ENSG00000125810_CD93'],
             'CD49a': ['ENSG00000213949_ITGA1'],
             'CD49d': ['ENSG00000115232_ITGA4'],
             'CD73': [],
             'CD9': ['ENSG00000010278_CD9'],
             'TCRVa7.2': [],
             'TCRVd2': [],
             'LOX-1': ['ENSG00000173391_OLR1'],
             'CD158b': [],
             'CD158e1': [],
             'CD142': ['ENSG00000117525_F3'],
             'CD319': ['ENSG00000026751_SLAMF7'],
             'CD352': ['ENSG00000162739_SLAMF6'],
             'CD94': ['ENSG00000134539_KLRD1'],
             'CD162': ['ENSG00000110876_SELPLG'],
             'CD85j': ['ENSG00000104972_LILRB1'],
             'CD23': ['ENSG00000104921_FCER2'],
             'CD328': ['ENSG00000168995_SIGLEC7'],
             'HLA-E': ['ENSG00000204592_HLA-E'],
             'CD82': ['ENSG00000085117_CD82'],
             'CD101': ['ENSG00000134256_CD101'],
             'CD88': ['ENSG00000197405_C5AR1'],
             'CD224': ['ENSG00000100031_GGT1']}

In [29]:
cite_cols_important  = [[j,i[0]] for i in cite_cols_important.items() for j in i[1] ]
len(cite_cols_important)

116

In [33]:
cite_cols_important = pd.DataFrame(cite_cols_important,columns= gene_res_df_match.columns )
cite_cols_important.head(1)

Unnamed: 0,id,name
0,ENSG00000114013_CD86,CD86


In [34]:
cite_cols_important = pd.concat([gene_res_df_match,cite_cols_important],axis = 0)

In [35]:
cite_cols_important = cite_cols_important.drop_duplicates()
cite_cols_important.shape

(119, 2)

In [36]:
important_columns = cite_cols_important["id"].to_list()
important_columns[:5]

['ENSG00000166825_ANPEP',
 'ENSG00000197405_C5AR1',
 'ENSG00000121807_CCR2',
 'ENSG00000183813_CCR4',
 'ENSG00000160791_CCR5']

In [47]:
with open("./name_important_cols.pkl","wb") as f:
    pickle.dump(important_columns,f)

### corr importance

In [13]:
train = pd.read_hdf("../../../input/open-problems-raw-counts/train_cite_inputs_raw.h5")
train.shape

(70988, 22085)

In [41]:
# spead up correlation calculation
@numba.jit(nopython=True)
def calc_corr(train,target):
    corr = np.zeros((train.shape[1],target.shape[1]),dtype=np.float32)
    for tr in range(train.shape[1]):
        for ta in range(target.shape[1]):
            a = train[:,tr]
            b = target[:,ta]
            corr[tr,ta] += np.corrcoef(a,b)[0,1]
    return corr

In [42]:
%%time
corr = calc_corr(train.values,target.values)

CPU times: total: 7min 28s
Wall time: 51min 11s


In [45]:
corr = pd.DataFrame(corr,index = train.columns.to_list())
corr = corr.abs()
corr.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,130,131,132,133,134,135,136,137,138,139
ENSG00000121410_A1BG,0.002002,0.00257,0.000948,0.006793,0.020365,0.018518,0.001882,0.000184,0.002991,0.009233,...,0.004043,0.005055,0.00014,0.000159,0.000346,0.001771,0.00937,0.007576,0.010965,0.001367


In [46]:
list_to_choose = []
for i in range(140):
    list_to_choose += corr[i].sort_values(ascending=False)[:3].index.to_list()
list_to_choose = list(set(list_to_choose))
len(list_to_choose)

104

In [48]:
with open("./corr_important_cols.pkl","wb") as f:
    pickle.dump(list_to_choose,f)

### rf importance

In [4]:
train.shape,target.shape

((70988, 22085), (70988, 140))

In [7]:
train = train[none_zeros_features]
train.shape

(70988, 21601)

In [8]:
target = target.values
target -= target.mean(axis=1).reshape(-1, 1)
target /= target.std(axis=1).reshape(-1, 1)

In [9]:
meta = pd.read_csv("../input/open-problems-multimodal/metadata.csv",index_col = "cell_id")
meta = meta[meta.technology=="citeseq"]
meta_train = meta.reindex(train_indexes)
meta_train.shape

(70988, 4)

In [10]:
lbe = LabelEncoder()
train_cell_type = lbe.fit_transform(meta_train["cell_type"])
ohe = OneHotEncoder(sparse = False)
train_cell_type = ohe.fit_transform(train_cell_type.reshape(-1, 1))
train_cell_type.shape

(70988, 7)

In [11]:
train_cell_type_col = [f"cell_type_{i}" for i in range(7)]
train_cell_type = pd.DataFrame(train_cell_type,columns=train_cell_type_col,index=train.index)
train_cell_type.shape

(70988, 7)

In [12]:
train = pd.concat([train,train_cell_type],axis = 1)
train.shape

(70988, 21608)

In [13]:
rtr = RandomForestRegressor(n_jobs = -1,random_state =42,verbose=1)
rtr.fit(train,target)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 112.2min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 624.2min finished


RandomForestRegressor(n_jobs=-1, random_state=42, verbose=1)

In [14]:
fi = pd.DataFrame([rtr.feature_importances_],columns=train.columns).T
rf_important_features = fi.drop(train_cell_type_col)[0].sort_values(ascending=False)[:128].index.to_list()

In [15]:
with open("./rf_important_cols.pkl","wb") as f:
    pickle.dump(rf_important_features,f)

In [19]:
del rtr
gc.collect()

7

### get features

In [13]:
train.head(1)

gene_id,ENSG00000121410_A1BG,ENSG00000268895_A1BG-AS1,ENSG00000175899_A2M,ENSG00000245105_A2M-AS1,ENSG00000166535_A2ML1,ENSG00000128274_A4GALT,ENSG00000094914_AAAS,ENSG00000081760_AACS,ENSG00000109576_AADAT,ENSG00000103591_AAGAB,...,ENSG00000153975_ZUP1,ENSG00000086827_ZW10,ENSG00000174442_ZWILCH,ENSG00000122952_ZWINT,ENSG00000198205_ZXDA,ENSG00000198455_ZXDB,ENSG00000070476_ZXDC,ENSG00000162378_ZYG11B,ENSG00000159840_ZYX,ENSG00000074755_ZZEF1
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45006fe3e4c8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [16]:
with open("./name_important_cols.pkl","rb") as f:
    name_important_cols = pickle.load(f)
    name_important_cols.remove('ENSG00000158481_CD1C') # those two are all zero columns
    name_important_cols.remove('ENSG00000173391_OLR1')

with open("./corr_important_cols.pkl","rb") as f:
    corr_important_cols = pickle.load(f)

with open("./rf_important_cols.pkl","rb") as f:
    rf_important_cols = pickle.load(f)

len(name_important_cols),len(corr_important_cols),len(rf_important_cols)

(117, 104, 128)

In [17]:
important_cols = name_important_cols + corr_important_cols + rf_important_cols
len(important_cols)

349

In [18]:
train_imp = train[important_cols]
train_imp.head(1)

gene_id,ENSG00000166825_ANPEP,ENSG00000197405_C5AR1,ENSG00000121807_CCR2,ENSG00000183813_CCR4,ENSG00000160791_CCR5,ENSG00000134256_CD101,ENSG00000170458_CD14,ENSG00000177575_CD163,ENSG00000177455_CD19,ENSG00000116824_CD2,...,ENSG00000148303_RPL7A,ENSG00000143774_GUK1,ENSG00000125691_RPL23,ENSG00000134198_TSPAN2,ENSG00000197062_ZSCAN26,ENSG00000073008_PVR,ENSG00000166260_COX11,ENSG00000132763_MMACHC,ENSG00000231164_RPL7P56,ENSG00000197061_HIST1H4C
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45006fe3e4c8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,111.0,1.0,21.0,0.0,0.0,0.0,1.0,0.0,0.0,11.0


In [20]:
test = pd.read_hdf("../../../input/open-problems-raw-counts/test_cite_inputs_raw.h5")
test_indexes = test.index
test.shape

(48203, 22085)

In [21]:
test_imp = test[important_cols]
test.head(1)

gene_id,ENSG00000121410_A1BG,ENSG00000268895_A1BG-AS1,ENSG00000175899_A2M,ENSG00000245105_A2M-AS1,ENSG00000166535_A2ML1,ENSG00000128274_A4GALT,ENSG00000094914_AAAS,ENSG00000081760_AACS,ENSG00000109576_AADAT,ENSG00000103591_AAGAB,...,ENSG00000153975_ZUP1,ENSG00000086827_ZW10,ENSG00000174442_ZWILCH,ENSG00000122952_ZWINT,ENSG00000198205_ZXDA,ENSG00000198455_ZXDB,ENSG00000070476_ZXDC,ENSG00000162378_ZYG11B,ENSG00000159840_ZYX,ENSG00000074755_ZZEF1
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
83d6659a6a32,0.0,0.0,0.0,1.0,0.0,0.0,3.0,2.0,1.0,0.0,...,0.0,2.0,4.0,7.0,0.0,0.0,0.0,0.0,3.0,0.0


In [23]:
test_imp = pd.DataFrame(test_imp,index = test_indexes)
test_imp = test_imp.drop_duplicates()
test_imp = test_imp.reindex(test_indexes_ori)
test_imp = test_imp.fillna(0)
test_imp.shape

(48663, 349)

In [24]:
np.savez("cite_train_imp.npz", train_imp)
np.savez("cite_test_imp.npz",test_imp)

## ALL

In [2]:
import numpy as np

In [28]:
train_tsvd = np.load("cite_train_tsvd.npz")["arr_0"]
train_umap = np.load("cite_train_umap.npz")["arr_0"]
train_novel = np.load("cite_train_novel.npz")["arr_0"]
train_imp = np.load("cite_train_imp.npz")["arr_0"]

train_all  = np.concatenate([train_tsvd, train_umap, train_novel, train_imp],axis = 1)

In [29]:
test_tsvd = np.load("cite_test_tsvd.npz")["arr_0"]
test_umap = np.load("cite_test_umap.npz")["arr_0"]
test_novel = np.load("cite_test_novel.npz")["arr_0"]
test_imp = np.load("cite_test_imp.npz")["arr_0"]

test_all  = np.concatenate([test_tsvd, test_umap, test_novel, test_imp],axis = 1)

In [30]:
np.savez("cite_train_all.npz", train_all)
np.savez("cite_test_all.npz",test_all)
train_all.shape,test_all.shape

((70988, 733), (48663, 733))