In [None]:
!pip install -q tables

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


import os
# Print the full path of every file found under the Kaggle input mount.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

print("Done")

# Constants

In [None]:
# Root folder of the competition data.
INPUT_DIR = "../input/open-problems-multimodal"

# CSV files. Note that the *_DIR names hold file paths, not directories.
METADATA_DIR = os.path.join(INPUT_DIR, "metadata.csv")
EVALUATION_DIR = os.path.join(INPUT_DIR, "evaluation_ids.csv")
SUBMISSION_DIR = os.path.join(INPUT_DIR, "sample_submission.csv")
# Second names for the same two files; both spellings are kept because
# cells of this notebook refer to them.
EVALUATION_IDS = EVALUATION_DIR
SUBMISSION_PATH = SUBMISSION_DIR

# HDF5 files.
CITE_TRAIN_INPUTS = os.path.join(INPUT_DIR, "train_cite_inputs.h5")
CITE_TRAIN_TARGETS = os.path.join(INPUT_DIR, "train_cite_targets.h5")
CITE_TEST_INPUTS = os.path.join(INPUT_DIR, "test_cite_inputs.h5")
MULTIOME_TRAIN_INPUTS = os.path.join(INPUT_DIR, "train_multi_inputs.h5")
MULTIOME_TRAIN_TARGETS = os.path.join(INPUT_DIR, "train_multi_targets.h5")
MULTIOME_TEST_INPUTS = os.path.join(INPUT_DIR, "test_multi_inputs.h5")

# Row window handed to pd.read_hdf when loading the multiome targets.
START = 10_000
STOP = START + 10_000

# Column names used throughout the notebook.
ROW_ID, TARGET = "row_id", "target"
GENE_ID, GENE_ID_INT = "gene_id", "gene_id_int"

print("Done")

# Functions 

In [None]:
def data_description(df):
    """Print a short structural summary of a DataFrame.

    Writes the row and column counts, followed by a table with one row per
    column giving its name, dtype, number of distinct non-null values and
    number of missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. Column labels may repeat.

    Returns
    -------
    None
        The summary is only printed to stdout.
    """
    print("Data description")
    print(f"Total number of records {df.shape[0]}")
    print(f'number of features {df.shape[1]}\n\n')

    # df.dtypes has one entry per column position, so it stays aligned with
    # nunique() and isna().sum() even when labels repeat. Looking the dtype up
    # through df[col] fails in that case because df[col] is then a DataFrame.
    names = list(zip(df.columns, df.dtypes, df.nunique(), df.isna().sum()))
    variable_desc = pd.DataFrame(names, columns=["Name","Type","Unique levels","Missing"])
    print(variable_desc)

In [None]:
def transform_df(df, column, int_col, drop_col):
    """Turn a labelled Series into a two-column lookup frame.

    The values of ``df`` become the column ``column``; the index is moved into
    a column via ``reset_index()`` and must end up being called ``drop_col``.
    Each label in ``drop_col`` is converted to an integer by removing "-" and
    ".", keeping the last eight characters and parsing them as a base-34
    number. The label column is then dropped.

    Parameters
    ----------
    df : pandas.Series
        Values indexed by string labels.
    column : str
        Name of the value column in the result.
    int_col : str
        Name of the integer key column to create.
    drop_col : str
        Name of the label column produced by ``reset_index()``.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``column`` and ``int_col``.
    """
    def _encode(label):
        compact = label.replace("-","").replace(".","")
        return int(compact[-8:], 34)

    frame = pd.DataFrame(df, columns=[column]).reset_index()
    frame[int_col] = frame[drop_col].apply(_encode).astype(int)
    return frame.drop(columns=[drop_col])

# Data preparation

In [None]:
# Cell metadata (CSV) and the full CITE target table (HDF5).
metadata = pd.read_csv(METADATA_DIR)
train_cite_targ = pd.read_hdf(CITE_TRAIN_TARGETS)

# Only the rows between START and STOP of the multiome targets are read.
train_multi_targ = pd.read_hdf(MULTIOME_TRAIN_TARGETS, start=START, stop=STOP)
print("Done")

In [None]:
# Preview the first rows of the metadata table.
metadata.head()

In [None]:
# Preview the first rows of the CITE target table.
train_cite_targ.head()

In [None]:
# Print dtype, unique-value count and missing-value count for every CITE target column.
data_description(train_cite_targ)
print("Done")

In [None]:
# Preview the first rows of the loaded multiome target slice.
train_multi_targ.head()

In [None]:
# Print dtype, unique-value count and missing-value count for every multiome target column.
data_description(train_multi_targ)

In [None]:
# Column-wise mean over the loaded multiome target rows (only the START..STOP
# slice read above, not the whole file).
multi_gene_id_mean = train_multi_targ.mean()
multi_gene_id_mean


In [None]:
# Column-wise mean over all CITE target rows.
cite_gene_id_mean = train_cite_targ.mean()
cite_gene_id_mean

In [None]:
# Index of the CITE means, i.e. the column labels of the CITE target table.
cite_gene_id_mean.index

In [None]:
# Index of the multiome means, i.e. the column labels of the multiome target table.
multi_gene_id_mean.index

In [None]:
# All target labels in one list: the CITE labels first, then the multiome labels.
gene_id_mean = [*cite_gene_id_mean.index, *multi_gene_id_mean.index]
gene_id = pd.DataFrame(gene_id_mean, columns=[GENE_ID])
gene_id.head()

In [None]:
# Integer key per label: strip "-" and ".", keep the last eight characters
# and parse them as a base-34 number.
gene_id[GENE_ID_INT] = (
    gene_id[GENE_ID]
    .map(lambda label: int(label.replace("-", "").replace(".","")[-8:], 34))
    .astype(int)
)
gene_id.head()

In [None]:
# Summarise the label table; the "Unique levels" row for the integer key shows
# how many distinct keys the labels map to.
data_description(gene_id)

# Submission

In [None]:
# Only the row_id column of the sample submission is read; the target column
# is attached by the merges further down.
submission = pd.read_csv(SUBMISSION_PATH, usecols = [ROW_ID])
data_description(submission)

In [None]:
# Load the row_id -> gene_id mapping, then swap the string gene_id for its
# integer key (same encoding as above: strip "-" and ".", last eight
# characters, base 34).
evaluation = pd.read_csv(EVALUATION_IDS, usecols=[ROW_ID, GENE_ID])
evaluation[GENE_ID_INT] = (
    evaluation[GENE_ID]
    .map(lambda label: int(label.replace('-', '').replace('.', '')[-8:], 34))
    .astype(int)
)
evaluation = evaluation.drop(columns=[GENE_ID])
data_description(evaluation)

In [None]:
# Preview the row_id / integer gene key pairs.
evaluation.head()

In [None]:
# Left join on row_id so every submission row gets its integer gene key.
submission = pd.merge(submission, evaluation, how="left", on=ROW_ID)
data_description(submission)

In [None]:
# Build the CITE lookup table (target mean + integer gene key) straight from
# the raw targets instead of from `cite_gene_id_mean` itself. Feeding the
# variable back into transform_df made the cell fail on a second run, because
# the already-transformed frame no longer carries the gene_id labels.
cite_gene_id_mean = transform_df(train_cite_targ.mean(), TARGET, GENE_ID_INT, GENE_ID)
cite_gene_id_mean.head()

In [None]:
# Build the multiome lookup table (target mean + integer gene key) straight
# from the raw targets instead of from `multi_gene_id_mean` itself. Feeding the
# variable back into transform_df made the cell fail on a second run, because
# the already-transformed frame no longer carries the gene_id labels.
multi_gene_id_mean = transform_df(train_multi_targ.mean(), TARGET, GENE_ID_INT, GENE_ID)
multi_gene_id_mean.head()

In [None]:
# Stack the CITE and multiome lookup tables into one table of
# (target, gene_id_int) rows; the original row indices are kept as-is.
completed_gene_id_mean = pd.concat([cite_gene_id_mean, multi_gene_id_mean])
data_description(completed_gene_id_mean)

In [None]:
# Left join on the integer gene key: every submission row receives the mean
# target of its gene.
submission = pd.merge(submission, completed_gene_id_mean, how="left", on=GENE_ID_INT)
data_description(submission)

In [None]:
# Write the submission file from the merged `submission` frame. The name
# `df_sample_submission` used here before is never defined in this notebook,
# so the cell raised NameError on a fresh kernel.
submission[[ROW_ID, TARGET]].to_csv('submission.csv', index=False)