#**DOWNLOADING THE DATA**
Here we are installing kaggle, creating a temp location for our kaggle.json api key to sit in, and then getting the download command from the kaggle competition site in order to retrieve the competition data.

In [None]:
pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
! mkdir ~/.kaggle

In [None]:
! cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json

In [None]:
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
! kaggle competitions download -c open-problems-multimodal

Downloading open-problems-multimodal.zip to /content
100% 18.1G/18.1G [02:19<00:00, 164MB/s]
100% 18.1G/18.1G [02:19<00:00, 139MB/s]


In [None]:
! unzip open-problems-multimodal.zip

Archive:  open-problems-multimodal.zip
  inflating: evaluation_ids.csv      
  inflating: metadata.csv            
  inflating: metadata_cite_day_2_donor_27678.csv  
  inflating: sample_submission.csv   
  inflating: test_cite_inputs.h5     
  inflating: test_cite_inputs_day_2_donor_27678.h5  
  inflating: test_multi_inputs.h5    
  inflating: train_cite_inputs.h5    
  inflating: train_cite_targets.h5   
  inflating: train_multi_inputs.h5   
  inflating: train_multi_targets.h5  


#**LOAD TRAINING DATA**

In [1]:
import os, gc, pickle
import numpy as np
import pandas as pd
import scipy
import scipy.sparse
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.decomposition import PCA

In [2]:
# init training paths
train_multi_inputs = 'train_multi_inputs.h5'
train_multi_targets = 'train_multi_targets.h5'

In [3]:
import matplotlib as plt
from matplotlib.ticker import MaxNLocator
from sklearn.base import BaseEstimator, TransformerMixin
class trainTransformer(BaseEstimator, TransformerMixin):
    # Set the range of columns to use
    columns_to_use = slice(10000, 14000)

    # Define the transform method
    def transform(self, X):
        # Select the columns to use
        X = X[:, trainTransformer.columns_to_use]

        # Apply the PCA transformation
        X = self.pca.transform(X)

        return X

    # Define the fit_transform method
    def fit_transform(self, X):
        # Select the columns to use
        X = X[:, trainTransformer.columns_to_use]

        # Fit the PCA transformation and plot the explained variance
        self.pca = PCA(n_components=4, copy=False, random_state=1)
        X = self.pca.fit_transform(X)

        # free up memory
        gc.collect()

        return X


In [4]:
# Use the transformer class to reconstruct the data with PCA reduction
# This allows us to use the important details of the table without taking up as much space
# load train multi inputs file into a pd.dataframe
multi_train_x = pd.read_hdf(train_multi_inputs,start=0,stop=6000)
transformer = trainTransformer()
multi_train_x = transformer.fit_transform(multi_train_x.values)

In [5]:
# stop the read at 6000 as it will take to much RAM to read the whole thing which will cause the program to crash
multi_train_y = pd.read_hdf(train_multi_targets,start=0,stop=6000)
y_cols = multi_train_y.columns
multi_train_y = multi_train_y.values
print(multi_train_y.shape)
print(y_cols)

(6000, 23418)
Index(['ENSG00000121410', 'ENSG00000268895', 'ENSG00000175899',
       'ENSG00000245105', 'ENSG00000166535', 'ENSG00000256661',
       'ENSG00000184389', 'ENSG00000128274', 'ENSG00000094914',
       'ENSG00000081760',
       ...
       'ENSG00000086827', 'ENSG00000174442', 'ENSG00000122952',
       'ENSG00000198205', 'ENSG00000198455', 'ENSG00000070476',
       'ENSG00000203995', 'ENSG00000162378', 'ENSG00000159840',
       'ENSG00000074755'],
      dtype='object', name='gene_id', length=23418)


In [6]:
print(multi_train_x)

[[-5.2843988e-01  2.2742115e-03  2.9596677e+00  5.0110835e-01]
 [ 6.1474204e-01 -1.0237789e+00  4.5415360e-01 -8.8921183e-01]
 [ 1.7157254e-01 -1.1258169e+00  1.9299679e+00 -6.9009703e-01]
 ...
 [ 4.8910224e-01 -4.4246697e-01 -1.3574443e+00  2.0356762e+00]
 [-4.2812544e-01 -5.3548181e-01 -8.7892823e-03 -9.9996740e-01]
 [ 5.3994048e-01  1.4954154e+00 -9.9652505e-01  5.5327171e-01]]


#Create Lasso Regression Model
Fit training data into lasso model and then use gc.collect() to clear out any memory that is unnecessarily taking up space. We have to keep the amount of RAM we are using in mind the whole time during this project.

Let's test out a dummyregression or ridge regression model in the future if the plan with the sparse matrix doesn't work out.

In [7]:
# get rid of unnecessary items in memory, create lasso model and then delete the training data variables to help with RAM
gc.collect()
model = Lasso(copy_X=False)
model.fit(multi_train_x, multi_train_y)
del multi_train_x, multi_train_y
_ = gc.collect()

  model = cd_fast.enet_coordinate_descent(


In [8]:
eval_ids = pd.read_csv("evaluation_ids.csv", index_col = 'row_id')
eval_ids.cell_id = eval_ids.cell_id.astype(pd.CategoricalDtype())
eval_ids.gene_id = eval_ids.gene_id.astype(pd.CategoricalDtype())
display(eval_ids)

cell_id_set = set(eval_ids.cell_id)

y_cols = pd.CategoricalIndex(y_cols, dtype = eval_ids.gene_id.dtype, name = 'gene_id')

Unnamed: 0_level_0,cell_id,gene_id
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,c2150f55becb,CD86
1,c2150f55becb,CD274
2,c2150f55becb,CD270
3,c2150f55becb,CD155
4,c2150f55becb,CD112
...,...,...
65744175,2c53aa67933d,ENSG00000134419
65744176,2c53aa67933d,ENSG00000186862
65744177,2c53aa67933d,ENSG00000170959
65744178,2c53aa67933d,ENSG00000107874


#***BEGIN CREATING SUBMISSION***
This submission file is only half complete as it needs to be paired with the data from the CITEseq. For the moment, the multiome data is all that is in this submission.

In [9]:
submission = pd.Series(name = 'target', index = pd.MultiIndex.from_frame(eval_ids), dtype = np.float32)
print(submission)

cell_id       gene_id        
c2150f55becb  CD86              NaN
              CD274             NaN
              CD270             NaN
              CD155             NaN
              CD112             NaN
                                 ..
2c53aa67933d  ENSG00000134419   NaN
              ENSG00000186862   NaN
              ENSG00000170959   NaN
              ENSG00000107874   NaN
              ENSG00000166012   NaN
Name: target, Length: 65744180, dtype: float32


#***COMPUTE PREDICTIONS***

In [10]:
# potentially use this instead
import pandas as pd

def test_data_generator(chunksize):
    start = 0
    while True:
        # Read the next chunk of data and select the needed rows
        multi_test_x = pd.read_hdf('test_multi_inputs.h5', start=start, stop=start+chunksize)
        needed_row_mask = multi_test_x.index.isin(cell_id_set)
        multi_test_x = multi_test_x.loc[needed_row_mask]

        # Keep the index for later
        multi_test_index = multi_test_x.index

        # Transform the data and yield the predictions
        multi_test_x = multi_test_x.values
        multi_test_x = transformer.transform(multi_test_x)
        test_pred = model.predict(multi_test_x)

        # Print the index and predictions for troubleshooting
        print("multi_test_index:", multi_test_index)
        print("test_pred:", test_pred)

        yield (multi_test_index, test_pred)

        # Check if there are more chunks to process
        if len(multi_test_x) < chunksize: break
        start += chunksize

# Process the test data in chunks using a generator function
total_rows = 0
for multi_test_index, test_pred in test_data_generator(chunksize=5000):
    # Convert the predictions to a dataframe
    multi_test_x = pd.DataFrame(test_pred, index=pd.CategoricalIndex(multi_test_index, dtype=eval_ids.cell_id.dtype, name='cell_id'), columns=y_cols)

    # Fill the predictions into the submission series row by row
    for i, (index, row) in enumerate(multi_test_x.iterrows()):
        row = row.reindex(eval_ids.gene_id[eval_ids.cell_id == index])
        submission.loc[index] = row.values
      
    print("na: ",submission.isna().sum())
    print('Length of submission series:', len(submission))
    total_rows += len(multi_test_x)
    print(total_rows)

# Delete some variables to free up memory
del multi_test_x, multi_test_index



multi_test_index: Index(['632ae0df4dcd', '6e3df813cfcf', 'f761aff20d94', '4ad5ef1d14ed',
       '81bdc45c1480', 'e8652be37372', 'f16910c43d66', '5ca7a64be856',
       '725c7b3b6045', 'f6a90f4dcf1c',
       ...
       'ded9a2888c08', '698c3475dd57', 'edb9d9e1e9ea', 'ce544b085151',
       'd7ca46354f40', 'ce8eb202adfd', 'afcb77dfee2f', '0344dd004d4d',
       'b72fd76caa96', 'fc0b4f00c001'],
      dtype='object', name='cell_id', length=1533)
test_pred: [[0.7462242  0.27992386 0.16733848 ... 0.945643   1.3317629  1.726759  ]
 [0.7462242  0.27992386 0.16733848 ... 0.945643   1.3317629  1.726759  ]
 [0.7462242  0.27992386 0.16733848 ... 0.945643   1.3317629  1.726759  ]
 ...
 [0.7462242  0.27992386 0.16733848 ... 0.945643   1.3317629  1.726759  ]
 [0.7462242  0.27992386 0.16733848 ... 0.945643   1.3317629  1.726759  ]
 [0.7462242  0.27992386 0.16733848 ... 0.945643   1.3317629  1.726759  ]]
na:  60360284
Length of submission series: 65744180
1533


In [15]:
submission.reset_index(drop=True, inplace=True)
submission.index.name = 'row_id'

#Turn the file into a pickle file
It makes it easier to transport into the CITEseq project

In [13]:
with open("partial_submission_multi.pickle", 'wb') as f: pickle.dump(submission, f)