In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.linear_model
from sklearn.linear_model import LinearRegression
import joblib
import sys

def train_model(tile_num, output_model_fn=None):
    #create datapaths 
    Data_path = "/u/home/m/mardren/scratch/SequenceML"
    gkm_path = "%s/features.gkSVM.HepG2.tsv.gz"%Data_path
    conv_path       = "%s/features.dragoNN_ConvModel.HepG2.SV40P.Rep1.tsv.gz"%Data_path
    deepfact_path       = "%s/features.dragoNN_DeepFactorizedModel.HepG2.minP.Rep1.tsv.gz"%Data_path
    kmer_path = "%s/6mer_prediction.pkl"%Data_path
    sharpr_path = "%s/6mer_label.pkl"%Data_path


    #open data from csv into dataframes
    gkm = pd.read_csv(gkm_path, header = 0, index_col = 0, sep = '\t')
    conv  = pd.read_csv(conv_path, header = 0, index_col = 0, sep = '\t')
    deepfact = pd.read_csv(deepfact_path, header = 0, index_col = 0, sep = '\t')
    kmer = pd.read_pickle('%s'%(kmer_path))
    sharpr = pd.read_pickle('%s'%(sharpr_path))
    gkm_15 = gkm[['feat_gksvm_%s'%tile_num]]
    gkm_15 = gkm_15.rename(columns = {'feat_gksvm_%s'%tile_num:'gkm_%s'%tile_num})
    conv_15 = conv[[tile_num]]
    conv_15 = conv_15.rename(columns = {tile_num:'conv_%s'%tile_num})
    deepfact_15 = deepfact[[tile_num]]
    deepfact_15 = deepfact_15.rename(columns = {tile_num:'deepfact_%s'%tile_num})
    tile_num = int(tile_num)
    kmer_15 = kmer[[tile_num]]
    kmer_15 = kmer_15.rename(columns = {tile_num:'kmer_%s'%tile_num})
    sharpr_15 = sharpr[[tile_num]]
    sharpr_15 = sharpr_15.rename(columns = {tile_num:'sharpr_%s'%tile_num})
    
    #create data dataframe
    data = pd.concat([conv_15, gkm_15, deepfact_15, kmer_15, sharpr_15], axis=1)
    
    #reset index to be able to access region_id
    data = data.reset_index()
    
    #split region_id to create chrom column
    data['chrom'] = data['region_id'].str.split('_').str[3]

    #setting aside chr1 and chr2 for testing data
    testing_data_chr1 = data[data["chrom"] == 'chr1']
    testing_data_chr2 = data[data["chrom"] == 'chr2']
    test_set = pd.concat([testing_data_chr1, testing_data_chr2])
    #removing chr1 and chr2 from training data
    training_data_nochr1 = data[data["chrom"] != 'chr1']
    train_set = training_data_nochr1[training_data_nochr1["chrom"] != 'chr2']
    #dropping the chrom column
    test_set = test_set.drop(columns=['chrom'])
    train_set = train_set.drop(columns=['chrom'])
    data = data.drop(columns=['chrom'])

    #format the training data
    labels = train_set['prediction'].values
    train_set = train_set.drop(columns=['region_id','prediction'])
    encodings = train_set.values.tolist()
    encodings = np.asarray(encodings)
    encodings_df = pd.DataFrame(encodings)
    encodings_df.fillna(encodings_df.mean(), inplace=True)
    encodings = encodings_df.to_numpy()
    labels = np.reshape(labels,(12888,))


    #format the testing data
    test_labels = test_set['prediction'].values
    test_set = test_set.drop(columns=['region_id','prediction'])
    test_encodings = test_set.values.tolist()
    test_encodings = np.asarray(test_encodings)
    test_encodings_df = pd.DataFrame(test_encodings)
    test_encodings_df.fillna(test_encodings_df.mean(), inplace=True)
    test_encodings = test_encodings_df.to_numpy()
    test_labels = np.reshape(test_labels,(2832,))
    
    #train a linear regressor on the training dataset. 
    from sklearn.linear_model import LinearRegression

    sequence_regressor = LinearRegression()
    sequence_regressor.fit(encodings, labels)
    
    if output_model_fn is None:
        output_model_fn = f"model.tile_{tile_num}.model"  
    joblib.dump(sequence_regressor, output_model_fn)
    
if __name__ == "__main__":
    # take tile_num in as a parameter
    #tile_num = sys.argv[1]
    tile_num = '15'
    train_model(tile_num)


KeyError: 'prediction'