Unzip and parse json.gz file

In [None]:
import gzip
import pandas as pd
import numpy as np
import json

# Relative path to the gzip-compressed dataset (one JSON object per line);
# it is resolved against the notebook's current working directory.
file_path = "dataset0.json.gz"

def load_json_gz_to_dataframe(file_path):
    """Load a gzip-compressed JSON Lines file into a long-format DataFrame.

    Every non-blank line must hold one JSON object nested as
    ``{transcript: {position: {sequence: reads}}}``. Each innermost
    ``sequence -> reads`` pair becomes one output row.

    Parameters
    ----------
    file_path : str or path-like
        Location of the ``.json.gz`` file (UTF-8 text once decompressed).

    Returns
    -------
    pandas.DataFrame
        Columns ``transcript``, ``position`` (converted with ``int``),
        ``sequence`` and ``data`` (the value stored under the sequence key,
        kept exactly as parsed). The four columns are always present, even
        when the file contains no records.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    ValueError
        If a position key cannot be converted to ``int``.
    """
    columns = ['transcript', 'position', 'sequence', 'data']
    records = []
    # 'rt' decompresses and decodes on the fly, so lines arrive as str.
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        # The file holds several JSON objects, one per line, so it cannot be
        # parsed with a single json.load call.
        for line in f:
            # Blank or whitespace-only lines (e.g. a trailing newline) carry no
            # record; json.loads would raise on them, so skip them.
            if not line.strip():
                continue
            json_data = json.loads(line)
            # transcript -> {position -> {sequence -> reads}}
            for transcript, positions in json_data.items():
                for position, reads in positions.items():
                    for seq, num in reads.items():
                        records.append({
                            'transcript': transcript,
                            # JSON object keys are strings; store the position as an int.
                            'position': int(position),
                            'sequence': seq,
                            'data': num,
                        })
    # Passing columns explicitly keeps the schema stable for empty input, so
    # downstream joins on 'transcript'/'position' do not fail with a KeyError.
    return pd.DataFrame(records, columns=columns)

# Parse the compressed dataset into one row per (transcript, position, sequence).
dataset0 = load_json_gz_to_dataframe(file_path)
# Preview the first rows to sanity-check the parsed columns.
print(dataset0.head())

Import .labelled file and join with data

In [None]:
# Label table; its transcript_id / transcript_position columns are the join keys.
data_info = pd.read_csv('data.info.labelled')

# Left join keeps every parsed row even when no label row matches. The label
# table's own key columns are dropped afterwards because the join already
# matched them against 'transcript' and 'position'.
full_data = (
    dataset0
    .merge(
        data_info,
        how='left',
        left_on=['transcript', 'position'],
        right_on=['transcript_id', 'transcript_position'],
    )
    .drop(columns=['transcript_id', 'transcript_position'])
)
print(full_data.head())

Selected data for initial modelling

In [None]:
# Average each row's list of lists along axis 0, giving one vector per row.
mean_data = full_data['data'].apply(lambda reads: np.mean(reads, axis=0))

# Build a new frame (full_data keeps the raw values) whose 'data' column holds
# only elements 3..5 of each averaged vector. Per the original note these are
# the values for the position itself -- TODO confirm the feature layout.
selected_data = full_data.assign(
    data=mean_data.apply(lambda averaged: np.array(averaged)[3:6])
)
print(selected_data.head())

Export full and selected data to CSV

In [20]:
# Optional export, disabled by default: uncomment to write both frames to CSV.
# full_data.to_csv('full_data.csv', index=False)
# selected_data.to_csv('selected_data.csv', index=False)

Logistic Regression (Imbalanced Data)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Expand each row's feature vector into separate columns so the estimator
# receives a 2-D matrix; the original index is kept so X stays aligned with y.
X = pd.DataFrame(selected_data['data'].tolist(), index=selected_data.index)
y = selected_data['label']

# 70/30 train-test split. The labels are imbalanced (see value_counts below),
# so stratify on y to keep the class ratio the same in both partitions; a
# purely random split can shift the minority-class share between train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# fitting logistic regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# predict with test set
# NOTE: on imbalanced labels plain accuracy is dominated by the majority class,
# so a high value here does not show that the minority class is detected.
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# class counts, shown as the cell output to make the imbalance visible
selected_data['label'].value_counts()

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE so the training labels are balanced (the original note
# describes this as simulating extra label-1 rows). Only the training split is
# resampled; X_test / y_test are left untouched.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Refit a fresh model on the resampled training data (fit returns the estimator).
log_reg = LogisticRegression().fit(X_resampled, y_resampled)

# Score on the same held-out split as before. Author's observation: accuracy
# drops sharply here, i.e. the earlier high accuracy came from the imbalanced data.
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')