In [None]:
import re
from itertools import product
from collections import defaultdict
from typing import List, Dict, Tuple, Union, Any

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Display configuration: passing None lifts pandas' limits on how many columns
# are shown and on how wide a cell's text may be before it is truncated.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

In [None]:
def load_splice_data(filepath: str) -> pd.DataFrame:
    """Read a splice CSV file and clean its text columns.

    Args:
        filepath: Location of the CSV file; it must provide the
            ``instance_name`` and ``sequence`` columns.

    Returns:
        The loaded frame with surrounding whitespace stripped from
        ``instance_name`` and ``sequence``, and with every sequence character
        other than ``A``, ``T``, ``C``, ``G`` or ``N`` replaced by ``N``.
        The check is case-sensitive, so lowercase letters also become ``N``.
    """
    valid_bases = frozenset("ATCGN")

    def _normalise(raw_sequence: str) -> str:
        # Trim first so padding whitespace is removed rather than turned into "N".
        trimmed = raw_sequence.strip()
        return "".join(
            base if base in valid_bases else "N" for base in trimmed
        )

    frame = pd.read_csv(filepath)
    frame["instance_name"] = frame["instance_name"].str.strip()
    frame["sequence"] = frame["sequence"].apply(_normalise)
    return frame


def kmer_dataset(data: pd.DataFrame, kmer_size: int) -> pd.DataFrame:
    """Turn each sequence into a vector of non-overlapping k-mer counts.

    Every sequence is cut into consecutive windows of ``kmer_size`` characters
    starting at offset 0 (window starts are ``0, k, 2k, ...``). Only full
    windows are counted; a trailing remainder shorter than ``kmer_size`` is
    ignored.

    Args:
        data: Frame with a ``sequence`` column (strings over the alphabet
            ``ACTGN``) and a ``label`` column.
        kmer_size: Length of each k-mer; must be at least 1.

    Returns:
        A frame with one row per input row (in input order), one count column
        for each of the ``5 ** kmer_size`` possible k-mers in sorted order,
        and a final ``label`` column copied from ``data``.

    Raises:
        ValueError: If ``kmer_size`` is smaller than 1.
        KeyError: If a window contains a character outside ``ACTGN``.
    """
    if kmer_size < 1:
        raise ValueError("kmer_size must be a positive integer")

    unique_kmers = sorted(
        "".join(letters) for letters in product("ACTGN", repeat=kmer_size)
    )
    results = {kmer: [] for kmer in unique_kmers}
    results["label"] = []

    # Only these two columns are needed, so iterate over them directly instead
    # of materialising a Series per row with iterrows().
    for seq, label in zip(data["sequence"], data["label"]):
        results["label"].append(label)
        counts = dict.fromkeys(unique_kmers, 0)
        # range() excludes its stop value, so the "+ 1" is required to include
        # the last full window (the one that ends exactly at len(seq)).
        for start in range(0, len(seq) - kmer_size + 1, kmer_size):
            counts[seq[start: start + kmer_size]] += 1
        for kmer, count in counts.items():
            results[kmer].append(count)

    return pd.DataFrame(data=results)

In [None]:
# Read the splice records from disk, then derive 4-mer count features from them.
splice_datapath = "../data/splice.csv"
splice_df = load_splice_data(filepath=splice_datapath)
splice_kmers_df = kmer_dataset(data=splice_df, kmer_size=4)


In [None]:
from sklearn.model_selection import train_test_split

# Separate the class label from the remaining (feature) columns.
y = splice_kmers_df["label"]
X = splice_kmers_df.drop(columns="label")
print(X.shape, y.shape, sep="\n")

# Hold out 15% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=17,
)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

In [None]:
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Fit a multinomial naive Bayes classifier on the training features.
clf = MultinomialNB(force_alpha=True)
print("training ...")
clf.fit(X_train, y_train)

# Predict labels for the held-out rows.
print("predicting...")
y_pred = clf.predict(X_test)

# Score the predictions: plain accuracy, plus precision, recall and F1
# computed with macro averaging over the classes.
print("scoring...")
A = accuracy_score(y_test, y_pred)
P, R, F = (
    scorer(y_test, y_pred, average="macro")
    for scorer in (precision_score, recall_score, f1_score)
)

# Report each metric as a percentage with two decimals, one per line.
print(
    f"Accuracy: {A*100:.2f}%",
    f"Precision: {P*100:.2f}%",
    f"Recall: {R*100:.2f}%",
    f"F1: {F*100:.2f}%",
    sep="\n",
)