In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt

In [None]:
# Load the data-set guide table from a CSV file in the working directory.
# Later cells read its "experiment", "dataSet" and "growthTime_hr" columns.
dsets = pd.read_csv("./dset_guide.csv")

In [None]:
# Restrict the guide to the glycerol time-course experiment, then keep just
# the sample id and growth-time columns under a fresh 0..n-1 index.
gly_dsets = dsets.loc[dsets["experiment"] == "glycerol_time_course"]
dset_lookup = gly_dsets.loc[:, ["dataSet", "growthTime_hr"]].reset_index(drop=True)

In [None]:
# Display the guide rows belonging to the glycerol time-course experiment.
gly_dsets

In [None]:
# Display the sample ids ("dataSet" column) of the lookup table as a plain list.
dset_lookup["dataSet"].tolist()

In [None]:
# Map each sample id to its growth time.
growth_time_by_dset = dset_lookup.set_index("dataSet")["growthTime_hr"]
dset_dict = growth_time_by_dset.to_dict()
# Extra entry for the label of the unnamed first CSV column, so that a lookup
# over every column label of the expression table succeeds.
dset_dict.update({"Unnamed: 0": 0.0})

In [None]:
# Display the sample-id -> growth-time mapping.
dset_dict

In [None]:
# Build a samples-by-genes table of mRNA counts for the glycerol time course.
#
# Result: `mrna_dset` has one row per retained sample, a leading "time" column
# (growth time looked up in `dset_dict`) and one column per gene id.
#
# The table is reshaped with set_index/transpose instead of splicing header
# labels into the data, so the count columns keep their numeric dtype rather
# than being coerced to `object`.

# Samples left out of the analysis (the reason is not recorded in this notebook).
excluded_dsets = {"MURI_060", "MURI_051"}

mrna_raw = pd.read_csv("./mrna_data.csv")

# "Unnamed: 0" is the unnamed first CSV column; its values are the gene ids
# that become the column labels below.  Filtering (rather than list.remove)
# means an excluded id that is not in the lookup is simply ignored.
column_list = ["Unnamed: 0"] + [
    name for name in dset_lookup["dataSet"].tolist() if name not in excluded_dsets
]

# Genes become columns, samples become rows (row labels are the sample ids).
mrna_dset = mrna_raw[column_list].set_index("Unnamed: 0").T
mrna_dset.insert(0, "time", [dset_dict[name] for name in mrna_dset.index])

# Drop the leftover axis name and the sample-id row labels.
mrna_dset.columns.name = None
mrna_dset = mrna_dset.reset_index(drop=True)

# Column labels of the final table: "time" followed by the gene ids.
variable_list = mrna_dset.columns.tolist()

In [None]:
# Display the per-sample values of a single gene column, ECB_01243.
mrna_dset["ECB_01243"]

In [None]:
# Display the reshaped table ("time" column followed by one column per gene).
mrna_dset

In [None]:
# Reshape from wide to long: one row per (time, gene) observation, with the
# measured value stored in "norm_count".
tidy_mrna_dset = mrna_dset.melt(
    id_vars=["time"],
    var_name="gene",
    value_name="norm_count",
)

In [None]:
# Display the long-format table (columns: time, gene, norm_count).
tidy_mrna_dset

In [None]:
# Average norm_count over all samples sharing the same gene and time point.
# The result is a one-column frame (column label 0) indexed by (gene, time).
gene_group = tidy_mrna_dset.groupby(["gene", "time"])
mean_gene = gene_group.apply(lambda grp: np.mean(grp["norm_count"])).to_frame()

In [None]:
# Collect, for every gene, its vector of per-time-point means and stack the
# vectors into a 2-D array with one row per gene.
mean_gene_group = mean_gene.groupby(["gene"])
per_gene_means = mean_gene_group.apply(lambda grp: np.array(grp[0]))
stacked_means = np.array([vec.tolist() for vec in per_gene_means.values])
# Drop the last two time points of every gene (original note: "early stationary").
time_vectors = stacked_means[:, :-2]

In [None]:
# Plot the mean values of gene ECB_01243 against time-point position,
# leaving out the last two time points.
plt.plot(mean_gene.loc["ECB_01243"].values[:-2])

In [None]:
# Display the gene-by-time-point array of mean values.
time_vectors

In [None]:
# Min-max scale every gene's profile (each row) independently.  The 0.001
# added to the range keeps the denominator non-zero for constant rows.
row_min = np.min(time_vectors, axis=1)[:, np.newaxis]
row_max = np.max(time_vectors, axis=1)[:, np.newaxis]
max_norm_time_vectors = (time_vectors - row_min) / (row_max - row_min + 0.001)

In [None]:
# Second name for the un-normalised array (same object, not a copy).
# NOTE(review): this name is not used by any other cell visible in this notebook.
normed_time_vectors = time_vectors

In [None]:
# Overlay the min-max scaled profiles of the first ten genes on one figure.
for row_idx in range(10):
    profile = max_norm_time_vectors[row_idx]
    plt.plot(profile)
plt.show()

In [None]:
# Pasted table output kept for reference (row label, growthTime_hr):
# 	growthTime_hr
# 34	5
# 35	7
# 36	8
# 37	10
# 38	14
# 39	24
# 40	48
# 41	168
# 42	336
[5,7,8,10,14,24,48,168,336]

In [None]:
# Cluster the scaled gene profiles into 20 groups; the fixed random_state
# makes the initialisation repeatable.
X = max_norm_time_vectors
clusterer = KMeans(n_clusters=20, random_state=0)
kmeans = clusterer.fit(X)

In [None]:
# Centroid profile of each fitted cluster (one row per cluster).
centers = kmeans.cluster_centers_

In [None]:
# Display the dimensions of the centroid array.
centers.shape

In [None]:
# Plot the centroid profiles of clusters 0-4, labelled by cluster number.
for cluster_id in range(5):
    plt.plot(centers[cluster_id], label=cluster_id)
plt.legend()
plt.show()

In [None]:
# Plot the centroid profiles of clusters 5-9, labelled by cluster number.
for cluster_id in (5, 6, 7, 8, 9):
    plt.plot(centers[cluster_id], label=cluster_id)
plt.legend()
plt.show()

In [None]:
# Plot the centroid profiles of clusters 10-14, labelled by cluster number.
for cluster_id in (10, 11, 12, 13, 14):
    plt.plot(centers[cluster_id], label=cluster_id)
plt.legend()
plt.show()

In [None]:
# Plot the centroid profiles of clusters 15-19, labelled by cluster number.
for cluster_id in (15, 16, 17, 18, 19):
    plt.plot(centers[cluster_id], label=cluster_id)
plt.legend()
plt.show()

In [None]:
# Row positions (into X) of the genes assigned to cluster label 15.
# Despite the "cluster_0" name, the cluster being inspected is number 15.
cluster_0_idx = np.flatnonzero(kmeans.labels_ == 15)

In [None]:
# Scaled profiles of the genes assigned to cluster label 15 (the variable is
# named cluster_0 even though the label selected is 15).
cluster_0 = X[kmeans.labels_ == 15]

In [None]:
# Sanity check: dimensions of the squared deviations from centroid 15.
((cluster_0 - centers[15]) ** 2).shape

In [None]:
# Euclidean distance of every cluster member from centroid 15.
offsets_from_center = cluster_0 - centers[15]
squared_dist = np.sum(offsets_from_center ** 2, axis=1)
center_dist = squared_dist ** (1 / 2)

In [None]:
# Histogram of member-to-centroid distances.
plt.hist(center_dist)

In [None]:
# Row positions (into X) of the members lying closer than 0.5 to the centroid.
cluster_0_similar_idx = cluster_0_idx[center_dist < 0.5]

In [None]:
# Profiles of the members lying closer than 0.5 to the centroid
# (same selection as cluster_0_similar_idx).
cluster_0_similar = cluster_0[center_dist < 0.5]

In [None]:
# Plot every close-to-centroid profile against the seven hard-coded
# time values, labelling each line with its row position.
for member_no, profile in enumerate(cluster_0_similar):
    plt.plot([5, 7, 8, 10, 14, 24, 48], profile, label=member_no)
plt.legend()
plt.show()

In [None]:
import seaborn as sns

In [None]:
# Flatten the selected profiles into one long y vector and pair every value
# with its time-point position (0, 1, ... repeated once per profile).
n_profiles, n_timepoints = cluster_0_similar.shape
y = cluster_0_similar.flatten()
x = np.tile(np.arange(n_timepoints), n_profiles)

In [None]:
# Display the repeated time-point positions.
x

In [None]:
# Line plot of the values in y grouped by time-point position x, with the
# band requested as "sd".
# NOTE(review): `ci` is deprecated in seaborn >= 0.12 in favour of
# `errorbar="sd"` - confirm against the installed seaborn version.
sns.lineplot(x=x, y=y, ci="sd")

In [None]:
# Display the row positions of the selected cluster members.
cluster_0_similar_idx

In [None]:
# Unique gene ids taken from the "gene" level of mean_gene's index, then the
# subset at the selected row positions.
gene_level = mean_gene.index.get_level_values("gene")
gene_list = np.array(gene_level.unique().tolist())
cluster_0_gene_list = gene_list[cluster_0_similar_idx]

In [None]:
# Display the gene ids of the selected cluster members.
gene_list[cluster_0_similar_idx]

In [None]:
# Hand-entered product annotations.
# NOTE(review): this list is not used by any other cell visible in this notebook.
annotation_list = [
    "acyl-CoA synthetase FdrA",
    "molybdate ABC transporter substrate-binding protein",
    "IS1 protein InsA",
    "2-hydroxyglutaryl-CoA dehydratase activator",
]

# Pasted gene-id array kept for reference (presumably the output of
# gene_list[cluster_0_similar_idx] - TODO confirm):
# array(['ECB_00040', 'ECB_00468', 'ECB_00716', 'ECB_01243', 'ECB_02170',
#        'ECB_02674', 'ECB_02712', 'ECB_02717', 'ECB_03271', 'ECB_03837',
#        'ECB_04061', 'ECB_04203'], dtype='<U9')
#
# Product annotations, one per line (presumably one per gene id above, in the
# same order - TODO confirm):
# crotonobetainyl-CoA hydratase
# acyl-CoA synthetase FdrA
# molybdate ABC transporter substrate-binding protein
# IS1 protein InsA
# ISNCY family transposase
# prepilin peptidase-dependent protein
# putative aminohydrolase SsnA
# guanine/hypoxanthine transporter GhxQ
# RNA 3'-terminal phosphate cyclase
# [formate-C-acetyltransferase]-activating enzyme
# PTS ascorbate transporter subunit IIB
# 2-hydroxyglutaryl-CoA dehydratase activator

In [None]:
from Bio import SeqIO

# Parse the single GenBank record stored in "sequence (2).gb" (relative to the
# working directory) and display it.
record = SeqIO.read("sequence (2).gb", "genbank")
record

In [None]:
# Collect the CDS features whose old locus tag belongs to the selected cluster
# and extract the `upstream_len` bases immediately upstream of each one, read
# 5'->3' on the gene's own strand.
upstream_len = 40

# Plain-str set for exact membership tests against the locus tags.
cluster_0_locus_tags = set(map(str, cluster_0_gene_list))

cluster_0_features = []
# NOTE(review): the first 15 features of the record are skipped; the reason is
# not recorded in this notebook.
for feature in record.features[15:]:
    if feature.type != "CDS":
        continue
    # Features without an "old_locus_tag" qualifier are skipped explicitly
    # instead of being hidden behind a bare `except`.
    locus_tags = feature.qualifiers.get("old_locus_tag")
    if locus_tags and locus_tags[0] in cluster_0_locus_tags:
        cluster_0_features.append(feature)

feature_seqs = []
for feature in cluster_0_features:
    start = feature.location.start
    end = feature.location.end
    if feature.location.strand == -1:
        # Minus strand: the upstream region lies after the feature's end
        # coordinate and must be reverse-complemented.
        seq = record.seq[end : end + upstream_len].reverse_complement()
    else:
        # Clamp at 0: a negative slice start would count from the end of the
        # sequence and silently return the wrong (usually empty) region.
        seq = record.seq[max(0, start - upstream_len) : start]
    feature_seqs.append(str(seq))

In [None]:
# Display the CDS features matched to the selected cluster's genes.
cluster_0_features

In [None]:
# Split every upstream sequence into single characters and stack them into an
# array (one row per sequence when all sequences have the same length).
seq_arr = np.array(list(map(list, feature_seqs)))

In [None]:
# Per-position base counts across the stacked sequences, in A, C, G, T order.
base_counts = np.array([np.sum(seq_arr == base, axis=0) for base in "ACGT"])
As, Cs, Gs, Ts = base_counts
# Number of A/C/G/T characters observed at each position.
ttl = As + Cs + Gs + Ts
# Per-position base frequencies: one row per position, columns A, C, G, T.
freq_arr = (base_counts / ttl).T

In [None]:
# Sanity check: dimensions of the per-position "A" counts.
As.shape

In [None]:
import seqlogo

# Wrap the per-position base frequencies in a seqlogo Ppm object and display it.
ppm = seqlogo.Ppm(freq_arr)
ppm

In [None]:
# Render a sequence logo from the Ppm object as a large PNG with ic_scale
# turned off.
seqlogo.seqlogo(ppm, ic_scale=False, format="png", size="large")

In [None]:
from Bio import motifs

In [None]:
# Show the imported Bio.motifs module object.  No motif analysis is performed
# in this notebook; the dangling attribute access that stood here was a syntax
# error and stopped a full run.
motifs

In [None]:
# Extract the sequence spanned by each selected feature (start..end), read
# 5'->3' on the feature's own strand.
# Note: this rebinds `feature_seqs`, which previously held the upstream
# regions as str, to a list of Seq objects.
feature_seqs = []
for feature in cluster_0_features:
    # Slice the underlying Seq directly; slicing the whole SeqRecord would
    # also copy its features only for them to be discarded.
    seq = record.seq[feature.location.start : feature.location.end]
    if feature.location.strand == -1:
        seq = seq.reverse_complement()
    feature_seqs.append(seq)

In [None]:
# Display the full array of unique gene ids.
gene_list

In [None]:
# Display the first old locus tag of `feature`.
# NOTE(review): `feature` is whatever the most recently executed
# `for feature in ...` loop left behind, so this depends on execution order.
feature.qualifiers["old_locus_tag"][0]

In [None]:
# Draw each cluster centroid on its own figure against the seven hard-coded
# time values.
for center_profile in centers:
    plt.plot([5, 7, 8, 10, 14, 24, 48], center_profile)
    plt.show()

In [None]:
# Display the profile of the last gene row.
time_vectors[-1]

In [None]:
# NOTE(review): column_list holds "Unnamed: 0" plus sample ids, whereas
# mrna_dset's columns were relabelled to "time" plus gene ids earlier in the
# notebook, so this selection looks stale - confirm it still runs.
mrna_dset[column_list]

#### Notes

OK, so there are clearly some interesting clusters in these data. Consider a deeper dive later (OD, proteomics).
