In [3]:
import pandas as pd
import numpy as np
import os
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch import nn
from torch import optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load every statistics CSV found (recursively) under the stats directory and
# register each frame in two dictionaries:
#   df_dict_thresh: (id_file, thresh_file, process_type)        -> DataFrame
#   df_dict_split:  (id_file, thresh_file, process_type, group) -> DataFrame
# Frames that share a key are concatenated.
path_mp4_files = os.path.expanduser("~/out_stats_fastq/")
csv_files = glob.glob(os.path.join(path_mp4_files, '**', '*.csv'), recursive=True)


def parse_stats_path(path):
    """Return ``(id_file, process_type, thresh_file)`` parsed from a CSV path.

    The three fields are the last three "_"-separated tokens of the path once
    its extension has been removed, i.e. ``..._<id>_<process>_<thresh>.csv``.

    The extension is removed with ``os.path.splitext`` because
    ``str.rstrip('.csv')`` strips any trailing run of the characters
    '.', 'c', 's', 'v' rather than the literal suffix.

    Raises:
        IndexError: if the path holds fewer than three "_"-separated tokens.
    """
    tokens = os.path.splitext(path)[0].split("_")
    return tokens[-3], tokens[-2], tokens[-1]


df_dict_thresh = {}
df_dict_split = {}

for file in csv_files:
    id_file, process_type, thresh_file = parse_stats_path(file)

    # Read csv file into dataframe
    df = pd.read_csv(file)

    # Tag every row with the fields parsed from the file name
    df['id_file'] = id_file
    df['thresh_file'] = thresh_file
    df["process_type"] = process_type

    # 'trash' frames: keep only the second "."-separated token of each index label
    if process_type == 'trash' and "index" in df.columns:
        df["index"] = [label.split(".")[1] for label in df["index"]]

    # 'split' frames: the group is the third "-"-separated token of the index
    # label. `group` is reset on every iteration so that a non-split file can
    # neither hit an undefined name nor inherit the previous file's group.
    group = None
    if process_type == 'split' and "index" in df.columns:
        df['group'] = df['index'].str.split('-').str[2]
        if not df.empty:
            group = df['group'].iloc[0]

    # Set id_file as the index of the dataframe
    df = df.set_index('id_file')

    key_thresh = (id_file, thresh_file, process_type)
    key_split = (id_file, thresh_file, process_type, group)

    # If a key exists, append the data; otherwise store this frame.
    # The id_file index is kept on concatenation (no ignore_index) so the
    # sample id stays readable from the frame's index after a merge.
    if key_thresh in df_dict_thresh:
        df_dict_thresh[key_thresh] = pd.concat([df_dict_thresh[key_thresh], df])
    else:
        df_dict_thresh[key_thresh] = df

    if key_split in df_dict_split:
        df_dict_split[key_split] = pd.concat([df_dict_split[key_split], df])
    else:
        df_dict_split[key_split] = df

In [None]:
# Line plots for the 'trash' runs: one figure per (threshold, metric) pair,
# one line per frame, x = the frame's 'index' labels.
thresholds = [0, 0.001, 0.01, 0.1, 1]
ylabel = ['Jaccard-index', 'Bray–Curtis', 'l1-score', 'l2-score']
extra_ticks = ["200M", "300M"]  # tick labels always shown, even when no frame contains them

# Only frames tagged 'trash' are plotted, so the metric columns and the tick
# labels are taken from those frames rather than from whatever `df` happens
# to be left in the namespace (which may be a 'split' frame).
trash_frames = {
    key: frame
    for key, frame in df_dict_thresh.items()
    if (frame["process_type"] == 'trash').any()
}
columns = next(iter(trash_frames.values())).columns[2:6] if trash_frames else []  # update this to match your data

# Loop over each threshold
for threshold in thresholds:
    selected = [
        (key, frame)
        for key, frame in trash_frames.items()
        if (frame["thresh_file"] == str(threshold)).any()
    ]

    # Labels in order of first appearance across the plotted frames, the same
    # order in which matplotlib's categorical axis assigns positions 0, 1, 2...
    tick_labels = list(dict.fromkeys(label for _, frame in selected for label in frame['index']))
    tick_labels += [label for label in extra_ticks if label not in tick_labels]

    # Loop over each column
    for i, column in enumerate(columns):
        # Initialize figure and axes for each plot
        fig, ax = plt.subplots(figsize=(16,9))

        for key, frame in selected:
            ax.plot(frame['index'], frame[column], label=key[0], marker='o')

        # Set xticks with labels
        ax.set_xticks(range(len(tick_labels)))
        ax.set_xticklabels(tick_labels, rotation=0)

        # Set plot title, x-axis and y-axis labels
        ax.set_title(f'{ylabel[i]} for {threshold}% threshold')
        ax.set_xlabel('Index')
        ax.set_ylabel(ylabel[i])

        # A legend without any plotted line only triggers a warning
        if selected:
            ax.legend()

        # Display the plot
        plt.tight_layout()
        plt.show()

In [None]:
# Box plots for the 'split' runs: for every threshold and metric, draw one box
# per matching frame of df_dict_thresh, labelled with that frame's index values.
thresholds = [0, 0.001, 0.01, 0.1, 1]
columns = ['jaccard_scores', 'Dissimilarity', 'l1_score', 'l2_score']
ylabel = ['Jaccard-index', 'Bray–Curtis', 'l1-score', 'l2-score']

for threshold in thresholds:
    threshold_tag = str(threshold)

    for column, metric_label in zip(columns, ylabel):
        fig, ax = plt.subplots(figsize=(16,9))

        # Frames holding at least one 'split' row and one row for this threshold
        split_frames = [
            frame
            for frame in df_dict_thresh.values()
            if (frame["process_type"] == 'split').any()
            and (frame["thresh_file"] == threshold_tag).any()
        ]

        # One value array and one tick label per selected frame
        data_to_plot = [frame[column].values for frame in split_frames]
        labels = [', '.join(map(str, list(set(frame.index)))) for frame in split_frames]

        sns.boxplot(data=data_to_plot, ax=ax)

        ax.set_xticklabels(labels, rotation=45)
        ax.set_title(f'{metric_label} for {threshold}% threshold')
        ax.set_xlabel('id_file')
        ax.set_ylabel(metric_label)

        plt.tight_layout()
        plt.show()

In [None]:
# Box plots for the 'split' runs, one figure per (threshold, metric, frame):
# the metric is drawn against the frame's 'group' column.
thresholds = [0, 0.001, 0.01, 0.1, 1]
columns = ['jaccard_scores', 'Dissimilarity', 'l1_score', 'l2_score']
ylabel = ['Jaccard-index', 'Bray–Curtis', 'l1-score', 'l2-score']

for threshold in thresholds:
    threshold_tag = str(threshold)

    for column, metric_label in zip(columns, ylabel):
        for frame in df_dict_thresh.values():
            # Skip frames without a 'split' row or without a row for this threshold
            is_split = (frame["process_type"] == 'split').any()
            if not (is_split and (frame["thresh_file"] == threshold_tag).any()):
                continue

            fig, ax = plt.subplots(figsize=(16,9))
            sns.boxplot(x='group', y=column, data=frame, ax=ax)

            # The first distinct index value names the figure
            sample_id = frame.index.unique()[0]
            ax.set(
                title=f'{sample_id} - {metric_label} for {threshold}% threshold',
                xlabel='Group',
                ylabel=metric_label,
            )

            plt.tight_layout()
            plt.show()

In [None]:
# Build threshold_dfs: {threshold: {key: frame}} where each frame holds the
# columns 'group', 't_Dissimilarity' and 't_jaccard_scores'.
# Uses `thresholds` and `df_dict_thresh` defined in earlier cells.
# Frames in df_dict_thresh are copied, never modified, so this cell can be
# re-run (the group rescaling and the index -> group rename are not
# idempotent when applied in place).


def _flag_at_most(values, cutoff):
    """Return 1.0 where ``values <= cutoff``, 0.0 where ``values > cutoff``, NaN where missing."""
    return np.where(values.isna(), np.nan, (values <= cutoff).astype(float))


def label_similarity_frame(frame):
    """Return a labelled copy of ``frame`` reduced to group and the two binary flags.

    * ``t_Dissimilarity`` is 1 when ``Dissimilarity <= 0.01`` and 0 above.
    * ``t_jaccard_scores`` is 1 when the score is ``<= 0.95`` and 0 above; the
      score is ``jaccard_scores`` for 'split' frames and ``Ground_truth`` for
      'trash' frames.
      NOTE(review): a low Jaccard score is flagged 1 here, the same orientation
      as a low dissimilarity - confirm this is intended.
    * 'split' frames: the number before 'M' in ``group`` is multiplied by 10.
    * 'trash' frames: the ``index`` column becomes ``group``.

    Raises:
        ValueError: if the frame is tagged neither 'split' nor 'trash'.
    """
    labelled = frame.copy()
    labelled['t_Dissimilarity'] = _flag_at_most(labelled['Dissimilarity'], 0.01)

    process = labelled["process_type"]
    if process.str.contains('split').any():
        labelled['group'] = labelled['group'].apply(lambda x: str(int(x.split('M')[0]) * 10) + 'M')
        score = labelled['jaccard_scores']
    elif process.str.contains('trash').any():
        labelled = labelled.rename(columns={'index': 'group'})
        score = labelled['Ground_truth']
    else:
        raise ValueError("frame is tagged neither 'split' nor 'trash'")

    labelled['t_jaccard_scores'] = _flag_at_most(score, 0.95)
    return labelled[['group', 't_Dissimilarity', 't_jaccard_scores']]


# Create a dictionary to hold dictionaries for each threshold
threshold_dfs = {}

for threshold in thresholds:
    labelled_frames = {
        key: label_similarity_frame(frame)
        for key, frame in df_dict_thresh.items()
        if (frame["thresh_file"] == str(threshold)).any()
    }

    # Store the frames sorted by key
    threshold_dfs[threshold] = {key: labelled_frames[key] for key in sorted(labelled_frames)}

In [None]:
# Fit one random forest per threshold on the first 10 frames (by sorted key)
# of threshold_dfs[threshold] and predict the group of a single sample.
# Uses `thresholds` and `threshold_dfs` defined in earlier cells.
RANDOM_STATE = 42  # fixed so that re-running the cell gives the same forests
FEATURE_COLUMNS = ['t_Dissimilarity', 't_jaccard_scores']

# Prepare the label encoder
le = LabelEncoder()

# Define a dictionary to store predictions for each threshold
predictions = {}

# Iterate over each threshold
for threshold in thresholds:
    # Get the dataframes for this threshold
    threshold_dict = threshold_dfs[threshold]

    # Select the first 10 dataframes
    dfs_to_train = [threshold_dict[key] for key in sorted(threshold_dict)[:10]]

    # pd.concat raises on an empty list; a threshold without data has nothing to fit
    if not dfs_to_train:
        continue

    # Concatenate the dataframes into one
    train_data = pd.concat(dfs_to_train)

    # Separate features and target
    X = train_data[FEATURE_COLUMNS]
    y = le.fit_transform(train_data['group'])

    # Define and fit the model
    model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
    model.fit(X, y)

    # Make a prediction for a single value; a DataFrame with the training
    # column names keeps the feature names consistent with what was fitted
    single_value_to_predict = pd.DataFrame([[0.05, 0.95]], columns=FEATURE_COLUMNS)  # replace with your data
    prediction = model.predict(single_value_to_predict)

    # Convert the predicted label back to original label
    predicted_label = le.inverse_transform(prediction)

    # Store prediction for this threshold
    predictions[threshold] = predicted_label

# Print all predictions
print(predictions)

In [None]:
# Define the model
class MyModel(nn.Module):
    """Three-layer fully connected network over two input features.

    Args:
        num_classes: ``None`` (default) builds the original binary head, one
            output unit followed by a sigmoid. An integer builds that many
            output units and returns raw logits (no activation), the form
            expected by ``nn.CrossEntropyLoss``.
    """

    def __init__(self, num_classes=None):
        super().__init__()
        self.fc1 = nn.Linear(2, 32)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(32, 16)
        self.relu2 = nn.ReLU()
        if num_classes is None:
            self.fc3 = nn.Linear(16, 1)
            self.output = nn.Sigmoid()
        else:
            self.fc3 = nn.Linear(16, num_classes)
            self.output = nn.Identity()

    def forward(self, x):
        """Apply fc1 -> ReLU -> fc2 -> ReLU -> fc3 -> output activation to ``x``."""
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        x = self.output(x)
        return x


# Train one network per threshold on the first 10 frames (by sorted key) of
# threshold_dfs[threshold] and predict the group of a single sample.
# Uses `thresholds` and `threshold_dfs` defined in earlier cells.
#
# The target is the label-encoded 'group', i.e. an integer class id. A single
# sigmoid unit trained with BCELoss is not a valid objective for class ids
# above 1, and rounding its output can only ever yield class 0 or 1, so the
# network gets one logit per class and is trained with cross-entropy.
SEED = 42  # fixed so that weight init and batch shuffling are reproducible

criterion = nn.CrossEntropyLoss()

le = LabelEncoder()

results = {}

# Iterate over each threshold
for threshold in thresholds:
    threshold_dict = threshold_dfs[threshold]

    dfs_to_train = [threshold_dict[key] for key in sorted(threshold_dict)[:10]]

    # pd.concat raises on an empty list; a threshold without data has nothing to fit
    if not dfs_to_train:
        continue

    train_data = pd.concat(dfs_to_train)

    X = torch.tensor(train_data[['t_Dissimilarity', 't_jaccard_scores']].values, dtype=torch.float32)
    # CrossEntropyLoss takes class indices as a 1-D int64 tensor
    y = torch.tensor(le.fit_transform(train_data['group']), dtype=torch.long)

    torch.manual_seed(SEED)
    model = MyModel(num_classes=len(le.classes_))
    optimizer = optim.Adam(model.parameters())

    dataset = TensorDataset(X, y)
    dataloader = DataLoader(dataset, batch_size=10, shuffle=True)

    model.train()
    for epoch in range(150):
        for inputs, targets in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

    single_value_to_predict = torch.tensor([[0.05, 0.9]])
    model.eval()
    with torch.no_grad():
        logits = model(single_value_to_predict)

    # The class with the largest logit is the prediction
    predicted_label = le.inverse_transform([logits.argmax(dim=1).item()])

    results[threshold] = predicted_label

print(results)