In [None]:
import pandas as pd
import json
import gzip
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import numpy as np



# Read in Files

In [None]:
# Load one feature table per cell line (A549, MCF7, K562, HepG2, Hct116).
# The files are read in the order listed and bound to the matching *_df name.
a549_df, mcf7_df, k562_df, hepg2_df, hct116_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../SGNex_A549_directRNA_replicate5_run1_data.csv',
        '../SGNex_MCF7_directRNA_replicate3_run1_data.csv',
        '../SGNex_K562_directRNA_replicate4_run1_data.csv',
        '../SGNex_HepG2_directRNA_replicate6_run1_data.csv',
        '../SGNex_Hct116_directRNA_replicate3_run1_data.csv',
    )
)


In [None]:

def clean_and_process_df(df):
    """Aggregate signal features to one row per (transcript_id, transcript_position).

    For each of the nine signal columns (``dwelling_time``, ``mean`` and ``sd`` at
    offsets ``neg_1``, ``0`` and ``1``) the mean, min, max and median are computed
    within every (transcript_id, transcript_position) group. Six extra columns then
    hold the difference between the group means of adjacent offsets
    (``neg_1`` minus ``0`` and ``0`` minus ``1``) for each of the three features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``transcript_id``, ``transcript_position`` and the nine signal
        columns. The k-mer columns ``ori_nucleotide``, ``fivemer_neg_1``,
        ``fivemer_0`` and ``fivemer_1`` are discarded when present; they are not
        required. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Flat-column frame: the two key columns, then ``<column>_<stat>`` for every
        signal column and statistic, then ``<feature>_diff_<first>_<second>`` for
        every adjacent-offset pair.

    Raises
    ------
    KeyError
        If a grouping column or one of the nine signal columns is missing.
    """
    features = ('dwelling_time', 'mean', 'sd')
    offsets = ('neg_1', '0', '1')
    summary_stats = ('mean', 'min', 'max', 'median')

    # Step 1: Drop the unused k-mer columns. errors='ignore' keeps the function
    # usable on frames that never had them; the aggregation does not read them.
    df = df.drop(
        columns=["ori_nucleotide", "fivemer_neg_1", "fivemer_0", "fivemer_1"],
        errors='ignore',
    )

    # Step 2: Group by transcript/position and compute the summary statistics.
    # Feature-major, offset-minor ordering fixes the output column order.
    aggregations = {
        f'{feature}_{offset}': list(summary_stats)
        for feature in features
        for offset in offsets
    }
    averaged_df = (
        df.groupby(['transcript_id', 'transcript_position'])
        .agg(aggregations)
        .reset_index()
    )

    # Step 3: Flatten the (column, stat) MultiIndex to 'column_stat'. The key
    # columns carry an empty second level and keep their plain name.
    averaged_df.columns = [
        f'{col[0]}_{col[1]}' if col[1] else col[0]
        for col in averaged_df.columns
    ]

    # Step 4: Differences between the group means of adjacent offsets.
    for metric in features:
        for left_offset, right_offset in zip(offsets[:-1], offsets[1:]):
            first = f'{metric}_{left_offset}_mean'
            second = f'{metric}_{right_offset}_mean'
            averaged_df[f'{metric}_diff_{first}_{second}'] = (
                averaged_df[first] - averaged_df[second]
            )

    return averaged_df

# Example of how to use this function for multiple dataframes
# cleaned_dfs = [clean_and_process_df(df) for df in list_of_dfs]


In [None]:
# Aggregate each cell line's table and tag every row with the cell line it came from.
a549_cleaned = clean_and_process_df(a549_df).assign(source='A549')
mcf7_cleaned = clean_and_process_df(mcf7_df).assign(source='MCF7')
k562_cleaned = clean_and_process_df(k562_df).assign(source='K562')
hepg2_cleaned = clean_and_process_df(hepg2_df).assign(source='HepG2')
hct116_cleaned = clean_and_process_df(hct116_df).assign(source='Hct116')

# Stack the five labelled frames row-wise; ignore_index gives the combined frame
# a fresh 0..n-1 index instead of repeating each frame's own index.
mega_df = pd.concat(
    [a549_cleaned, mcf7_cleaned, k562_cleaned, hepg2_cleaned, hct116_cleaned],
    axis=0,
    ignore_index=True,
)

In [None]:
scaler = StandardScaler()

# Identifier and label columns are never standardized. 'predictions' is listed as
# well so that re-running this cell after the prediction cell has added that column
# does not standardize the 0/1 model output.
excluded_columns = ['transcript_id', 'transcript_position', 'source', 'predictions']

# Every remaining column is standardized, in the frame's column order.
scaled_columns = [col for col in mega_df.columns if col not in excluded_columns]

# NOTE(review): the scaler is fit on the data being predicted, not loaded from the
# training run; if the model was trained with a differently fitted scaler the
# features here will not match it. Confirm against the training notebook.
# mega_df is overwritten in place with the standardized values; `scaler` and
# `scaled_columns` are reused later to undo the transformation.
mega_df[scaled_columns] = scaler.fit_transform(mega_df[scaled_columns])

# Preview the standardized frame.
mega_df.head()


# Prepare filtered dataset

In [None]:
# The ten feature columns handed to the model, in the order listed here.
selected_features = [
    'sd_0_mean',
    'mean_0_mean',
    'mean_diff_mean_0_mean_mean_1_mean',
    'sd_0_median',
    'mean_0_median',
    'mean_diff_mean_neg_1_mean_mean_0_mean',
    'mean_1_mean',
    'sd_0_min',
    'mean_neg_1_mean',
    'sd_diff_sd_0_mean_sd_1_mean',
]

mega_df_filtered = mega_df[selected_features]

In [None]:
# Load the saved Keras model from disk.
model = tf.keras.models.load_model('../cnn_selected.keras')

# Feature matrix: the selected columns as a NumPy array.
X_test = mega_df_filtered.to_numpy()

# Raw model outputs for every row.
predictions = model.predict(X_test)

# Threshold at 0.5 (strictly greater -> 1, otherwise 0) and collapse to 1-D so
# there is one label per DataFrame row.
# NOTE(review): assumes the model emits a single value per row - confirm.
binary_predictions = (predictions > 0.5).astype(int).flatten()

# Store the labels next to the features they were predicted from.
mega_df.loc[:, 'predictions'] = binary_predictions


In [None]:
# Map the standardized columns back to their original scale using the scaler
# fitted earlier; columns outside `scaled_columns` are left untouched.
unscaled_values = scaler.inverse_transform(mega_df[scaled_columns])
mega_df[scaled_columns] = unscaled_values

# Preview the first rows of the restored frame.
print(mega_df.head(5))

In [None]:
# Optional export of the prediction table to CSV. The call is commented out, so
# running this cell writes nothing.
# NOTE(review): the EDA cell further down reads a 'results_mega_df.csv' from a
# different (absolute) location - presumably a copy of this export; confirm.
# mega_df.to_csv('../results_mega_df.csv', index=False)

# Conduct EDA

In [None]:
import pandas as pd
# NOTE(review): absolute, machine-specific path - the notebook will not run
# elsewhere without editing it; consider a path relative to the project.
df = pd.read_csv(
    '/Users/rachels/Desktop/NUS/Y4/DSA4262/Team_Project/data/Task 2/merged/results_mega_df.csv',
    sep=',',
)

# Human-readable label per row: prediction equal to 1 -> 'modified',
# anything else -> 'unmodified'.
df['status'] = df['predictions'].eq(1).map({True: 'modified', False: "unmodified"})

### Comparison across modified/unmodified for means across cell lines

In [None]:
# Three side-by-side panels on a shared y-axis.
fig, axes = plt.subplots(1, 3, figsize=(18, 6), sharey=True)

# Column drawn in each panel and the title shown above it.
columns = ["mean_neg_1_mean", 'mean_0_mean', 'mean_1_mean']
titles = ['Mean_neg_1', 'Mean_0', 'Mean_1']

# One violin plot per panel: cell line on the x-axis, split by predicted status.
for panel_idx, ax in enumerate(axes):
    sns.violinplot(
        data=df,
        x='source',
        y=columns[panel_idx],
        hue='status',
        palette=['lightgreen', 'lightblue'],
        ax=ax,
    )
    ax.set(title=titles[panel_idx], xlabel='Cell Line', ylabel='Mean Value')
    ax.legend(loc='upper right')

# Figure-level title, placed slightly above the panels.
fig.suptitle('Comparison of Modifications across Cell Lines', fontsize=20, fontweight='bold', y=1.01)
fig.tight_layout()
plt.show()


### Comparison across modified/unmodified for sd across cell lines

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Three side-by-side panels on a shared y-axis.
fig, axes = plt.subplots(1, 3, figsize=(18, 6), sharey=True)

# Column drawn in each panel and the title shown above it.
columns = ["sd_neg_1_mean", 'sd_0_mean', 'sd_1_mean']
titles = ['sd_neg_1', 'sd_0', 'sd_1']

# One violin plot per panel: cell line on the x-axis, split by predicted status.
for panel_idx, ax in enumerate(axes):
    sns.violinplot(
        data=df,
        x='source',
        y=columns[panel_idx],
        hue='status',
        palette=['lightgreen', 'lightblue'],
        ax=ax,
    )
    ax.set_title(titles[panel_idx])
    # Clip the visible y-range to 0-10; values above 10 are cut off from view.
    ax.set_ylim(0, 10)
    ax.set(xlabel='Cell Line', ylabel='SD Value')
    ax.legend(loc='upper right')

# Figure-level title, placed slightly above the panels.
fig.suptitle('Comparison of Modifications across Cell Lines', fontsize=20, fontweight='bold', y=1.01)

# Tighten spacing between panels, then render.
fig.tight_layout()
plt.show()
