In [None]:
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler

%load_ext autoreload
%autoreload 2

In [None]:
def read_csv_file(file_path):
    """
    Load a semicolon-delimited, UTF-8 encoded CSV file into a DataFrame.

    Parameters:
    file_path (str): Location of the CSV file.

    Returns:
    pd.DataFrame: The parsed table, or an empty DataFrame when pandas raises a
    ParserError (a message naming the file is printed in that case).
    """
    read_options = {"delimiter": ";", "encoding": "utf-8", "low_memory": False}
    try:
        frame = pd.read_csv(file_path, **read_options)
    except pd.errors.ParserError:
        print(f"Error parsing {file_path}")
        frame = pd.DataFrame()
    return frame


def read_csv_file_with_filename(file_path):
    """
    Load a semicolon-delimited, UTF-8 encoded CSV file and tag every row with
    the name of the file it came from.

    Parameters:
    file_path (str): Location of the CSV file.

    Returns:
    pd.DataFrame: The parsed table with an extra "csv" column holding the base
    name of file_path, or an empty DataFrame (without that column) when pandas
    raises a ParserError.
    """
    try:
        table = pd.read_csv(file_path, delimiter=";", encoding="utf-8", low_memory=False)
    except pd.errors.ParserError:
        print(f"Error parsing {file_path}")
        return pd.DataFrame()
    else:
        # Record the source file so rows remain traceable after concatenation.
        table["csv"] = os.path.basename(file_path)
        return table

# Directory containing the expression CSV files.
data_path = "/content/drive/MyDrive/MLRG/data_expression/data_expression"
# os.listdir returns entries in arbitrary order; sorting makes the row order (and
# the column order of the concatenated frame) reproducible across runs.
all_files = [
    os.path.join(data_path, file_name)
    for file_name in sorted(os.listdir(data_path))
    if file_name.endswith(".csv")
]

# Each frame carries a "csv" column naming its source file.
df_list = [read_csv_file_with_filename(file) for file in all_files]
# ignore_index=True already produces a fresh 0..n-1 index, so no extra
# reset_index call is needed afterwards.
expression_df = pd.concat(df_list, ignore_index=True)

In [None]:
def clean_column_name(col_name):
    """
    Clean and format a column name: every run of whitespace, underscores,
    hyphens and other non-word characters becomes a single underscore, and
    the result is lowercased.

    Parameters:
    col_name (str): The original column name. Non-string labels are converted
        with str() first.

    Returns:
    str: The cleaned and formatted column name, without leading or trailing
    underscores.
    """
    # \W covers whitespace and punctuation; "_" is added explicitly because it
    # is a word character but must be treated as a separator too.
    collapsed = re.sub(r"[\W_]+", "_", str(col_name))
    # Separators at either end would otherwise leave dangling underscores.
    return collapsed.strip("_").lower()


def rename_columns(df):
    """
    Normalise every column label of a DataFrame with clean_column_name.

    The rename is applied in place, so the frame passed in is modified and the
    same object is returned.

    Parameters:
    df (pd.DataFrame): The DataFrame whose columns are to be renamed.

    Returns:
    pd.DataFrame: The same DataFrame, now with cleaned column names.
    """
    # Build the old-name -> new-name lookup one label at a time.
    column_mapping = {}
    for original_name in df.columns:
        column_mapping[original_name] = clean_column_name(original_name)

    df.rename(columns=column_mapping, inplace=True)
    return df


# Normalise the column names, then keep the four identifier columns followed by
# every column whose cleaned name contains "tpm".
expression_df = rename_columns(expression_df)
tpm_columns = list(filter(lambda name: "tpm" in name, expression_df.columns))
expression_df = expression_df.loc[:, ['species', 'csv', 'chromosome', 'region', *tpm_columns]]

In [None]:
# Display the combined expression table (identifier columns followed by the TPM columns).
expression_df

Unnamed: 0,species,csv,chromosome,region,legip_as_1_ge_tpm,legip_as_2_ge_tpm,legip_as_3_ge_tpm,legip_bs_1_ge_tpm,legip_bs_2_ge_tpm,legip_bs_3_ge_tpm,...,mssa_oxs_3_ge_tpm,mssa_sp_1_ge_tpm,mssa_sp_2_ge_tpm,mssa_sp_3_ge_tpm,mssa_tm_1_ge_tpm,mssa_tm_2_ge_tpm,mssa_tm_3_ge_tpm,mssa_vic_1_ge_tpm,mssa_vic_2_ge_tpm,mssa_vic_3_ge_tpm
0,Legionella pneumophila subsp. pneumophila Phil...,Legionella pneumophila subsp. pneumophila Phil...,NC_002942,654..2012,580.439919,602.800116,574.695008,,,,...,,,,,,,,,,
1,Legionella pneumophila subsp. pneumophila Phil...,Legionella pneumophila subsp. pneumophila Phil...,NC_002942,2026..3129,421.775789,418.556449,414.811704,,,,...,,,,,,,,,,
2,Legionella pneumophila subsp. pneumophila Phil...,Legionella pneumophila subsp. pneumophila Phil...,NC_002942,3126..4187,80.058162,79.188850,81.805868,,,,...,,,,,,,,,,
3,Legionella pneumophila subsp. pneumophila Phil...,Legionella pneumophila subsp. pneumophila Phil...,NC_002942,4482..6902,430.112152,428.536725,434.874920,,,,...,,,,,,,,,,
4,Legionella pneumophila subsp. pneumophila Phil...,Legionella pneumophila subsp. pneumophila Phil...,NC_002942,complement(7271..8317),208.971696,189.009620,202.627915,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105083,Staphylococcus aureus MSSA476,Staphylococcus aureus MSSA476.csv,NC_005951,15874..16719,,,,,,,...,484.741689,807.428499,1017.898103,886.394443,3341.997864,3225.268694,2834.525864,897.134109,720.273906,686.861587
105084,Staphylococcus aureus MSSA476,Staphylococcus aureus MSSA476.csv,NC_005951,17088..17621,,,,,,,...,1.417775,0.589486,2.293919,3.022411,5.856298,2.355629,2.839408,0.789759,3.438108,0.995357
105085,Staphylococcus aureus MSSA476,Staphylococcus aureus MSSA476.csv,NC_005951,18035..18328,,,,,,,...,217.742619,1778.430941,2306.161078,1668.178253,357.788302,393.019150,356.790884,724.641187,517.687874,491.294982
105086,Staphylococcus aureus MSSA476,Staphylococcus aureus MSSA476.csv,NC_005951,18403..20337,,,,,,,...,69.775054,75.320765,65.204204,73.921385,38.493913,35.290084,42.527493,87.470160,95.260649,101.840447


In [None]:
# Load the upstream sequences and expose the "contig" column under the name
# "chromosome", matching the column name used in expression_df.
upstream_df = pd.read_excel(
    "/content/drive/MyDrive/MLRG/data_sequences_upstream/data_sequences_upstream/upstream_sequences.xlsx"
).rename(columns={"contig": "chromosome"})

In [None]:
# Map three file names as they are spelled in the upstream table onto a spelling
# in which genus and species are joined by a non-breaking space (\xa0).
# NOTE(review): the keys look like a mis-decoded non-breaking space, and the
# values are presumably the spelling used in expression_df["csv"] — confirm both
# against the raw data.
replacement_dict = {
    "Staphylococcus��aureus MRSA252.csv": "Staphylococcus\xa0aureus MRSA252.csv",
    "Staphylococcus��aureus MSSA476.csv": "Staphylococcus\xa0aureus MSSA476.csv",
    "Staphylococcus��epidermidis 1457.csv": "Staphylococcus\xa0epidermidis 1457.csv",
}
# Whole-value replacement: entries that are not keys of the dict stay unchanged.
upstream_df["csv"] = upstream_df["csv"].replace(replacement_dict)
# Collect the (csv, region) key pairs of both tables; the checks below and the
# following cells work on these sets.
species_with_regions_exp = list(zip(expression_df['csv'], expression_df['region']))
species_with_regions_upstream = list(zip(upstream_df['csv'], upstream_df['region']))

exp_set = set(species_with_regions_exp)
upstream_set = set(species_with_regions_upstream)

# Every file name in the upstream table must also occur in the expression data.
assert set(upstream_df["csv"]) <= set(expression_df["csv"]), (
    "upstream_df contains csv names that are missing from expression_df"
)
# Iterating a DataFrame yields its column labels, so set(df[["csv", "region"]])
# is just {"csv", "region"} and comparing such sets checks nothing. Compare the
# actual (csv, region) tuples instead.
assert upstream_set <= exp_set, (
    f"{len(upstream_set - exp_set)} (csv, region) pairs of upstream_df are missing from expression_df"
)

In [None]:
# Number of (csv, region) pairs present in the expression data but absent from the upstream table.
len(exp_set.difference(upstream_set))

8677

In [None]:
# Attach the upstream sequence to every expression row. The left join keeps
# expression rows without an upstream entry; their "upstream200" is NaN after
# the merge. Both frames carry a "chromosome" column that is not a join key, so
# pandas suffixes them as chromosome_x / chromosome_y.
# NOTE(review): if (csv, region) is not unique in upstream_df the join duplicates
# expression rows — confirm whether "chromosome" should be part of the key.
merged_df = pd.merge(
        expression_df, upstream_df, on=["csv", "region"], how="left"
    )
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that form is chained assignment, which is deprecated and
# leaves merged_df unchanged under pandas Copy-on-Write.
merged_df["upstream200"] = merged_df["upstream200"].fillna("")
# Alphabet a sequence may be built from (upper-case bases only)
valid_chars = {'A', 'T', 'C', 'G'}


def is_valid_sequence(seq):
    """
    Tell whether a sequence is non-empty and made up solely of valid_chars.

    Parameters:
    seq (str): The sequence to check.

    Returns:
    bool: True if seq has at least one character and all of them are in
    valid_chars, otherwise False.
    """
    if len(seq) == 0:
        return False
    return valid_chars.issuperset(seq)

# Index labels of the rows whose upstream sequence fails the validity check
# (empty after the fillna above, or containing characters outside valid_chars)
invalid_indices = merged_df.loc[~merged_df['upstream200'].apply(is_valid_sequence)].index.tolist()

# Remove those rows, then discard rows that have no species
merged_df = merged_df.drop(index=invalid_indices)
merged_df = merged_df.dropna(subset=['species'])

In [None]:
# Flag rows whose region string contains "complement", e.g. "complement(7271..8317)".
merged_df['is_complement'] = merged_df['region'].str.contains('complement')

def complement_dna(sequence):
    """
    Return the base-wise complement of a DNA sequence (A<->T, C<->G).

    The bases keep their original order; the sequence is not reversed.

    Parameters:
    sequence (str): Sequence consisting of the characters A, T, C and G.

    Returns:
    str: The complemented sequence, same length as the input.

    Raises:
    KeyError: If the sequence contains any other character.
    """
    pairing = dict(zip("ATCG", "TAGC"))
    swapped = [pairing[nucleotide] for nucleotide in sequence]
    return "".join(swapped)

# Replace the upstream sequence with its base-wise complement wherever
# is_complement is truthy. Walking the two relevant columns in parallel avoids
# DataFrame.apply(axis=1), which builds a Series for every row of this wide
# frame only to read two fields and returns a DataFrame when the frame is empty.
# NOTE(review): the sequence is complemented but not reversed — confirm this is
# the intended orientation for complement-strand genes.
merged_df['upstream200'] = [
    complement_dna(sequence) if on_complement else sequence
    for sequence, on_complement in zip(merged_df['upstream200'], merged_df['is_complement'])
]
merged_df

Unnamed: 0,species,csv,chromosome_x,region,legip_as_1_ge_tpm,legip_as_2_ge_tpm,legip_as_3_ge_tpm,legip_bs_1_ge_tpm,legip_bs_2_ge_tpm,legip_bs_3_ge_tpm,...,mssa_sp_3_ge_tpm,mssa_tm_1_ge_tpm,mssa_tm_2_ge_tpm,mssa_tm_3_ge_tpm,mssa_vic_1_ge_tpm,mssa_vic_2_ge_tpm,mssa_vic_3_ge_tpm,chromosome_y,upstream200,is_complement
0,Legionella pneumophila subsp. pneumophila Phil...,Legionella pneumophila subsp. pneumophila Phil...,NC_002942,654..2012,580.439919,602.800116,574.695008,,,,...,,,,,,,,NC_002942,TATAATTTATTGATTACTCAGATAACATATAGATTGAAATCTTTTT...,False
1,Legionella pneumophila subsp. pneumophila Phil...,Legionella pneumophila subsp. pneumophila Phil...,NC_002942,2026..3129,421.775789,418.556449,414.811704,,,,...,,,,,,,,NC_002942,ACGGCAAATGGCAATGGCTTTGAGTAAAGAGTTGACCAATCATAGT...,False
2,Legionella pneumophila subsp. pneumophila Phil...,Legionella pneumophila subsp. pneumophila Phil...,NC_002942,3126..4187,80.058162,79.188850,81.805868,,,,...,,,,,,,,NC_002942,GAAGAGGCAATTGAAACATTGAGTGCCGAAACACAAGGTGATGAAC...,False
3,Legionella pneumophila subsp. pneumophila Phil...,Legionella pneumophila subsp. pneumophila Phil...,NC_002942,4482..6902,430.112152,428.536725,434.874920,,,,...,,,,,,,,NC_002942,AAGGTAAACATATGATATTCTACAAATGGCTCATCAAATGTCATGG...,False
4,Legionella pneumophila subsp. pneumophila Phil...,Legionella pneumophila subsp. pneumophila Phil...,NC_002942,complement(7271..8317),208.971696,189.009620,202.627915,,,,...,,,,,,,,NC_002942,AAACTCGACCTACTAAATCAGTCGCCGCTGTGGCATTGTTTGCACA...,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105263,Staphylococcus aureus MSSA476,Staphylococcus aureus MSSA476.csv,NC_005951,15874..16719,,,,,,,...,886.394443,3341.997864,3225.268694,2834.525864,897.134109,720.273906,686.861587,NC_005951,AATTAAAATAGCGTTTTAATATGTACCTAAAAAAGACTAACAATAG...,False
105264,Staphylococcus aureus MSSA476,Staphylococcus aureus MSSA476.csv,NC_005951,17088..17621,,,,,,,...,3.022411,5.856298,2.355629,2.839408,0.789759,3.438108,0.995357,NC_005951,AATCATAAATAATAGATGAATAGTTTAATTATAGGTGTTCATCAAT...,False
105265,Staphylococcus aureus MSSA476,Staphylococcus aureus MSSA476.csv,NC_005951,18035..18328,,,,,,,...,1668.178253,357.788302,393.019150,356.790884,724.641187,517.687874,491.294982,NC_005951,GGTAGTGAAATTGTATTTATTATACATTGTCATATCGTTGTTTAAT...,False
105266,Staphylococcus aureus MSSA476,Staphylococcus aureus MSSA476.csv,NC_005951,18403..20337,,,,,,,...,73.921385,38.493913,35.290084,42.527493,87.470160,95.260649,101.840447,NC_005951,AATCATGGTTCAACAGCAGTAGGGAAATATTCATCTTTTAGTGGTG...,False


In [None]:
# Keep only species, upstream sequence and the TPM columns, renumber the rows,
# and write the table to Drive before reading it back from the CSV.
merged_df = merged_df[['species', 'upstream200'] + tpm_columns].reset_index(drop=True)
merged_df.to_csv("/content/drive/MyDrive/MLRG/merged.csv")
print("Number of species: ", len(set(merged_df["species"])))
merged_df = pd.read_csv("/content/drive/MyDrive/MLRG/merged.csv", index_col=0)

# Long format: one row per (species, upstream sequence, TPM column) that has a value.
melted_df = merged_df.melt(
    id_vars=['species', 'upstream200'], var_name='condition', value_name='tpm'
).dropna(subset=['tpm'])

# Strip the "_ge_tpm" suffix, then split at the last underscore into the stress
# condition and the trailing part of the column name (stored as "evaluation").
melted_df[['stress_condition', 'evaluation']] = (
    melted_df['condition'].str.replace('_ge_tpm', '').str.rsplit('_', n=1, expand=True)
)
melted_df = melted_df.drop(columns=['condition'])


Number of species:  2255


In [None]:
# Average the TPM values within each (species, upstream sequence, stress condition) group.
mean_df = (
    melted_df
    .groupby(['species', 'upstream200', 'stress_condition'])['tpm']
    .mean()
    .rename('mean_tpm')
    .reset_index()
)

# Distinct (species, upstream sequence) combinations in the wide table
subset_df = merged_df[['species', 'upstream200']]
unique_pairs = subset_df.drop_duplicates()
num_unique_pairs = len(unique_pairs)

# Size of the full grid: every distinct pair combined with every stress condition
expected_num_of_tpms = mean_df['stress_condition'].nunique() * num_unique_pairs
print(f'{expected_num_of_tpms} is the expected number of tpms')
print(f'{len(mean_df)} is the actual number of tpms')
print(f'{expected_num_of_tpms - len(mean_df)} is the number of NaNs')
print(f'{len(mean_df)/expected_num_of_tpms * 100}% of the data is present')

33562550 is the expected number of tpms
1124709 is the actual number of tpms
32437841 is the number of NaNs
3.3510832758535924% of the data is present


In [None]:
# Count the groups whose mean TPM is exactly zero and relate that count to the
# full grid size and to the number of rows that actually have a value.
num_of_zero_tpms = mean_df['mean_tpm'].eq(0).sum()
print(f'{num_of_zero_tpms} is the number of 0s')
print(f'{num_of_zero_tpms/expected_num_of_tpms * 100}% of the original data is 0')
print(f'{num_of_zero_tpms/len(mean_df) * 100}% of the data after dropping nans is 0')

50250 is the number of 0s
0.1497204473438401% of the original data is 0
4.4678223433794875% of the data after dropping nans is 0


In [None]:
# Display the mean TPM per (species, upstream sequence, stress condition).
mean_df

Unnamed: 0,species,upstream200,stress_condition,mean_tpm
0,Achromobacter xylosoxidans SOLR10,AAAAAAGGCGGGCAGGATGAAGAGCGAACGGCCGCGTCACGGCAGT...,achx_as,0.000000
1,Achromobacter xylosoxidans SOLR10,AAAAAAGGCGGGCAGGATGAAGAGCGAACGGCCGCGTCACGGCAGT...,achx_bs,0.000000
2,Achromobacter xylosoxidans SOLR10,AAAAAAGGCGGGCAGGATGAAGAGCGAACGGCCGCGTCACGGCAGT...,achx_ctrl,0.000000
3,Achromobacter xylosoxidans SOLR10,AAAAAAGGCGGGCAGGATGAAGAGCGAACGGCCGCGTCACGGCAGT...,achx_li,0.000000
4,Achromobacter xylosoxidans SOLR10,AAAAAAGGCGGGCAGGATGAAGAGCGAACGGCCGCGTCACGGCAGT...,achx_mig,0.000000
...,...,...,...,...
1124704,Vibrio cholerae O1 biovar El Tor str. N16961,TTTTTTGACCGCTAATTAAGTGTTACTATACCTCGCTTGTCAGCCA...,vibrio_oss,253.839341
1124705,Vibrio cholerae O1 biovar El Tor str. N16961,TTTTTTGACCGCTAATTAAGTGTTACTATACCTCGCTTGTCAGCCA...,vibrio_oxs,140.473814
1124706,Vibrio cholerae O1 biovar El Tor str. N16961,TTTTTTGACCGCTAATTAAGTGTTACTATACCTCGCTTGTCAGCCA...,vibrio_sp,181.034161
1124707,Vibrio cholerae O1 biovar El Tor str. N16961,TTTTTTGACCGCTAATTAAGTGTTACTATACCTCGCTTGTCAGCCA...,vibrio_tm,293.903547


In [None]:
# Save the averaged table to Drive; the DataFrame index is written as well because index=False is not passed.
mean_df.to_csv("/content/drive/MyDrive/MLRG/mean_tpm.csv")