In [1]:
import pandas as pd

import os
import re

from sklearn import preprocessing
from Bio import SeqIO

In [2]:
def read_csv_file(file_path):
    try:
        return pd.read_csv(file_path, delimiter=';', encoding='utf-8', low_memory=False)
    except pd.errors.ParserError:
        print(f"Error parsing {file_path}")
        return pd.DataFrame()

In [3]:
data_path = f'{os.getcwd()}/data/data_expression'
all_files = [os.path.join(data_path, file) for file in os.listdir(data_path) if file.endswith('.csv')]

df_list = [read_csv_file(file) for file in all_files]
df = pd.concat(df_list, ignore_index=True)

In [4]:
category_mappings = {}
multi_categories = ["Respiration", "Gram_St", "Chromosome", "Species"]
binary_categories = ["Oxidase", "Catalase"]

intersection = set(binary_categories).intersection(set(multi_categories))

while intersection:
    multi_categories.remove(intersection.pop())

le = preprocessing.LabelEncoder()
for category in multi_categories:
    df[category] = le.fit_transform(df[category])
    category_mappings[category] = dict(zip(le.classes_, le.transform(le.classes_)))


lb = preprocessing.LabelBinarizer()
for category in binary_categories:
    # Fit LabelBinarizer only on non-NaN values and transform them
    non_nan_mask = df[category].notna()
    encoded_values = lb.fit_transform(df.loc[non_nan_mask, category])

    # Convert the result to a DataFrame to match the original index
    encoded_df = pd.DataFrame(encoded_values, index=df[non_nan_mask].index, columns=[category])

    # Combine the encoded values with the original DataFrame, preserving NaNs
    df[category] = encoded_df[category]
    category_mappings[category] = dict(zip(le.classes_, le.transform(le.classes_)))

 
df.reset_index(drop=True, inplace=True)

In [5]:
# Extract and clean regions
df['Is_Complement'] = df['Region'].str.contains('complement')
df['Region_Clean'] = df['Region'].str.replace('complement\(|\)', '', regex=True)
df['Is_circular'] = df['Region_Clean'].str.contains('join')
df['Region_Clean'] = df['Region_Clean'].str.replace('join\(|\)', '', regex=True)

# Initialize dictionary to store results for each species
species_complements = {}

# Group by species
grouped = df.groupby('Species')

# Process each group
for species, group in grouped:
    regions = group.loc[~group['Is_Complement'], 'Region_Clean'].unique()
    complements = group.loc[group['Is_Complement'], 'Region_Clean'].unique()
    
    # It is important to check that there are no regions that are both complements and not complements!!!
    assert len([region for region in regions if region in complements]) == 0
    
    # Store results for this species
    species_complements[species] = {
        'Regions_without_complements': [region for region in regions if region not in complements],
        'Complements_only': [complement for complement in complements if complement not in regions]
    }

# Print the results
for species, results in species_complements.items():
    print(f"Species: {species}")
    print("  Regions without complements:", results['Regions_without_complements'])
    print("  Complements only:", results['Complements_only'])
    print()


  df['Is_Complement'] = df['Region'].str.contains('complement')
  df['Region_Clean'] = df['Region'].str.replace('complement\(|\)', '', regex=True)
  df['Is_circular'] = df['Region_Clean'].str.contains('join')


Species: 0
  Regions without complements: ['6070..7080', '7132..8139', '8177..9859', '9879..10640', '15327..16208', '26536..27702', '33699..35090', '35782..37119', '38639..40225', '40248..41222', '41219..42136', '42148..43143', '43140..44156', '44289..45491', '45717..46631', '46663..46848', '46961..47887', '52915..54375', '54372..55487', '55583..58030', '59458..60354', '61645..62331', '62447..64039', '64079..64783', '64807..65067', '65098..67092', '67104..67559', '68738..69646', '69751..70503', '70512..71201', '71253..71987', '71993..73051', '73092..73997', '74540..75436', '75646..76809', '78304..79374', '79376..80263', '80279..81073', '81111..81440', '81437..82816', '82813..83985', '83982..85049', '85140..86078', '86455..87288', '88428..89408', '89449..91506', '91499..93229', '94245..95276', '95333..96388', '96385..97251', '97253..98053', '98053..99210', '99194..99502', '99499..100944', '100954..102117', '102114..102500', '108887..109801', '109927..110853', '111020..111631', '111789..

In [6]:
# Function to parse region and extract start, end, and length
def parse_region(region):
    start, *intermediate, end = region.split('..')
    if not intermediate:
        end = int(end)
        start = int(start)
        length = end - start + 1
        return start, end, length
    else:
        first_end, second_start = intermediate[0].split(',')   
        length1 = int(first_end) - int(start) + 1
        length2 = int(end) - int(second_start) + 1
        return int(start), int(end), length1 + length2

# Apply the function to the DataFrame
df[['Region_Start', 'Region_End', 'Region_Length']] = df['Region_Clean'].apply(lambda x: pd.Series(parse_region(x)))

  df[['Region_Start', 'Region_End', 'Region_Length']] = df['Region_Clean'].apply(lambda x: pd.Series(parse_region(x)))
  df[['Region_Start', 'Region_End', 'Region_Length']] = df['Region_Clean'].apply(lambda x: pd.Series(parse_region(x)))
  df[['Region_Start', 'Region_End', 'Region_Length']] = df['Region_Clean'].apply(lambda x: pd.Series(parse_region(x)))


In [7]:
df

Unnamed: 0,No,Respiration,Oxidase,Catalase,Gram_St,Species,Chromosome,Region,New_locus_tag,Old_locus_tag,...,NGON_Vic_3 (GE) - Total counts,NGON_Vic_3 (GE) - RPKM,NGON_Vic_3 (GE) - TPM,NGON_Vic_3 (GE) - CPM,Is_Complement,Region_Clean,Is_circular,Region_Start,Region_End,Region_Length
0,91338,2,0.0,0,2,2252,39,1..1362,SPD_RS00005,SPD_0001,...,,,,,False,1..1362,False,1,1362,1362
1,91339,2,0.0,0,2,2252,39,1521..2657,SPD_RS00010,SPD_0002,...,,,,,False,1521..2657,False,1521,2657,1137
2,91340,2,0.0,0,2,2252,39,2722..2916,SPD_RS00015,SPD_0003,...,,,,,False,2722..2916,False,2722,2916,195
3,91341,2,0.0,0,2,2252,39,3000..4115,SPD_RS00020,SPD_0004,...,,,,,False,3000..4115,False,3000,4115,1116
4,91342,2,0.0,0,2,2252,39,4186..4755,SPD_RS00025,SPD_0005,...,,,,,False,4186..4755,False,4186,4755,570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105083,63547,0,1.0,1,1,2243,29,complement(2151190..2152827),,NGO2178,...,3696.0,351.569874,424.146435,954.527459,True,2151190..2152827,False,2151190,2152827,1638
105084,63548,0,1.0,1,1,2243,29,2152852..2152986,,NGO2179,...,12.0,13.849722,16.708799,3.099115,False,2152852..2152986,False,2152852,2152986,135
105085,63549,0,1.0,1,1,2243,29,complement(2153000..2153221),,NGO2180,...,275.0,193.007110,232.850661,71.021388,True,2153000..2153221,False,2153000,2153221,222
105086,63550,0,1.0,1,1,2243,29,complement(2153286..2153651),,NGO2181,...,530.0,225.625599,272.202770,136.877585,True,2153286..2153651,False,2153286,2153651,366


In [8]:
def read_csv_file(file_path):
    try:
        return pd.read_csv(file_path, delimiter=';', encoding='utf-8', low_memory=False)
    except pd.errors.ParserError:
        print(f"Error parsing {file_path}")
        return pd.DataFrame()

def parse_fna_file(filepath):
    return list(SeqIO.parse(filepath, "fasta"))

def parse_gff_file(filepath):
    return pd.read_csv(filepath, sep="\t", comment='#', header=None,
                       names=["seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes"])

def extract_gc_content(sequence):
    seq_str = str(sequence.seq)
    gc_content = (seq_str.count('G') + seq_str.count('C')) / len(seq_str)
    return gc_content

def calculate_rpkm(counts, length_kb, total_reads_million):
    return counts / (length_kb * total_reads_million)

def calculate_tpm(df):
    df['RPK'] = df['counts'] / df['length_kb']
    sum_rpk = df['RPK'].sum()
    df['TPM'] = (df['RPK'] * 1e6) / sum_rpk
    return df


In [9]:
data_path = f"{os.getcwd()}/data/data_sequences/"
fna_files = [os.path.join(data_path, f) for f in os.listdir(data_path) if f.endswith('.fna')]
gff_files = [os.path.join(data_path, f) for f in os.listdir(data_path) if f.endswith('.gff')]

all_features = []

for fna_file in fna_files:
    sequences = parse_fna_file(fna_file)
    for seq in sequences:
        gc_content = extract_gc_content(seq)
        try:
            species, sequence_type = seq.description[len(seq.id)+1:].split(", ")
        except:
            species = "Achromobacter xylosoxidans strain SOLR10"
            if seq.description.endswith("chromosome"):
                sequence_type = "chromosome"
            else:
                sequence_type = "plasmoid"
                
        file_name = os.path.basename(fna_file)[:-4]
        region, region_id = file_name.split(".")
        chromosome, chromosome_id = seq.id.split(".")
        all_features.append({"Region":region, "Region_ID": region_id,
                             "Chromosome":chromosome, "Chromosome_ID": chromosome_id,
                             'Species': species, "Sequence_Type": sequence_type,
                             'gc_content': gc_content, 'length_kb': len(seq) / 1000})

features_df = pd.DataFrame(all_features)

# Assuming you have a corresponding GFF file for each FNA file
for gff_file in gff_files:
    gff_data = parse_gff_file(gff_file)
    # Here, you can integrate gff_data with features_df if needed
    # Example: Merging based on 'seq_id' or other relevant columns

# total_reads_million = features_df['gc_content'].sum() / 1e6  # Adjust this based on actual read counts
# features_df['RPKM'] = calculate_rpkm(features_df['gc_content'], features_df['length_kb'], total_reads_million)
# features_df = calculate_tpm(features_df)

# print(features_df.head())

In [10]:
# Function to split and parse the string
def parse_column_values(column_value):
    key_value_pairs = column_value.split(';')
    parsed_dict = {}
    for pair in key_value_pairs:
        print(pair)
        key, val = pair.split('=')
        parsed_dict[key] = val
    return parsed_dict

def split_complex_values(value):
    # Initialize the parts with None
    part1, part2, part3, part4 = None, None, None, None

    if value:
        # Split based on '-'
        if '-' in value:
            part1, remainder = value.split('-', 1)
        else:
            part1, remainder = None, value

        # Split based on ':'
        if remainder and ':' in remainder:
            part2, remainder = remainder.split(':', 1)
        else:
            part2, remainder = remainder, ""

        # Split based on '..'
        if remainder:
            part4, part5 = remainder.split('..', 1)
        else:
            part4, part5 = None, None
        
        # Split based on '.'
        if part2 and '.' in part2:
            part2, part3 = part2.split('.', 1)
        else:
            part2, part3 = part2, None

    return part1, part2, part3, part4, part5


# Applying the function to the DataFrame column
parsed_data = gff_data['attributes'].apply(parse_column_values)

# Creating a new DataFrame with expanded columns
expanded_df = pd.DataFrame(parsed_data.tolist())

# Applying the complex value split to the 'ID' column
expanded_df[['seq_type', 'Chromosome', 'Chromosome_ID', 'Region_Start', 'Region_End']] = expanded_df['ID'].apply(lambda x: pd.Series(split_complex_values(x)))

# Dropping the original 'ID' column after expansion
expanded_df.drop(columns=['ID'], inplace=True)

# Concatenating with the original DataFrame
gff_data = pd.concat([gff_data, expanded_df], axis=1)

ID=NC_008767.1:1..2194961
Dbxref=taxon:272831
Is_circular=true
Name=ANONYMOUS
country=USA
gbkey=Src
genome=chromosome
mol_type=genomic DNA
serotype=C
strain=FAM18
ID=gene-NMC_RS00010
Name=lpxC
gbkey=Gene
gene=lpxC
gene_biotype=protein_coding
locus_tag=NMC_RS00010
old_locus_tag=NMC0001
ID=cds-WP_002220060.1
Parent=gene-NMC_RS00010
Dbxref=GenBank:WP_002220060.1
Name=WP_002220060.1
Ontology_term=GO:0009245,GO:0008759
gbkey=CDS
gene=lpxC
go_function=UDP-3-O-[3-hydroxymyristoyl] N-acetylglucosamine deacetylase activity|0008759||IEA
go_process=lipid A biosynthetic process|0009245||IEA
inference=COORDINATES: similar to AA sequence:RefSeq:WP_002246555.1
locus_tag=NMC_RS00010
product=UDP-3-O-acyl-N-acetylglucosamine deacetylase
protein_id=WP_002220060.1
transl_table=11
ID=gene-NMC_RS00015
Name=NMC_RS00015
end_range=3610,.
gbkey=Gene
gene_biotype=pseudogene
locus_tag=NMC_RS00015
old_locus_tag=NMC0002
partial=true
pseudo=true
ID=cds-NMC_RS00015
Parent=gene-NMC_RS00015
Note=incomplete%3B partial i

In [11]:
len(gff_data), gff_data["country"].isna().sum()

(4453, np.int64(4452))

In [12]:
len(gff_data), gff_data["genome"].isna().sum()

(4453, np.int64(4452))

In [13]:
len(gff_data), gff_data["regulatory_class"].isna().sum()

(4453, np.int64(4448))

In [14]:
len(gff_data), gff_data["mol_type"].isna().sum()

(4453, np.int64(4452))

In [15]:
len(gff_data), gff_data["bound_moiety"].isna().sum()

(4453, np.int64(4448))

In [16]:
len(gff_data), gff_data["serotype"].isna().sum()

(4453, np.int64(4452))

In [17]:
len(gff_data), gff_data["exception"].isna().sum()

(4453, np.int64(4451))

In [18]:
len(gff_data), gff_data["gene_synonym"].isna().sum()

(4453, np.int64(4448))

In [19]:
gff_data.drop(columns=['attributes', "Dbxref", "score",
                       "country", "genome",
                       "regulatory_class", "mol_type",
                       "bound_moiety", "serotype",
                       "exception", "gene_synonym"], inplace=True)

In [20]:
gff_data['Is_Complement'] = gff_data['strand'].map({'+': True, '-': False})
gff_data.drop(columns=['strand'], inplace=True)
# Map "true" to True and "false" to False
gff_data['Is_circular'] = gff_data['Is_circular'].map({'true': True, 'false': False})
# Replace NaN values with False
gff_data['Is_circular'] = gff_data['Is_circular'].fillna(False)
gff_data["partial"] = gff_data["partial"].fillna(False)

  gff_data['Is_circular'] = gff_data['Is_circular'].fillna(False)


In [21]:
gff_data.columns

Index(['seqid', 'source', 'type', 'start', 'end', 'phase', 'Is_circular',
       'Name', 'gbkey', 'strain', 'gene', 'gene_biotype', 'locus_tag',
       'old_locus_tag', 'Parent', 'Ontology_term', 'go_function', 'go_process',
       'inference', 'product', 'protein_id', 'transl_table', 'end_range',
       'partial', 'pseudo', 'Note', 'start_range', 'go_component', 'anticodon',
       'seq_type', 'Chromosome', 'Chromosome_ID', 'Region_Start', 'Region_End',
       'Is_Complement'],
      dtype='object')

In [22]:
gff_data

Unnamed: 0,seqid,source,type,start,end,phase,Is_circular,Name,gbkey,strain,...,Note,start_range,go_component,anticodon,seq_type,Chromosome,Chromosome_ID,Region_Start,Region_End,Is_Complement
0,NC_008767.1,RefSeq,region,1,2194961,.,True,ANONYMOUS,Src,FAM18,...,,,,,,NC_008767,1,1,2194961,True
1,NC_008767.1,RefSeq,gene,1261,2184,.,False,lpxC,Gene,,...,,,,,gene,NMC_RS00010,,,,False
2,NC_008767.1,Protein Homology,CDS,1261,2184,0,False,WP_002220060.1,CDS,,...,,,,,cds,WP_002220060,1,,,False
3,NC_008767.1,RefSeq,pseudogene,3341,3610,.,False,NMC_RS00015,Gene,,...,,,,,gene,NMC_RS00015,,,,False
4,NC_008767.1,Protein Homology,CDS,3341,3610,0,False,,CDS,,...,incomplete%3B partial in the middle of a conti...,,,,cds,NMC_RS00015,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4448,NC_008767.1,Protein Homology,CDS,2191558,2192007,0,False,WP_002221764.1,CDS,,...,,,,,cds,WP_002221764,1,,,False
4449,NC_008767.1,RefSeq,gene,2192048,2193319,.,False,waaA,Gene,,...,,,,,gene,NMC_RS11385,,,,False
4450,NC_008767.1,Protein Homology,CDS,2192048,2193319,0,False,WP_041423265.1,CDS,,...,,,,,cds,WP_041423265,1,,,False
4451,NC_008767.1,RefSeq,gene,2193382,2194830,.,False,gnd,Gene,,...,,,,,gene,NMC_RS11390,,,,False
