# Converting files to FASTA format and renaming them
# 1. Check the files (FASTA & naming)
#### The dataset is not homogeneous: it mixes different naming systems and different file formats, so this Jupyter notebook focuses on standardizing the dataset
https://biopython.org/wiki/SeqIO

Install FASTA Validator to check that the conversion was successful:
```bash
conda install -c bioconda py_fasta_validator
```
or
```bash
pip install py-fasta-validator
```
The validator exits with one of the following return codes:

```0``` this is a valid fasta file

```1``` the first line does not start with a > (rule 1 violated).

```2``` there are duplicate sequence identifiers in the file (rule 7 violated)

```4``` there are characters in a sequence line other than [A-Za-z]

0. Importation of packages:

In [10]:
#!pip install py-fasta-validator

In [11]:
import pandas as pd
from Bio import SeqIO
import glob
import os
import FastaValidator
import subprocess

## A. Change Formats to FASTA

1. Output directory path

In [12]:
# Destination directory that the conversion cells write their .fasta files into.
# The path is hard-coded; uncomment the input() line to be prompted for it instead.
#output_folder = input("Output directory:")
output_folder = "/Users/MiladM-Dev/Documents/1PhD/Dataset_trials/D8_mimic/fasta"

2. Convert gb to fasta

In [13]:
# Folder holding the GenBank inputs (uncomment the input() line to be prompted)
#input_folder = input("genebank directory:")
input_folder = "/Users/MiladM-Dev/Documents/1PhD/Dataset_trials/D8_mimic/gb"

# Collect every *.gb file located directly inside the input folder
gb_pattern = os.path.join(input_folder, "*.gb")
genbank_files = glob.glob(gb_pattern)

# Write one FASTA file per GenBank file, reusing the original file stem
for gb_file in genbank_files:
    gb_name = os.path.basename(gb_file)
    base_name = os.path.splitext(gb_name)[0]  # file name without its extension
    fasta_file = os.path.join(output_folder, f"{base_name}.fasta")

    SeqIO.convert(gb_file, "genbank", fasta_file, "fasta")
    print(f"Converted {gb_file} to {fasta_file}")



Converted /Users/MiladM-Dev/Documents/1PhD/Dataset_trials/D8_mimic/gb/13-00122Rsp.gb to /Users/MiladM-Dev/Documents/1PhD/Dataset_trials/D8_mimic/fasta/13-00122Rsp.fasta
Converted /Users/MiladM-Dev/Documents/1PhD/Dataset_trials/D8_mimic/gb/13-00122Rsp copy.gb to /Users/MiladM-Dev/Documents/1PhD/Dataset_trials/D8_mimic/fasta/13-00122Rsp copy.fasta


3. Convert txt to fasta

In [14]:
# Folder scanned for the plain-text (*.txt) inputs converted in this cell.
# NOTE(review): the commented-out prompt below still says "genebank directory";
# it looks copied from the GenBank cell - confirm before re-enabling it.
#input_folder = input("genebank directory:")
input_folder = "/Users/MiladM-Dev/Documents/1PhD/Dataset_trials/D8_mimic/txt"


def txt_to_fasta(input_file, output_file):
    """Copy a text file to ``output_file`` with blank lines dropped.

    Each line is stripped of surrounding whitespace; lines that end up empty
    are skipped and every remaining line is written followed by a single
    newline. Header lines (starting with ">") and sequence lines are written
    the same way, and no header is ever added.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the file to create (or overwrite).
    """
    with open(input_file, 'r') as source, open(output_file, 'w') as target:
        stripped_lines = (raw.strip() for raw in source)
        # Headers and sequence lines need no separate treatment here.
        target.writelines(f"{text}\n" for text in stripped_lines if text)

# Gather every *.txt file located directly inside the input folder
text_files = glob.glob(os.path.join(input_folder, "*.txt"))

# Run each text file through txt_to_fasta, reusing the original file stem
for text_file in text_files:
    txt_name = os.path.basename(text_file)
    base_name = os.path.splitext(txt_name)[0]  # file name without its extension
    fasta_file = os.path.join(output_folder, f"{base_name}.fasta")

    txt_to_fasta(text_file, fasta_file)
    print(f"Converted {text_file} to {fasta_file}")


Converted /Users/MiladM-Dev/Documents/1PhD/Dataset_trials/D8_mimic/txt/07-00235.txt to /Users/MiladM-Dev/Documents/1PhD/Dataset_trials/D8_mimic/fasta/07-00235.fasta
Converted /Users/MiladM-Dev/Documents/1PhD/Dataset_trials/D8_mimic/txt/07-00188.txt to /Users/MiladM-Dev/Documents/1PhD/Dataset_trials/D8_mimic/fasta/07-00188.fasta


4. FASTA validator

In [6]:
# Validate every .fasta file in fasta_dir with the py_fasta_validator command
# and record what it reports, both on screen and in a results text file.
#fasta_dir = input("FASTA dir:")
fasta_dir = output_folder
output_testfile = "/Users/MiladM-Dev/Documents/1PhD/Dataset_trials/D8_mimic/fasta/validation_results.txt"

with open(output_testfile, "w") as out_file:
    for fasta_file in glob.glob(os.path.join(fasta_dir, "*.fasta")):
        # Start each report entry with the path of the file being checked
        out_file.write(f"{fasta_file}\n")
        print(f"{fasta_file}")

        # Invoke the validator, capturing both of its output streams as text
        result = subprocess.run(
            ["py_fasta_validator", "-f", fasta_file],
            capture_output=True,
            text=True,
        )

        # Echo whatever the validator printed: stdout first, then stderr
        for stream_text in (result.stdout, result.stderr):
            message = stream_text.strip()
            if message:
                out_file.write(f"{message}\n")
                print(message)

        # The exit code is the actual verdict (0 means a valid FASTA file)
        out_file.write(f"Exit code: {result.returncode}\n\n")
        print(f"Exit code: {result.returncode}")

/Users/MiladM-Dev/Documents/1PhD/Dataset_trials/D8_mimic/fasta/13-00122Rsp.fasta
Exit code: 0
/Users/MiladM-Dev/Documents/1PhD/Dataset_trials/D8_mimic/fasta/13-00122Rsp copy.fasta
Exit code: 0
/Users/MiladM-Dev/Documents/1PhD/Dataset_trials/D8_mimic/fasta/07-00235.fasta
Exit code: 0
/Users/MiladM-Dev/Documents/1PhD/Dataset_trials/D8_mimic/fasta/14-00356 R copy.fasta
Exit code: 0
/Users/MiladM-Dev/Documents/1PhD/Dataset_trials/D8_mimic/fasta/07-00188.fasta
Exit code: 0
/Users/MiladM-Dev/Documents/1PhD/Dataset_trials/D8_mimic/fasta/14-00356 R.fasta
Exit code: 0


## B. Edit file name

In [15]:
# Remove every space from the names of the entries in the FASTA folder.
# The working directory is switched so the bare names returned by
# os.listdir() can be passed straight to os.rename().
os.chdir("/Users/MiladM-Dev/Documents/1PhD/Dataset_trials/D8_mimic/fasta/")
print(os.getcwd())

for f in os.listdir():
    new_name = f.replace(" ", "")
    if new_name == f:
        continue  # name has no spaces, nothing to rename
    if os.path.exists(new_name):
        # os.rename() silently replaces an existing file on POSIX, which would
        # destroy data when two names collapse to the same space-free name.
        print(f"Skipped {f}: {new_name} already exists")
        continue
    os.rename(f, new_name)

/Users/MiladM-Dev/Documents/1PhD/Dataset_trials/D8_mimic/fasta


## C. Counting the sample names 

In [16]:
import os
import csv
from collections import Counter

# Folder whose file names are scanned for sample labels; the CSV summary
# produced by this cell is written into the same folder.
folder_dir = "/Users/MiladM-Dev/Documents/1PhD/Dataset_trials/D8_mimic/fasta/"

# Extract strings from filenames and count occurrences
def extract_and_count(folder_path):
    """Tally the alphabetic labels found in the file names of a folder.

    For every regular file in ``folder_path`` the extension is removed and the
    name is cut at its first dash (e.g. "13-00125Rspcopy.fasta"). The letters
    of the part after the dash, with digits and any other characters dropped,
    form the label that is counted. Names without a dash are ignored; a name
    whose tail holds no letters is counted under the empty string.

    Args:
        folder_path: Directory whose entries are inspected (not recursive).

    Returns:
        A ``collections.Counter`` mapping each label to its number of files.
    """
    tally = Counter()

    for entry in os.listdir(folder_path):
        # Sub-directories and other non-file entries are not samples
        if not os.path.isfile(os.path.join(folder_path, entry)):
            continue

        stem = os.path.splitext(entry)[0]
        _head, dash, tail = stem.partition("-")
        if not dash:
            continue  # no dash, so there is no label part to read

        label = "".join(ch for ch in tail if ch.isalpha())
        tally[label] += 1

    return tally

# Write the counts to a CSV file
def write_to_csv(output_file, counts):
    """Save label counts as a two-column CSV file.

    The file starts with the header row ``String,Count`` followed by one row
    per item of ``counts``, in the mapping's iteration order.

    Args:
        output_file: Path of the CSV file to create (or overwrite).
        counts: Mapping of label -> count, e.g. a ``collections.Counter``.
    """
    header = ["String", "Count"]
    # newline='' lets the csv module control the line endings itself
    with open(output_file, mode='w', newline='') as handle:
        table = csv.writer(handle)
        table.writerow(header)
        table.writerows([label, total] for label, total in counts.items())

# Main execution
if __name__ == "__main__":
    # The summary is stored next to the files it describes
    output_csv = os.path.join(folder_dir, "string_counts.csv")
    counts = extract_and_count(folder_dir)
    write_to_csv(output_csv, counts)
    print(f"String counts saved to {output_csv}")


String counts saved to /Users/MiladM-Dev/Documents/1PhD/Dataset_trials/D8_mimic/fasta/string_counts.csv
