In [3]:
import pandas as pd

## Getting the remainder of accessions for downstream Analysis

In [1]:
import os


def compare_accessions(accessions_txt, csv_file):
    """Compares accessions from a text file with those in a CSV file and writes results to separate text files.

    Args:
        accessions_txt (str): Path to the accessions text file.
        csv_file (str): Path to the CSV file.
    """

    # Read accessions from the text file
    with open(accessions_txt, "r") as f:
        accessions_from_txt = [line.strip() for line in f]

    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file)

    # Get accessions from the DataFrame
    accessions_from_csv = df["Accession"].tolist()

    # Find common accessions
    accessions_in_both = set(accessions_from_txt).intersection(set(accessions_from_csv))

    # Find accessions only in the text file
    accessions_only_in_txt = set(accessions_from_txt).difference(
        set(accessions_from_csv)
    )

    # Find accessions only in the CSV file
    accessions_only_in_csv = set(accessions_from_csv).difference(
        set(accessions_from_txt)
    )

    # Write results to separate text files
    with open("accessions_in_both.txt", "w") as f:
        f.writelines(acc + "\n" for acc in accessions_in_both)

    with open("accessions_only_in_txt.txt", "w") as f:
        f.writelines(acc + "\n" for acc in accessions_only_in_txt)

    with open("accessions_only_in_csv.txt", "w") as f:
        f.writelines(acc + "\n" for acc in accessions_only_in_csv)


# Example usage:
accessions_txt_file = "online_accessions.txt"
csv_file = "whole_africa_1990_till_date.csv"

compare_accessions(accessions_txt_file, csv_file)

In [2]:
df = pd.read_csv("../data/processed/whole_africa_genome.csv")

In [3]:
df.head()

Unnamed: 0,#,Se ID,Patient Code,PAT id(SSAM),Accession,Name,Subtype,Country,Sampling Year,Georegion,HXB2/MAC239 start,HXB2/MAC239 stop,Sequence Length,Organism
0,1,114321,NJ97-42,24045.0,AB049811,97GH-AG1,02_AG,GHANA,1997.0,AFR SSA,1,9690,9748,HIV-1
1,2,208491,1116,3526.0,AB050905,99ZM1116,C,ZAMBIA,1999.0,AFR SSA,7023,7307,279,HIV-1
2,3,208490,12,3527.0,AB050906,99ZM12,C,ZAMBIA,1999.0,AFR SSA,7023,7308,279,HIV-1
3,4,208489,13,3528.0,AB050907,99ZM13,C,ZAMBIA,1999.0,AFR SSA,7023,7309,279,HIV-1
4,5,208488,18,3529.0,AB050908,99ZM18,C,ZAMBIA,1999.0,AFR SSA,7023,7307,279,HIV-1


## Splitters and Validators

In [6]:
# Split some things into specific fragments
import pandas as pd


def split_csv_into_fragments(csv_file, output_dir, fragment_size=16000):
    """Splits a CSV file into fragments of a specified size.

    Args:
        csv_file (str): Path to the input CSV file.
        output_dir (str): Directory to save the output fragments.
        fragment_size (int): Size of each fragment (number of lines).
    """

    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file)

    # Extract the accessions column
    accessions = df["Accession"].tolist()

    # Calculate the number of fragments needed
    num_fragments = len(accessions) // fragment_size + 1

    # Split the accessions into fragments
    for i in range(num_fragments):
        start_index = i * fragment_size
        end_index = min((i + 1) * fragment_size, len(accessions))
        fragment = accessions[start_index:end_index]

        # Write the fragment to a text file
        output_file = f"fragment_{i+1}.txt"
        with open(os.path.join(output_dir, output_file), "w") as f:
            f.writelines(acc + "\n" for acc in fragment)


# Example usage:
csv_file = "whole_africa_1990_till_date.csv"
output_dir = "../data/processed/output_fragments"

split_csv_into_fragments(csv_file, output_dir)

In [1]:
import re


def validate_file(filename):
    """Validates a text file based on the given requirements.

    Args:
        filename (str): The path to the file to validate.

    Returns:
        bool: True if the file passes validation, False otherwise.
    """

    # Check file size
    file_size = os.path.getsize(filename)
    if file_size > 20 * 1024 * 1024:  # 20 MB in bytes
        print("Error: File size exceeds 20 MB.")
        return False

    # Check file format
    if not filename.endswith(".txt"):
        print("Error: File format must be .txt.")
        return False

    # Check character and symbol restrictions
    with open(filename, "r") as f:
        for line in f:
            if not re.match(r"^[A-Za-z0-9_]+$", line.strip()):
                print("Error: Invalid characters found in the file.")
                return False

    return True


# Example usage:
file_path = "../data/processed/output_fragments/fragment_13.txt"
if validate_file(file_path):
    print("File validation passed.")
else:
    print("File validation failed.")

File validation passed.


In [3]:
def split_text_file(input_file, output_dir, fragment_size=1000):
    """Splits a text file into fragments of a specified size.

    Args:
        input_file (str): Path to the input text file.
        output_dir (str): Directory to save the output fragments.
        fragment_size (int): Size of each fragment (number of lines).
    """

    with open(input_file, "r") as f:
        lines = f.readlines()

    num_fragments = len(lines) // fragment_size + 1

    for i in range(num_fragments):
        start_index = i * fragment_size
        end_index = min((i + 1) * fragment_size, len(lines))
        fragment = lines[start_index:end_index]

        output_file = f"newfragments_{i+1}.txt"
        with open(os.path.join(output_dir, output_file), "w") as f:
            f.writelines(fragment)


# Example usage:
text_file = "../data/processed/output_fragments/newfragment_1.txt"
output_dir = "../data/processed/output_fragments"

split_text_file(text_file, output_dir)

In [4]:
df = pd.read_csv("../data/metadata.csv")

  df = pd.read_csv('../data/metadata.csv')


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231227 entries, 0 to 231226
Data columns (total 47 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   #                          231227 non-null  int64  
 1   SE id(SA)                  231227 non-null  object 
 2   Accession                  231227 non-null  int64  
 3   GI number                  231227 non-null  object 
 4   Version                    231227 non-null  int64  
 5   Se ID                      231227 non-null  int64  
 6   Sequence Length            136566 non-null  object 
 7   GB comment                 231227 non-null  object 
 8   GB create date             231227 non-null  object 
 9   GB update date             231227 non-null  float64
 10  Percent non-ACGT           231227 non-null  int64  
 11  SE id(SSAM)                171830 non-null  float64
 12  PAT id(SSAM)               231226 non-null  object 
 13  Name                       23

In [7]:
import pandas as pd


def optimize_dataframe(df):
    # Downcast numeric types to the most efficient representation
    for col in df.select_dtypes(include=["float"]):
        df[col] = pd.to_numeric(df[col], downcast="float")

    for col in df.select_dtypes(include=["int"]):
        df[col] = pd.to_numeric(df[col], downcast="integer")

    # Convert object types to category if they have few unique values
    for col in df.select_dtypes(include=["object"]).columns:
        unique_values = df[col].nunique()
        total_values = len(df[col])
        if (
            unique_values / total_values < 0.5
        ):  # If more than 50% of values are repetitive
            df[col] = df[col].astype("category")

    return df


# Optimize DataFrame
df_optimized = optimize_dataframe(df)

# Check memory usage before and after optimization
print(f"Original memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(
    f"Optimized memory usage: {df_optimized.memory_usage(deep=True).sum() / 1024**2:.2f} MB"
)

Original memory usage: 109.89 MB
Optimized memory usage: 109.89 MB


In [8]:
df_optimized.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231227 entries, 0 to 231226
Data columns (total 47 columns):
 #   Column                     Non-Null Count   Dtype   
---  ------                     --------------   -----   
 0   #                          231227 non-null  int32   
 1   SE id(SA)                  231227 non-null  object  
 2   Accession                  231227 non-null  int64   
 3   GI number                  231227 non-null  object  
 4   Version                    231227 non-null  int32   
 5   Se ID                      231227 non-null  int16   
 6   Sequence Length            136566 non-null  category
 7   GB comment                 231227 non-null  category
 8   GB create date             231227 non-null  category
 9   GB update date             231227 non-null  float32 
 10  Percent non-ACGT           231227 non-null  int32   
 11  SE id(SSAM)                171830 non-null  float32 
 12  PAT id(SSAM)               231226 non-null  object  
 13  Name          

In [9]:
df_optimised.to_csv("../data/optimied_metadata.csv")

NameError: name 'df_optimised' is not defined