In [None]:
pip install pandas cyvcf2


Collecting cyvcf2
  Downloading cyvcf2-0.30.28-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
Collecting coloredlogs (from cyvcf2)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting humanfriendly>=9.1 (from coloredlogs->cyvcf2)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: humanfriendly, coloredlogs, cyvcf2
Successfully installed coloredlogs-15.0.1 cyvcf2-0.30.28 humanfriendly-10.0


In [None]:
import pandas as pd
import numpy as np
from cyvcf2 import VCF

# Function to convert VCF to DataFrame
def vcf_to_dataframe(vcf_path):
    """Read a VCF file with cyvcf2 and return one DataFrame row per variant.

    Parameters
    ----------
    vcf_path : str
        Path to the VCF file; handed directly to ``cyvcf2.VCF``.

    Returns
    -------
    pandas.DataFrame
        Columns ``reference``, ``position``, ``id``, ``ref``, ``alt``
        (ALT alleles joined with ','), ``quality``, ``filter`` and ``info``
        (a plain ``dict`` of the record's INFO key/value pairs). The columns
        are present even when the file contains no variants.
    """
    column_names = [
        'reference', 'position', 'id', 'ref', 'alt',
        'quality', 'filter', 'info',
    ]

    vcf_reader = VCF(vcf_path)
    try:
        records = [
            {
                'reference': variant.CHROM,
                'position': variant.POS,
                'id': variant.ID,
                'ref': variant.REF,
                'alt': ','.join(variant.ALT),
                'quality': variant.QUAL,
                'filter': variant.FILTER,
                # Copy INFO into a plain dict: the raw cyvcf2 INFO object only
                # shows up as an opaque "<cyvcf2.cyvcf2.INFO object at ...>"
                # when the frame is printed or exported.
                'info': dict(variant.INFO),
            }
            for variant in vcf_reader
        ]
    finally:
        # Release the underlying file handle even if iteration fails.
        vcf_reader.close()

    return pd.DataFrame(records, columns=column_names)

# Example usage: parse the VCF stored in the Colab workspace
vcf_path = '/content/variants.vcf'
df = vcf_to_dataframe(vcf_path)

# Show the first five variants as a quick sanity check
print(df.head(5))

                         reference  position    id ref alt    quality filter  \
0  concated_reference_all1815genes       102  None   A   C  10.985200   None   
1  concated_reference_all1815genes       158  None   G   A  18.682699   None   
2  concated_reference_all1815genes       163  None   G   A  14.893800   None   
3  concated_reference_all1815genes       255  None   C   T  57.000000   None   
4  concated_reference_all1815genes       269  None   C   T  52.000000   None   

                                            info  
0  <cyvcf2.cyvcf2.INFO object at 0x7b44c9e1de60>  
1  <cyvcf2.cyvcf2.INFO object at 0x7b44c9e1f960>  
2  <cyvcf2.cyvcf2.INFO object at 0x7b44c9e1e4c0>  
3  <cyvcf2.cyvcf2.INFO object at 0x7b44c9e1d950>  
4  <cyvcf2.cyvcf2.INFO object at 0x7b44c9e1f9c0>  


In [None]:
def label_samples_from_vcf(vcf_path):
    """Assign each VCF data line the 1-based index of its "best" sample.

    For every non-header line, the sample columns (10th column onward) are
    scanned. Each sample is split on ':'; the first field is the genotype and
    the second field is read as a comma-separated list of integers. The sample
    whose smallest strictly-positive integer is lowest wins; on ties the
    earliest sample is kept.

    Samples are ignored when their genotype is missing ('./.', '.|.' or '.'),
    when they carry no second field at all, or when that field holds no
    positive integer.

    Parameters
    ----------
    vcf_path : str
        Path to a plain-text, tab-separated VCF file.

    Returns
    -------
    list of int
        One label per non-header line: the winning sample's 1-based index,
        or -1 when no sample on that line qualified.
    """
    missing_genotypes = {'./.', '.|.', '.'}
    labels = []

    with open(vcf_path, 'r') as file:
        for line in file:
            if line.startswith('#'):
                continue

            columns = line.strip().split('\t')

            # -1 is the default label when every sample is missing or zero
            best_label = -1
            lowest_number = float('inf')

            for index, sample in enumerate(columns[9:], start=1):
                parts = sample.split(':')

                if parts[0] in missing_genotypes:  # Skip missing data samples
                    continue
                if len(parts) < 2:
                    # Genotype-only sample (e.g. FORMAT is just "GT"): there
                    # are no numbers to compare, so it cannot be the best one.
                    continue

                valid_numbers = [
                    int(num)
                    for num in parts[1].split(',')
                    if num.isdigit() and int(num) > 0
                ]
                if not valid_numbers:
                    continue

                min_number = min(valid_numbers)
                # Strict '<' keeps the earliest sample when values tie
                if min_number < lowest_number:
                    lowest_number = min_number
                    best_label = index

            labels.append(best_label)

    return labels

labels = label_samples_from_vcf(vcf_path)
df['Best_Sample_Label'] = labels

# Now 'df' has an additional column 'Best_Sample_Label' with the labels
print(df.head())
# There is one label per variant, so print only a short preview plus the
# total count instead of dumping the whole list into the cell output
print(labels[:20], f'... ({len(labels)} labels in total)')

                         reference  position    id ref alt    quality filter  \
0  concated_reference_all1815genes       102  None   A   C  10.985200   None   
1  concated_reference_all1815genes       158  None   G   A  18.682699   None   
2  concated_reference_all1815genes       163  None   G   A  14.893800   None   
3  concated_reference_all1815genes       255  None   C   T  57.000000   None   
4  concated_reference_all1815genes       269  None   C   T  52.000000   None   

                                            info  Best_Sample_Label  
0  <cyvcf2.cyvcf2.INFO object at 0x7b44c9e1de60>                  5  
1  <cyvcf2.cyvcf2.INFO object at 0x7b44c9e1f960>                  5  
2  <cyvcf2.cyvcf2.INFO object at 0x7b44c9e1e4c0>                  5  
3  <cyvcf2.cyvcf2.INFO object at 0x7b44c9e1d950>                  8  
4  <cyvcf2.cyvcf2.INFO object at 0x7b44c9e1f9c0>                  8  
[5, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 1, 1, 5, 2, 6, 6, 4, 3, 4,

In [None]:
def _parse_int_or_none(token):
    """Return ``int(token)``, or None if token is None or not an integer (e.g. '.')."""
    if token is None:
        return None
    try:
        return int(token)
    except ValueError:
        return None


def process_vcf(vcf_path):
    """Flatten the per-sample columns of a VCF file into a wide DataFrame.

    Every non-header line becomes one row. For the i-th sample column
    (1-based, starting at the file's 10th column) four columns are produced:

    * ``Genotype_i``  - the first ':'-separated field, mapped to 0/1/2
      ('0/0' -> 0, '0/1' and '1/0' -> 1, '1/1' -> 2; the phased '|' spellings
      map the same way). Any other genotype string becomes NaN.
    * ``FirstNum_i``, ``SecondNum_i``, ``ThirdNum_i`` - the first three
      comma-separated integers of the second ':'-separated field. Values
      that are absent or not integers (such as '.') are left as missing.

    Parameters
    ----------
    vcf_path : str
        Path to a plain-text, tab-separated VCF file.

    Returns
    -------
    pandas.DataFrame
        One row per non-header line, four columns per sample.
    """
    genotype_mapping = {
        '0/0': 0, '0/1': 1, '1/0': 1, '1/1': 2,
        # Phased genotypes carry the same allele counts as unphased ones
        '0|0': 0, '0|1': 1, '1|0': 1, '1|1': 2,
    }

    # Collect one dictionary per data line
    rows = []

    with open(vcf_path, 'r') as file:
        for line in file:
            if line.startswith('#'):
                continue  # Skip header lines

            columns = line.strip().split('\t')
            row_dict = {}

            # Samples start at the 10th column
            for i, sample in enumerate(columns[9:], start=1):
                parts = sample.split(':')
                numbers = parts[1].split(',') if len(parts) > 1 else []

                # Pad so that three values can always be read
                numbers += [None] * (3 - len(numbers))

                row_dict[f'Genotype_{i}'] = parts[0]
                # Tolerant parsing: a '.' placeholder must not abort the run
                row_dict[f'FirstNum_{i}'] = _parse_int_or_none(numbers[0])
                row_dict[f'SecondNum_{i}'] = _parse_int_or_none(numbers[1])
                row_dict[f'ThirdNum_{i}'] = _parse_int_or_none(numbers[2])

            rows.append(row_dict)

    # Create a DataFrame from the list of dictionaries
    new_df = pd.DataFrame(rows)

    # Convert genotype strings to numerical categories.
    # Comment this loop out to keep the genotypes as strings.
    for col in new_df.filter(like='Genotype_').columns:
        new_df[col] = new_df[col].map(genotype_mapping)

    return new_df

# Usage
# Build the per-sample table and, in the same expression, replace the NaN
# values (resulting from empty lists or lists with only None) with 0
new_df = process_vcf(vcf_path).fillna(0)

# `labels` must already exist and have one entry per row of `new_df`
new_df['True_Labels'] = labels

print(new_df.head())

   Genotype_1  FirstNum_1  SecondNum_1  ThirdNum_1  Genotype_2  FirstNum_2  \
0         0.0           0            0           0         0.0           0   
1         0.0           0            0           0         0.0           0   
2         0.0           0            0           0         0.0           0   
3         0.0           0            0           0         0.0           0   
4         0.0           0            0           0         0.0           0   

   SecondNum_2  ThirdNum_2  Genotype_3  FirstNum_3  ...  ThirdNum_6  \
0            0           0         0.0           0  ...           0   
1            0           0         0.0           0  ...           0   
2            0           0         0.0           0  ...           0   
3            0           0         0.0           0  ...           0   
4            0           0         0.0           0  ...          46   

   Genotype_7  FirstNum_7  SecondNum_7  ThirdNum_7  Genotype_8  FirstNum_8  \
0         2.0           9 

In [None]:
from google.colab import files

# Write the DataFrame out as a CSV file, leaving out the index column
new_df.to_csv(path_or_buf='new_df.csv', index=False)

# Send the CSV file to the local machine through the Colab download helper
files.download(filename='new_df.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>