# 1. Mount gDrive that has the folder containing:
- **Genomic data:** folder containing the 120 genome fasta files
- **Phenotypic data:** Metadata.tsv file containing information about biofilm-formation capability of each isolate
(0 --> non-biofilm forming, 1 --> biofilm-forming)

In [None]:
# Mount Google Drive at /gdrive so the later cells can read the project data from it
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


# 2. Reading the kmers generated from the genes of interest and reading the isolate metadata.

- The kmers generated from the genes of interest are stored in the *all_gene_kmers.txt* file (Column 1: 31-mer, Column 2: non-zero frequency).

- The isolate IDs and labels can be extracted from the *Metadata.tsv* file (Column 1: Isolate ID, Column 2: Isolate label).

The kmers will be used as column names in the kmer presence/absence matrix.
The isolate metadata will be needed to label the rows of the matrix and to add the 'Label' column at the end of the matrix.


In [None]:
# Change the notebook's working directory to the mount point of the drive
%cd /gdrive


/gdrive


In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
import os
import numpy as np
import pandas as pd

# Change working directory to the drive that was mounted
%cd /gdrive

# Get the path of the folder containing the Genomic and Phenotypic data
directory_path = os.path.join(os.getcwd(), "MyDrive/ML Proj data/RF")
folder_content = os.listdir(directory_path)

kmers = [] # Will store the kmer names to be used as column names when building the matrix
labels = {} # Key: Isolate ID, Value: Isolate label (kept as the string read from the file)

# The file name and the open file handle use separate variables on purpose: reusing one
# ...name for both would make the second file-name test run against a file object
# ...instead of the name.
for file_name in folder_content:
  file_path = os.path.join(directory_path, file_name)

  # Storing isolate metadata from the Metadata.tsv file
  if 'Metadata.tsv' in file_name:
    # 'with' guarantees the file is closed even if a row turns out to be malformed
    with open(file_path, 'r') as metadata_file:
      for line_number, line in enumerate(metadata_file, start=1):
        fields = line.rstrip().split('\t')

        # Blank lines (e.g. a trailing newline at the end of the file) carry no data
        if fields == ['']:
          continue

        # A row without a label column cannot be used; fail loudly and say where
        if len(fields) < 2:
          raise ValueError(f'{file_name}, line {line_number}: expected "<isolate ID><TAB><label>", got {line!r}')

        labels[fields[0]] = fields[1]

  # Storing kmers from all_gene_kmers.txt file
  elif 'all_gene_kmers.txt' in file_name:
    with open(file_path, 'r') as kmer_file:
      # Iterating the file directly streams it line by line instead of loading it whole
      for line in kmer_file:
        fields = line.split()

        # Column 1 is the kmer; whitespace-only lines produce no fields and are skipped
        if fields:
          kmers.append(fields[0])

# To check that the isolate metadata was correctly stored
print('Isolate metadata: ', labels)

# To check that all the kmers were read
print('Number of kmers generated by genes of interest (expected output: 22,435):', len(kmers))

/gdrive
Isolate metadata:  {'CA_001': '1', 'CA_002': '1', 'CA_003': '0', 'CA_005': '1', 'CA_007': '0', 'CA_008': '1', 'CA_009': '1', 'CA_010': '1', 'CA_011': '1', 'CA_012': '1', 'CA_013': '1', 'CA_014': '1', 'CA_015': '1', 'CA_016': '1', 'CA_017': '1', 'CA_018': '1', 'CA_019': '1', 'CA_020': '1', 'CA_021': '1', 'CA_022': '0', 'CA_023': '1', 'CA_024': '0', 'CA_025': '0', 'CA_026': '1', 'CA_027': '1', 'CA_028': '0', 'CA_029': '1', 'CA_030': '1', 'CA_031': '1', 'CA_032': '0', 'CA_033': '1', 'CA_034': '0', 'CA_035': '0', 'CA_036': '1', 'CA_038': '1', 'CA_039': '0', 'CA_040': '1', 'CA_041': '1', 'CA_042': '0', 'CA_043': '1', 'CA_044': '0', 'CA_045': '1', 'CA_046': '0', 'CA_047': '0', 'CA_048': '1', 'CA_049': '0', 'CA_050': '0', 'CA_052': '1', 'CA_053': '1', 'CA_054': '1', 'CA_055': '1', 'CA_057': '1', 'CA_058': '1', 'CA_059': '1', 'CA_060': '1', 'CA_062': '0', 'CA_063': '0', 'CA_064': '1', 'CA_066': '0', 'CA_068': '0', 'CA_070': '1', 'CA_074': '0', 'CA_076': '1', 'CA_078': '1', 'CA_086': '1

# 3. Building Kmer presence/absence dataframe out of the 22,435 kmers of the genes of interest and 120 genomes to be investigated.

- The kmers will form the matrix columns
- The isolates will form the matrix rows
- The cells of the matrix will be filled with 0s and 1s based on the absence/presence of a certain kmer in a certain isolate genome, respectively.

In [None]:
# Builds the kmer presence/absence dataframe `df`:
#   rows    -> isolates (one per '.txt' file in the Isolates folder, in sorted order)
#   columns -> the kmers of interest (in the order of the `kmers` list) + a final 'Label' column
#   cells   -> 1 if the kmer was found in the isolate's file, 0 otherwise
# Relies on `kmers` (list of kmer strings) and `labels` (dict: isolate ID -> label)
# ...created by the previous cell.

# Getting path containing the genome fasta files of the 120 isolates in order to search the genome fasta files of each
#... isolate for the kmers of interest.
directory_path = os.path.join(os.getcwd(), "MyDrive/ML Proj data/Isolates")
folder_content = os.listdir(directory_path)

# Sorts the filenames stored in the folder_content list so that the isolates are added to the matrix in numerical order
folder_content=sorted(folder_content)
print('Directory content:', folder_content)

# A set gives constant-time membership tests while scanning the genome files
kmers_of_interest = set(kmers)

isolate_names = [] # Row names of the matrix, in the order the files are read
rows = []          # One list of 0/1 values per isolate, aligned with the `kmers` list

num=0
print('Number\t','Isolate ID\t','Number of kmers found in isolate')

for file in folder_content:

  # Only '<isolate ID>.txt' files are genome files; anything else in the folder is skipped
  # ...so that it cannot end up as a bogus row in the matrix
  isolate_name, extension = os.path.splitext(file)
  if extension != '.txt':
    continue
  num+=1

  # Each loop reads 1 genome fasta file and collects only the kmers from the genes of interest
  # ...that were found in it
  found_kmers = set()
  file_path = os.path.join(directory_path, file)

  # 'with' closes each genome file as soon as it has been read
  with open(file_path, 'r') as txt:
    for line in txt:
      fields = line.split()

      # Column 1 is the kmer; blank lines produce no fields and are skipped
      if fields and fields[0] in kmers_of_interest:
        found_kmers.add(fields[0])
  print(f'{num}' + '\t' + f'  {isolate_name}' + '\t\t\t' + f'{len(found_kmers)}')

  # Presence/absence row of the current isolate: 1 for every kmer found, 0 for all the others
  isolate_names.append(isolate_name)
  rows.append([1 if kmer in found_kmers else 0 for kmer in kmers])

# The dataframe is built once from all rows; growing it one row at a time would copy
# ...the whole matrix on every iteration. Cells are integers 0/1, so there are no nulls to fill.
df = pd.DataFrame(rows, index=isolate_names, columns=kmers)

# Adding the 'Label' column
# Each label is looked up by isolate ID rather than by position, so a Metadata.tsv whose
# ...rows are in a different order than the sorted files cannot mislabel an isolate
unlabelled = [name for name in isolate_names if name not in labels]
if unlabelled:
  raise KeyError(f'No label found in the isolate metadata for: {unlabelled}')
df['Label'] = [labels[name] for name in isolate_names]

# Checking that there are no null values
print('Are there any null values in the dataframe?', df.isnull().values.any())

# Checking the dataframe has the correct format, rownames, column names, labels and dimensions
print('Final Kmer presence/absence matrix:\n', df.head())
print('Kmer presence/absence dimensions: ', df.shape)

# Writing the kmer presence/absence matrix to files and downloading these files to save the matrix
from google.colab import files

%cd /content
df.to_csv('GeneKmerMatrix.csv', index=True)
df.to_csv('GeneKmerMatrix.tsv', sep='\t', index=True)

files.download('GeneKmerMatrix.csv')
files.download('GeneKmerMatrix.tsv')

Directory content: ['CA_001.txt', 'CA_002.txt', 'CA_003.txt', 'CA_005.txt', 'CA_007.txt', 'CA_008.txt', 'CA_009.txt', 'CA_010.txt', 'CA_011.txt', 'CA_012.txt', 'CA_013.txt', 'CA_014.txt', 'CA_015.txt', 'CA_016.txt', 'CA_017.txt', 'CA_018.txt', 'CA_019.txt', 'CA_020.txt', 'CA_021.txt', 'CA_022.txt', 'CA_023.txt', 'CA_024.txt', 'CA_025.txt', 'CA_026.txt', 'CA_027.txt', 'CA_028.txt', 'CA_029.txt', 'CA_030.txt', 'CA_031.txt', 'CA_032.txt', 'CA_033.txt', 'CA_034.txt', 'CA_035.txt', 'CA_036.txt', 'CA_038.txt', 'CA_039.txt', 'CA_040.txt', 'CA_041.txt', 'CA_042.txt', 'CA_043.txt', 'CA_044.txt', 'CA_045.txt', 'CA_046.txt', 'CA_047.txt', 'CA_048.txt', 'CA_049.txt', 'CA_050.txt', 'CA_052.txt', 'CA_053.txt', 'CA_054.txt', 'CA_055.txt', 'CA_057.txt', 'CA_058.txt', 'CA_059.txt', 'CA_060.txt', 'CA_062.txt', 'CA_063.txt', 'CA_064.txt', 'CA_066.txt', 'CA_068.txt', 'CA_070.txt', 'CA_074.txt', 'CA_076.txt', 'CA_078.txt', 'CA_086.txt', 'CA_088.txt', 'CA_089.txt', 'CA_090.txt', 'CA_091.txt', 'CA_092.txt', 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>