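This notebook parses FASTQ files, extracts the barcode from each read, and writes a file of barcode counts for each sample. It then merges those counts with the barcode lookup table and calculates enrichment values.

The number of rows in each barcode-count csv should be the number of unique barcodes (plus one row each for the 'No flank found' and 'BC has Ns' labels, if any reads fell into those categories).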


In [1]:
# Import packages

import numpy as np
import itertools
import math
import pandas as pd
import time
import Bio
from Bio import SeqIO
from Bio.Seq import Seq
import matplotlib
import matplotlib.pyplot as plt
from ast import literal_eval
import csv
import venn
from numpy import cov
from scipy.stats import spearmanr


In [2]:
# This information has to be input manually
# File names, barcode flank sequences

# Sequence searched for in each read when the library type is 'SS';
# the bases immediately after it are taken as the barcode.
# NOTE(review): parseAndExtractBC also references SDFlank and DSFlank for the
# 'SD' and 'DS' library types; neither is defined in this cell.
SSFlank = 'GGTGGTGACC'

# Reference barcodes as [barcode, mutation label] pairs.
# These are added to the lookup table before the count tables are merged in.
refBCs = [['ACAGATAATGACTGT', 'WT'], 
          ['GCAATCAAAGATCTG', 'WT'], 
          ['CCCTTTGGACGGCTG', 'WT'], 
          ['GACATTATGTTCAAA', 'K191A'], 
          ['GACCGCGGATACCAA', 'K191A'], 
          ['TAGCACTTCCCGCAC', 'K191A']]

# Used as the file name (plus '.csv') of the final combined table
experimentName = 'NP_11_27_selection_first'

# Input fastq files, one per sample
np112723Data = '/Volumes/NP_DFS_4TB/NPDFS01/NP-11-27-23_S3_L001_R1_001.fastq'
np112724Data = '/Volumes/NP_DFS_4TB/NPDFS01/NP-11-27-24_S4_L001_R1_001.fastq'
np112725Data = '/Volumes/NP_DFS_4TB/NPDFS01/NP-11-27-25_S5_L001_R1_001.fastq'

# Parallel lists: entry i of each list describes the same sample
# (fastq path, name used for columns and output files, library type).
# The enrichment cell uses NP_11_27_23 as the reference (pre-selection) column
# and NP_11_27_24 / NP_11_27_25 as replicates 1 and 2.
listOfFiles = [np112723Data, np112724Data, np112725Data]
listOfFileNames = ['NP_11_27_23', 'NP_11_27_24', 'NP_11_27_25']
listOfLibTypes = ['SS', 'SS', 'SS']

# A barcode needs at least this many reads in both compared columns
# before an enrichment value is calculated for it
MINIMUM_READS = 5



In [3]:
# Parse and extract barcode, make csv files for each one

def parseAndExtractBC(file, libtype, bcLength=15):
    """Extract one barcode (or a failure label) from every read in a fastq file.

    Parameters
    ----------
    file : path or handle accepted by SeqIO.parse
        The fastq file to read.
    libtype : str
        Library type; selects which flank sequence to search for
        ('SS' -> SSFlank, 'SD' -> SDFlank, 'DS' -> DSFlank).
    bcLength : int, optional
        Number of bases taken immediately after the flank (default 15).

    Returns
    -------
    list of str
        One entry per read, in file order: the barcode, 'No flank found' if the
        flank is absent from the read, or 'BC has Ns' if the barcode contains 'N'.

    Raises
    ------
    ValueError
        If libtype is not a recognised library type or its flank is empty.
        Previously an unknown libtype left the flank as '', which matches at
        position 0 of every read, so the first bases of each read were silently
        reported as the barcode.
    """
    # NOTE(review): SDFlank and DSFlank are not defined in the visible
    # configuration cell; they must be defined before 'SD'/'DS' can be used.
    if libtype == 'SS':
        BCFlank = SSFlank
    elif libtype == 'SD':
        BCFlank = SDFlank
    elif libtype == 'DS':
        BCFlank = DSFlank
    else:
        raise ValueError("Unknown library type: " + repr(libtype) +
                         " (expected 'SS', 'SD' or 'DS')")
    if not BCFlank:
        raise ValueError("Flank sequence for library type " + repr(libtype) + " is empty")

    flankLength = len(BCFlank)
    BClist = []

    for readCount, rec in enumerate(SeqIO.parse(file, "fastq")):
        # Progress indicator for very large files
        if readCount % 10000000 == 0:
            print(readCount)
        readSeq = str(rec.seq)
        BCloc = readSeq.find(BCFlank)
        if BCloc == -1:
            BClist.append('No flank found')
            continue
        BCstart = BCloc + flankLength
        # If the read ends less than bcLength bases after the flank, the slice
        # is shorter than bcLength and is still recorded as-is.
        BC = readSeq[BCstart:BCstart + bcLength]
        if 'N' in BC:
            BClist.append('BC has Ns')
        else:
            BClist.append(BC)
    return BClist

# The per-read barcode lists are large, so they are written to the external drive
for i, file in enumerate(listOfFiles):
    sampleName = listOfFileNames[i]
    print(sampleName)
    extractedBCs = parseAndExtractBC(file, listOfLibTypes[i])
    # One column, named after the sample, with one row per read
    dfBClist = pd.DataFrame(extractedBCs, columns = [sampleName])
    dfBClist.to_csv('/Volumes/NP_DFS_4TB/NPDFS01/' + sampleName + '_BClist.csv')
    
    

NP_11_27_23


FileNotFoundError: [Errno 2] No such file or directory: '/Volumes/NP_DFS_4TB/NPDFS01/NP-11-27-23_S3_L001_R1_001.fastq'

In [None]:
# Collapse down to smaller list with reads per barcode measured

for i, file in enumerate(listOfFiles):
    sampleName = listOfFileNames[i]
    dfBCs = pd.read_csv('/Volumes/NP_DFS_4TB/NPDFS01/' + sampleName + '_BClist.csv')
    # value_counts() puts the barcodes in the index and the counts in the values.
    # Name both explicitly so the csv has the same header on every pandas version:
    # first column 'Barcode', second column named after the sample.
    dfBCCounts = dfBCs[sampleName].value_counts().rename(sampleName).rename_axis('Barcode')
    # The index must be written: it holds the barcodes. Writing with
    # index = False dropped them and left only the counts in the file.
    dfBCCounts.to_csv(sampleName + '_BCCounts.csv', index = True, header = True)

In [8]:
# This is the PacBio data that made it through all of the filters
# 'correctedAAmut' might have a different name eventually
lookupTable = pd.read_csv('NP_11_21_1_correctedLookupTable.csv', usecols = ['correctedAAmut', 'Barcode'])

# Add in WT and K191A barcodes (specified above)
# Each refBCs row is [barcode, mutation], so the columns are named explicitly.
# read_csv keeps the file's column order regardless of usecols order, so reusing
# lookupTable.columns could swap the two labels.
dfRefs = pd.DataFrame(refBCs, columns = ['Barcode', 'correctedAAmut'])
# DataFrame.append was removed in pandas 2.0; concat is the replacement
lookupTable = pd.concat([lookupTable, dfRefs], ignore_index = True)

combinedDF = lookupTable

# Add one count column per sample, keeping barcodes that appear on either side
for i, experiment in enumerate(listOfFileNames):
    dfData = pd.read_csv(experiment + '_BCCounts.csv', index_col = 0)
    dfData.index.name = 'Barcode'
    combinedDF = combinedDF.merge(dfData, on = 'Barcode', how = 'outer')
combinedDF = combinedDF.convert_dtypes()

# This is the remaining pacbiodata
# Need to collapse it to uniques first
# (drop_duplicates removes only rows where both barcode and count are identical)
PacBioBCTable = (
    pd.read_csv('PacBio_BCCounts.csv', usecols = ['Barcode_sequence', 'BCfrequency'])
    .rename(columns = {'BCfrequency' : 'Raw_PacBio_counts', 'Barcode_sequence' : 'Barcode'})
    .drop_duplicates()
)
combinedDF = combinedDF.merge(PacBioBCTable, how = 'outer', on = 'Barcode').convert_dtypes()

# Split a (corrected) mutation label into original AA, mutant AA and position
def parseMut(row):
    """Return (original AA, mutant AA, position) for the row's 'correctedAAmut'.

    A missing label gives three NaNs and the label 'WT' gives three 'WT' strings.
    Any other label is split as: first character, last character, and the
    characters in between converted to int.
    """
    mutation = row['correctedAAmut']
    if pd.isnull(mutation):
        return (np.nan,) * 3
    if mutation == 'WT':
        return ('WT',) * 3
    wildtypeResidue = mutation[:1]
    newResidue = str(mutation[-1:])
    position = int(mutation[1:-1])
    return wildtypeResidue, newResidue, position

# Parse every row's mutation label, then spread the resulting tuples over three columns
parsedMuts = combinedDF.apply(parseMut, axis=1)
combinedDF[['OriginalAA','MutAA', 'AAPosition']] = pd.DataFrame(parsedMuts.tolist(),
                                                                index= combinedDF.index)

# Load reference data and attach it by mutation label
# (outer merge: rows found on only one side are kept)
refData = pd.read_csv('biochemData060821.csv')
combinedDF = combinedDF.merge(refData, left_on = 'correctedAAmut', right_on = 'Mutation',
                              how = 'outer').convert_dtypes()



In [9]:
# Find enrichment values, do this in the simplest way possible
# Don't use PacBio for this
# Exclude anything with fewer than 5 reads before or after selection for now
# Normalize to WT

def calcEnrich(row, column, refColumn):
    """Return row[column] / row[refColumn], or NaN when the data are insufficient.

    NaN is returned if either value is missing or if either value is below
    MINIMUM_READS.
    """
    selectedReads = row[column]
    if pd.isnull(selectedReads):
        return np.nan
    referenceReads = row[refColumn]
    if pd.isnull(referenceReads):
        return np.nan
    if (selectedReads >= MINIMUM_READS) and referenceReads >= MINIMUM_READS:
        return selectedReads/referenceReads
    return np.nan

# New column name -> count column it is calculated from; NP_11_27_23 is the reference for both
rawEnrichmentSources = {'Rep1RawEnrichment': 'NP_11_27_24',
                        'Rep2RawEnrichment': 'NP_11_27_25'}
for enrichCol, selectedCol in rawEnrichmentSources.items():
    combinedDF[enrichCol] = combinedDF.apply(calcEnrich, axis=1, args=(selectedCol, 'NP_11_27_23'))

print('Raw enrichment calculated')

# Mean read count of the WT rows in each sample column; barcode reads are divided by this.
# Further normalization to the total number of reads doesn't seem necessary or helpful

sampleColumns = ['NP_11_27_23', 'NP_11_27_24', 'NP_11_27_25']
isWT = combinedDF['correctedAAmut'] == 'WT'
WTmeanCounts = combinedDF[isWT][sampleColumns].mean()

def calcEnrichWTnorm(row, postSel, preSel, WTinPost):
    """Return the WT-normalised enrichment of one barcode, or NaN.

    The value is (row[postSel] / WTinPost) / (row[preSel] / WT mean of preSel),
    where the WT mean of the preSel column is looked up in WTmeanCounts.

    Parameters
    ----------
    row : mapping / Series
        Must provide the postSel and preSel entries.
    postSel, preSel : str
        Names of the post- and pre-selection count columns.
    WTinPost : number
        Mean WT read count in the postSel column.

    NaN is returned if either count is missing or below MINIMUM_READS.
    """
    postReads = row[postSel]
    preReads = row[preSel]
    if pd.isnull(postReads) or pd.isnull(preReads):
        return np.nan
    if postReads < MINIMUM_READS or preReads < MINIMUM_READS:
        return np.nan
    # Look the pre-selection WT mean up by column name. The previous
    # WTmeanCounts[0] was a positional lookup on a label-indexed Series
    # (deprecated in pandas 2.x) and always used the first column whatever
    # preSel was.
    WTinPre = WTmeanCounts[preSel]
    return (postReads/WTinPost)/(preReads/WTinPre)

# WT means are looked up by column name; integer keys on this label-indexed
# Series are positional lookups, which pandas 2.x deprecates.
WTmeanRep1 = WTmeanCounts['NP_11_27_24']
WTmeanRep2 = WTmeanCounts['NP_11_27_25']

combinedDF['WT_norm_enrich_rep1'] = combinedDF.apply(
    lambda row: calcEnrichWTnorm(row, 'NP_11_27_24', 'NP_11_27_23', WTmeanRep1), axis=1)
combinedDF['WT_norm_enrich_rep2'] = combinedDF.apply(
    lambda row: calcEnrichWTnorm(row, 'NP_11_27_25', 'NP_11_27_23', WTmeanRep2), axis=1)

print('WT norm enrichment calculated')


# Only handles two replicates; extend the column list if more are added
def averageEnrichCalc(row):
    """Return the mean of the row's two WT-normalised replicate enrichments."""
    replicateColumns = ('WT_norm_enrich_rep1', 'WT_norm_enrich_rep2')
    replicateValues = [row[col] for col in replicateColumns]
    return np.mean(replicateValues)

# Row-wise mean of the two replicate enrichments
combinedDF['WT_norm_enrich_avg'] = combinedDF.apply(averageEnrichCalc, axis=1)

print('Average enrichment calculated')


# Add columns for means and std. dev.s
# The mean is taken over all barcodes of a mutation, using each barcode's replicate-averaged enrichment

means = (combinedDF.groupby('correctedAAmut')['WT_norm_enrich_avg']
         .mean()
         .to_frame(name = 'mut_BCs_mean'))
combinedDF = combinedDF.merge(means, on = 'correctedAAmut', how = 'outer')

stdDevs = (combinedDF.groupby('correctedAAmut')['WT_norm_enrich_avg']
           .std()
           .to_frame(name = 'mut_BCs_stdDev'))
combinedDF = combinedDF.merge(stdDevs, on = 'correctedAAmut', how = 'outer')

print('Barcode average enrichments calculated')


# Count the distinct barcodes observed for each mutant

BCsPerMut = (combinedDF.groupby('correctedAAmut')['Barcode']
             .nunique()
             .to_frame(name = 'Number_of_BCs_for_mut'))
combinedDF = combinedDF.merge(BCsPerMut, on = 'correctedAAmut', how = 'outer').convert_dtypes()

print('Barcodes per mut calculated')


# Do some statistics

# Calculate the "standard score" from the Kosuri GPCR paper
# "Note the standard score here is (x−μ)/σ where x is the forskolin ratio of that barcode
# μ is the mean forskolin ratio of the mutant that barcode corresponds to, and σ is the 
# standard deviation of the forskolin ratio of the mutant"
# Coefficient of variation is just the barcode average divided by the barcode stdDev
#  We need to be careful to only calculate stats for barcodes with enough data
#  I don't include barcodes with null values for WT_norm_enrich_avg

def calcStdScore(row):
    """Return the standard score (x - mu) / sigma for one barcode, or NaN.

    x is the row's 'WT_norm_enrich_avg', mu its 'mut_BCs_mean' and sigma its
    'mut_BCs_stdDev'. NaN is returned when any of the three is missing
    (NaN, None or pd.NA) or when sigma is 0.
    """
    x = row['WT_norm_enrich_avg']
    mu = row['mut_BCs_mean']
    sig = row['mut_BCs_stdDev']
    # Check for missing values before comparing: columns that went through
    # convert_dtypes() can hold pd.NA, and `pd.NA != 0` cannot be used as a
    # condition (it raises TypeError).
    if pd.isnull(x) or pd.isnull(mu) or pd.isnull(sig):
        return np.nan
    if sig == 0:
        return np.nan
    return (x - mu)/sig

def calcCoeffVar(row):
    """Return mu / sigma for the row's mutation, or NaN.

    mu is 'mut_BCs_mean' and sigma is 'mut_BCs_stdDev'. NaN is returned when
    the row's own 'WT_norm_enrich_avg' is missing, when mu or sigma is missing
    (NaN, None or pd.NA), or when sigma is 0.

    NOTE(review): the coefficient of variation is conventionally sigma / mu;
    this function keeps the mean-over-stdDev ratio described in the notebook's
    comments. Confirm which is intended.
    """
    mu = row['mut_BCs_mean']
    sig = row['mut_BCs_stdDev']
    x = row['WT_norm_enrich_avg']
    # pd.isnull handles pd.NA as well as NaN; math.isnan(pd.NA) and
    # `pd.NA != 0` both raise TypeError on columns that went through
    # convert_dtypes().
    if pd.isnull(x) or pd.isnull(mu) or pd.isnull(sig):
        return np.nan
    if sig == 0:
        return np.nan
    return mu/sig

# Per-barcode statistics, calculated row by row
combinedDF['Standard_score'] = combinedDF.apply(calcStdScore, axis=1)
combinedDF['Coefficient_of_variation'] = combinedDF.apply(calcCoeffVar, axis=1)
# convert_dtypes() returns a new frame; the result has to be assigned,
# otherwise the call has no effect
combinedDF = combinedDF.convert_dtypes()

print('Stats calculated')



Raw enrichment calculated
WT norm enrichment calculated
Average enrichment calculated
Barcode average enrichments calculated
Barcodes per mut calculated
Stats calculated


In [10]:
# Show the column names of the assembled table as a quick check
combinedDF.columns

Index(['Barcode', 'correctedAAmut', 'NP_11_27_23', 'NP_11_27_24',
       'NP_11_27_25', 'Raw_PacBio_counts', 'OriginalAA', 'MutAA', 'AAPosition',
       'Mutation', 'vcMUT/vcWT', 'Error (fraction of fraction)', 'Sc/o',
       'Sc/o error (+/-)', 'citation', 'Rep1RawEnrichment',
       'Rep2RawEnrichment', 'WT_norm_enrich_rep1', 'WT_norm_enrich_rep2',
       'WT_norm_enrich_avg', 'mut_BCs_mean', 'mut_BCs_stdDev',
       'Number_of_BCs_for_mut', 'Standard_score', 'Coefficient_of_variation'],
      dtype='object')

In [11]:
# Write the final combined table to '<experimentName>.csv' in the working directory
outputCsvName = experimentName + '.csv'
combinedDF.to_csv(outputCsvName)