In [1]:
import os
import pandas as pd
import numpy as np

# Show complete frames in cell output: no column, row or cell-width truncation.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# None means "unlimited"; the former sentinel -1 is deprecated in pandas.
pd.set_option('display.max_colwidth', None)

In [288]:
# Sample identifiers; the commented line is an alternative set of identifiers.
#samples = ['20279217','20286419','20328673']
samples = ['20411178','20412199','20412203']

# Example inputs for a single sample (20353538): a per-position coverage table
# and a variant table, used below to try out the helper functions.
# NOTE(review): absolute, user-specific paths; the notebook only runs on a machine
# with this directory layout.
cov = '/home/laura/ANALYSIS/covidma/Stats/Coverage/20353538.cov'
tsv = '/home/laura/ANALYSIS/covidma/Variants/ivar_raw/20353538.tsv'

In [289]:
def import_tsv_variants(tsv_file,  min_total_depth=4, min_alt_dp=4, only_snp=True):
    """Load one variant table and keep the calls that pass the depth filters.

    Parameters
    ----------
    tsv_file : str
        Path to a tab-separated table with at least the columns REGION, POS,
        REF, ALT, TOTAL_DP, ALT_DP and ALT_FREQ.
    min_total_depth : int
        Minimum TOTAL_DP (inclusive) for a row to be kept.
    min_alt_dp : int
        Minimum ALT_DP (inclusive) for a row to be kept.
    only_snp : bool
        When True, rows whose ALT starts with '+' or '-' are removed.

    Returns
    -------
    pandas.DataFrame
        Columns REGION, POS, REF, ALT plus one column named after the sample
        (file name up to the first '.') holding ALT_FREQ.
    """
    # Sample name is everything before the first dot of the file name.
    sample = os.path.basename(tsv_file).partition(".")[0]

    variants = pd.read_csv(os.path.abspath(tsv_file), sep='\t')
    # Keep only the first row for each POS/REF/ALT combination.
    variants = variants.drop_duplicates(subset=['POS', 'REF', 'ALT'], keep="first")

    passes_depth = (variants.TOTAL_DP >= min_total_depth) & (variants.ALT_DP >= min_alt_dp)
    variants = variants[passes_depth]

    variants = variants[['REGION','POS', 'REF', 'ALT', 'ALT_FREQ']].rename(columns={'ALT_FREQ' : sample})

    if only_snp == True:
        starts_plus = variants.ALT.str.startswith('+')
        starts_minus = variants.ALT.str.startswith('-')
        variants = variants[~(starts_plus | starts_minus)]
    return variants

In [427]:
def extract_lowfreq(tsv_file,  min_total_depth=4, min_alt_dp=4, only_snp=True):
    """Return the variants of one table that fail the depth thresholds.

    This is the exact complement of the depth filter applied by
    ``import_tsv_variants``: a row is reported when TOTAL_DP is below
    ``min_total_depth`` or ALT_DP is below ``min_alt_dp``. A variant sitting
    exactly on a threshold is therefore accepted there and NOT flagged here.

    Parameters
    ----------
    tsv_file : str
        Path to a tab-separated table with at least the columns REGION, POS,
        REF, ALT, TOTAL_DP and ALT_DP.
    min_total_depth : int
        Rows with TOTAL_DP strictly below this value are reported.
    min_alt_dp : int
        Rows with ALT_DP strictly below this value are reported.
    only_snp : bool
        When true, rows whose ALT starts with '+' or '-' are removed.

    Returns
    -------
    pandas.DataFrame
        Columns REGION, POS, REF, ALT plus one column named after the sample
        (file name up to the first '.') filled with the marker '-'.
    """
    base_file = os.path.basename(tsv_file)
    input_file = os.path.abspath(tsv_file)
    sample = base_file.split(".")[0]

    df = pd.read_csv(input_file, sep='\t')
    df = df.drop_duplicates(subset=['POS', 'REF', 'ALT'], keep="first")

    # Strict '<' on both thresholds so this never overlaps the '>=' filter
    # used when importing the accepted variants.
    poorly_covered = (df.TOTAL_DP < min_total_depth) | (df.ALT_DP < min_alt_dp)

    # .copy() so the marker column is written to an independent frame.
    df = df.loc[poorly_covered, ['REGION', 'POS', 'REF', 'ALT']].copy()
    df[sample] = '-'

    if only_snp:
        df = df[~(df.ALT.str.startswith('+') | df.ALT.str.startswith('-'))]
    return df

In [421]:
def extract_uncovered(cov_file, min_total_depth=4):
    """Return the positions of one coverage table whose depth is exactly 0.

    Parameters
    ----------
    cov_file : str
        Path to a headerless, tab-separated table with three columns read as
        REGION, POS and depth.
    min_total_depth : int
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Columns REGION, POS and one column named after the sample (file name
        up to the first '.') filled with the marker '!'.
    """
    base_file = os.path.basename(cov_file)
    input_file = os.path.abspath(cov_file)
    sample = base_file.split(".")[0]

    df = pd.read_csv(input_file, sep="\t", header=None)
    df.columns = ['REGION', 'POS', sample]

    # .copy() so the marker is written to an independent frame.
    df = df[df[sample] == 0].copy()
    # Mark only the depth column; a frame-wide replace would also rewrite a
    # POS of 0 in zero-based coverage tables.
    df[sample] = '!'
    return df

In [444]:
# Try extract_uncovered on the example coverage table and preview the first
# zero-depth positions (marked '!').
dfc = extract_uncovered(cov)
dfc.head()

Unnamed: 0,REGION,POS,20353538
0,NC_045512.2,1,!
1,NC_045512.2,2,!
2,NC_045512.2,3,!
3,NC_045512.2,4,!
4,NC_045512.2,5,!


In [452]:
# Try extract_lowfreq on the example variant table; flagged variants carry '-'.
dfl = extract_lowfreq(tsv)
dfl

Unnamed: 0,REGION,POS,REF,ALT,20353538
0,NC_045512.2,4094,A,C,-
4,NC_045512.2,6354,C,A,-
6,NC_045512.2,6379,T,A,-
8,NC_045512.2,17343,T,A,-
10,NC_045512.2,24948,T,C,-
11,NC_045512.2,27024,A,T,-
12,NC_045512.2,27045,A,T,-
13,NC_045512.2,26144,G,T,-


In [446]:
# Try import_tsv_variants on the example variant table; the sample column
# holds the ALT_FREQ of the variants that pass the depth filters.
dfv = import_tsv_variants(tsv)
dfv.head()

Unnamed: 0,REGION,POS,REF,ALT,20353538
2,NC_045512.2,6286,C,T,1.0
9,NC_045512.2,23403,A,G,1.0


In [526]:
# Seed frame with only the variant key columns; every sample adds one column.
df = pd.DataFrame(columns=['REGION','POS', 'REF', 'ALT'])
# Merge all raw tables: the outer merge keeps every variant seen in any sample
# and leaves NaN where a sample does not carry it.
for root, _, files in os.walk('/home/laura/ANALYSIS/covidma/Variants/ivar_raw'):
    for name in files:
        if name.endswith('.tsv'):
            filename = os.path.join(root, name)
            dfv = import_tsv_variants(filename)
            df = df.merge(dfv, how='outer')
# Round frequencies
df = df.round(2)
# Sample columns start after the four key columns.
sample_columns = list(df.columns[4:])
if sample_columns:
    # Blank out frequencies <= 0.1 (NaN stays NaN). Vectorised replacement for
    # the element-wise applymap, which is deprecated in recent pandas.
    df[sample_columns] = df[sample_columns].where(df[sample_columns] > 0.1)
# Drop variants left without a frequency in any sample and renumber the rows;
# later cells rely on the 0..n-1 index.
df = df.dropna(subset=sample_columns, how='all').reset_index(drop=True)


In [527]:
# Display the merged variant matrix ordered by genome position (df itself is not reordered).
df.sort_values(by=['POS'])

Unnamed: 0,REGION,POS,REF,ALT,20091271,20353451,20353538,20179107,20089101,20277322,20069302,20143801
38,NC_045512.2,147,C,T,,,,,,,,0.43
0,NC_045512.2,241,C,T,1.0,,,,1.0,1.0,,
20,NC_045512.2,445,T,C,,,,,,1.0,,
21,NC_045512.2,973,T,G,,,,,,0.16,,
34,NC_045512.2,1344,T,C,,,,,,,0.77,
18,NC_045512.2,2939,C,T,,,,,1.0,,,
1,NC_045512.2,3037,C,T,1.0,,1.0,,1.0,1.0,,
8,NC_045512.2,6286,C,T,,1.0,1.0,,,1.0,,
2,NC_045512.2,6696,C,T,1.0,,,,,,,
35,NC_045512.2,8782,C,T,,,,,,,1.0,1.0


In [528]:
#Include poorly covered
# For every sample, mark with '-' the variants already in df that the sample
# reports below the depth thresholds (see extract_lowfreq).
key_columns = ['REGION', 'POS', 'REF', 'ALT']
for root, _, files in os.walk('/home/laura/ANALYSIS/covidma/Variants/ivar_raw'):
    for name in files:
        if name.endswith('.tsv'):
            filename = os.path.join(root, name)
            sample = name.split('.')[0]
            # Same guard as the coverage loop: skip files without a column in df.
            if sample not in df.columns[4:]:
                continue
            dfl = extract_lowfreq(filename)
            # Left merge keeps one row per df row, in df order, so the result
            # can be applied positionally.
            flags = df[key_columns].merge(dfl, on=key_columns, how='left')[sample]
            flagged = flags.notna().to_numpy()
            # Object dtype so the string marker can live next to the frequencies.
            df[sample] = df[sample].astype(object)
            # Explicit .loc assignment: df[sample].update(...) mutates an
            # intermediate Series and does nothing under Copy-on-Write.
            df.loc[flagged, sample] = flags[flagged].to_numpy()


In [529]:
# Display the matrix after the poorly-covered markers ('-') were added, ordered by position.
df.sort_values(by=['POS'])

Unnamed: 0,REGION,POS,REF,ALT,20091271,20353451,20353538,20179107,20089101,20277322,20069302,20143801
38,NC_045512.2,147,C,T,,,,,,,,0.43
0,NC_045512.2,241,C,T,1.0,,-,,1.0,1.0,,
20,NC_045512.2,445,T,C,,,,,,1.0,,
21,NC_045512.2,973,T,G,,,,,,0.16,,
34,NC_045512.2,1344,T,C,,,,,,,0.77,
18,NC_045512.2,2939,C,T,,,,,1.0,,,
1,NC_045512.2,3037,C,T,1.0,,-,,1.0,1.0,,
8,NC_045512.2,6286,C,T,,1,1,,,1.0,,
2,NC_045512.2,6696,C,T,1.0,,,,,,,
35,NC_045512.2,8782,C,T,,,,,,,1.0,1.0


In [533]:
#Include uncovered
# For every sample present in df, mark with '!' the variant positions where the
# sample has zero coverage (see extract_uncovered).
for root, _, files in os.walk('/home/laura/ANALYSIS/covidma/Stats/Coverage'):
    for name in files:
        if name.endswith('.cov'):
            filename = os.path.join(root, name)
            sample = name.split('.')[0]
            if sample in df.columns[4:]:
                dfc = extract_uncovered(filename)
                # Left merge keeps one row per df row, in df order, so the
                # result can be applied positionally.
                marks = df[['REGION', 'POS']].merge(dfc, on=['REGION', 'POS'], how='left')[sample]
                uncovered = marks.notna().to_numpy()
                # Object dtype so the string marker can live next to the frequencies.
                df[sample] = df[sample].astype(object)
                # Explicit .loc assignment: df[sample].update(...) mutates an
                # intermediate Series and does nothing under Copy-on-Write.
                df.loc[uncovered, sample] = marks[uncovered].to_numpy()
#Assign 0 to rest (Absent)
df = df.fillna(0)


In [534]:
# Display the completed matrix ordered by position: frequency, '-' (poorly covered),
# '!' (uncovered) or 0 (absent) per sample.
df.sort_values(by=['POS'])

Unnamed: 0,REGION,POS,REF,ALT,20091271,20353451,20353538,20179107,20089101,20277322,20069302,20143801
38,NC_045512.2,147,C,T,0.0,0,0,0.0,0.0,0.0,0.0,0.43
0,NC_045512.2,241,C,T,1.0,!,-,0.0,1.0,1.0,0.0,0.0
20,NC_045512.2,445,T,C,0.0,!,!,0.0,0.0,1.0,0.0,0.0
21,NC_045512.2,973,T,G,0.0,!,0,0.0,0.0,0.16,0.0,0.0
34,NC_045512.2,1344,T,C,0.0,0,!,0.0,0.0,0.0,0.77,0.0
18,NC_045512.2,2939,C,T,0.0,!,0,0.0,1.0,0.0,0.0,0.0
1,NC_045512.2,3037,C,T,1.0,!,-,0.0,1.0,1.0,0.0,0.0
8,NC_045512.2,6286,C,T,0.0,1,1,0.0,0.0,1.0,0.0,0.0
2,NC_045512.2,6696,C,T,1.0,!,0,0.0,0.0,0.0,0.0,0.0
35,NC_045512.2,8782,C,T,0.0,!,0,0.0,0.0,0.0,1.0,1.0


In [477]:
#Determine N (will help in poorly covered determination)
def estract_sample_count(row):
    """Count and name the samples that carry the variant of one matrix row.

    The first four entries of ``row`` are skipped (the key columns); every
    remaining entry is a sample. A sample counts as carrying the variant
    unless its value is '!', 0 or '0', so the '-' marker is counted.

    Parameters
    ----------
    row : pandas.Series
        One row of the variant matrix, indexed by column name.

    Returns
    -------
    tuple
        (number of carrying samples, their names joined with ',').
    """
    absent_marks = ['!', 0, '0']
    # Explicit bool dtype keeps the mask usable even when there are no samples.
    is_present = np.array([value not in absent_marks for value in row.iloc[4:]], dtype=bool)
    # Names come from the row itself rather than from a global frame, so the
    # function gives the same answer whatever `df` currently is.
    sample_names = np.array(row.index[4:])
    return (int(is_present.sum()), (',').join(sample_names[is_present]))

# Make the cell re-runnable: discard the summary columns of a previous run.
if 'N' in df.columns:
    df = df.drop(columns=['N', 'Samples'])
if 'Position' in df.columns:
    df = df.drop(columns='Position')

# Per variant: how many samples carry it and which ones.
df[['N', 'Samples']] = df.apply(estract_sample_count, axis=1, result_type='expand')

# Collapse the four key columns into one 'REGION|REF|POS|ALT' identifier.
df['Position'] = df.apply(
    lambda row: ('|').join((row['REGION'], row['REF'], str(row['POS']), row['ALT'])),
    axis=1,
)
df = df.drop(columns=['REGION', 'REF', 'POS', 'ALT'])

# Summary columns first, then the sample columns in their existing order.
df = df[['Position', 'N', 'Samples']
        + [name for name in df.columns if name not in ('Position', 'N', 'Samples')]]

In [516]:
# Display the matrix with the Position / N / Samples summary columns in front.
df

Unnamed: 0,Position,N,Samples,20091271,20353451,20353538,20179107,20089101,20277322,20069302,20143801
0,NC_045512.2|C|241|T,4,20091271203535382008910120277322,1.0,!,-,0.0,1.0,1.0,0.0,0.0
1,NC_045512.2|C|3037|T,4,20091271203535382008910120277322,1.0,!,-,0.0,1.0,1.0,0.0,0.0
2,NC_045512.2|C|6696|T,1,20091271,1.0,!,0,0.0,0.0,0.0,0.0,0.0
3,NC_045512.2|C|14408|T,5,2009127120353451203535382008910120277322,1.0,-,-,0.0,1.0,1.0,0.0,0.0
4,NC_045512.2|T|14565|A,1,20091271,0.66,!,0,0.0,0.0,0.0,0.0,0.0
5,NC_045512.2|A|20268|G,2,2009127120089101,1.0,!,!,0.0,1.0,0.0,0.0,0.0
6,NC_045512.2|G|20373|A,1,20091271,1.0,!,!,0.0,0.0,0.0,0.0,0.0
7,NC_045512.2|A|23403|G,4,20091271203535382008910120277322,1.0,!,1,0.0,1.0,1.0,0.0,0.0
8,NC_045512.2|C|6286|T,3,203534512035353820277322,0.0,1,1,0.0,0.0,1.0,0.0,0.0
9,NC_045512.2|T|13312|G,1,20353451,0.0,0.55,0,0.0,0.0,0.0,0.0,0.0


In [512]:
# Scratch copy of df in which the uncovered marker '!' is replaced by 0.
df0 = df.replace('!', 0)

In [513]:
#df0['valid'] = df0.apply(lambda x: sum([i != '-' and float(i) > 0.7 for i in x[3:]]), axis=1)

In [515]:
#df0.iloc[:,3:] = df0.apply(lambda x: handle_lowfreq(x, min_freq_include=0.8), axis=1)

In [522]:
def revised_df(df, min_freq_include=0.7, min_threshold_discard=0.7, remove_faulty=True, drop_samples=True, drop_positions=True):
    """Turn the annotated variant matrix into a binary presence/absence matrix.

    The input is expected to have the columns Position, N and Samples first,
    followed by one column per sample. Each sample cell holds a frequency,
    '!' (uncovered) or '-' (poorly covered).

    Steps:
      1. Optionally report (and drop) positions and samples whose fraction of
         uncovered ('!' or '-') plus heterozygous (anything that is neither
         uncovered nor 0/1) cells reaches ``min_threshold_discard``.
      2. Keep positions with at least one sample whose real frequency is above
         ``min_freq_include``.
      3. Treat '-' as present and '!' as absent, then binarise: 1 when the
         frequency is >= ``min_freq_include``, else 0.
      4. Recompute N as the row sum and drop rows with N == 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotated matrix as described above; it is not modified.
    min_freq_include : float
        Frequency threshold used for the validity check and the binarisation.
    min_threshold_discard : float
        Faulty fraction from which a position or sample is reported.
    remove_faulty : bool
        Run step 1 (the report is printed).
    drop_samples, drop_positions : bool
        Whether the reported samples / positions are actually removed.

    Returns
    -------
    pandas.DataFrame
        Position, N, Samples and the remaining sample columns holding 0/1.
        NOTE: the Samples column is carried over unchanged, so it can still
        name samples that were dropped as faulty.
    """
    uncovered_marks = ('!', '-')
    homozygous_marks = (0, 1, '0', '1')

    def _flag(frame, predicate):
        # Element-wise boolean mask of `frame` for a Python-level predicate.
        return frame.apply(lambda column: column.map(predicate)).astype(bool)

    def _to_frequency(value):
        # '-' counts as present, '!' as absent, everything else is numeric.
        if isinstance(value, str):
            if value == '-':
                return 1.0
            if value == '!':
                return 0.0
        return float(value)

    if remove_faulty:
        calls = df.iloc[:, 3:]
        uncovered = _flag(calls, lambda v: v in uncovered_marks)
        # A poorly covered cell is already counted as uncovered; excluding it
        # here keeps it from being counted twice in the faulty fraction.
        heterozygous = _flag(calls, lambda v: v not in uncovered_marks and v not in homozygous_marks)

        position_faulty = uncovered.mean(axis=1) + heterozygous.mean(axis=1)
        sample_faulty = uncovered.mean(axis=0) + heterozygous.mean(axis=0)

        faulty_positions = df.Position[position_faulty >= min_threshold_discard].tolist()
        faulty_samples = sample_faulty.index[(sample_faulty >= min_threshold_discard).to_numpy()].tolist()

        if drop_positions:
            df = df[~df.Position.isin(faulty_positions)]
        if drop_samples:
            df = df.drop(faulty_samples, axis=1)

        print('FAULTY POSITIONS:\n{}\n\nFAULTY SAMPLES:\n{}'.format(("\n").join(faulty_positions), ("\n").join(faulty_samples)))

    calls = df.iloc[:, 3:]
    poorly_covered = _flag(calls, lambda v: isinstance(v, str) and v == '-')
    frequencies = calls.apply(lambda column: column.map(_to_frequency)).astype(float)

    # A position is kept when at least one sample supports it with a real
    # (not '-') frequency above the threshold. The former '> 1' test also
    # discarded positions supported by exactly one sample.
    valid = (frequencies.gt(min_freq_include) & ~poorly_covered).sum(axis=1)
    keep = valid >= 1

    # Heterozygous calls below the threshold become 0.
    binary = frequencies.loc[keep].ge(min_freq_include).astype(int)

    # Rebuild the frame positionally so no filtered slice is written to.
    result = df.loc[keep].iloc[:, :3].copy()
    for name in binary.columns:
        result[name] = binary[name].to_numpy()
    result['N'] = binary.sum(axis=1).to_numpy()

    #Remove positions with 0 samples after htz
    return result[result['N'] > 0]

In [524]:
revised_df(df)

FAULTY POSITIONS:


FAULTY SAMPLES:
20353451


Unnamed: 0,Position,N,Samples,20091271,20353538,20179107,20089101,20277322,20069302,20143801
0,NC_045512.2|C|241|T,4,20091271203535382008910120277322,1,1,0,1,1,0,0
1,NC_045512.2|C|3037|T,4,20091271203535382008910120277322,1,1,0,1,1,0,0
2,NC_045512.2|C|6696|T,1,20091271,1,0,0,0,0,0,0
3,NC_045512.2|C|14408|T,4,2009127120353451203535382008910120277322,1,1,0,1,1,0,0
5,NC_045512.2|A|20268|G,2,2009127120089101,1,0,0,1,0,0,0
6,NC_045512.2|G|20373|A,1,20091271,1,0,0,0,0,0,0
7,NC_045512.2|A|23403|G,4,20091271203535382008910120277322,1,1,0,1,1,0,0
8,NC_045512.2|C|6286|T,2,203534512035353820277322,0,1,0,0,1,0,0
10,NC_045512.2|C|26801|G,2,2035353820277322,0,1,0,0,1,0,0
11,NC_045512.2|G|11083|T,1,20179107,0,0,1,0,0,0,0
