In [None]:
import pandas as pd

In [None]:
# Read the first control's VarScan results (run SRR1705860, per the file name)
# and preview the first five rows.
# NOTE(review): the 'Unnamed: 0' column in the preview looks like a row index that was
# written into the CSV; index_col=0 would probably absorb it — confirm before changing.
control_1 = pd.read_csv('VarScan_results_SRR1705860.csv')
control_1.head()

Unnamed: 0,position,reference_base,alternative_base,frequency
0,38,T,C,0.66%
1,54,T,C,0.3%
2,72,A,G,0.3%
3,95,A,G,0.24%
4,117,C,T,0.3%


In [None]:
# Read the second control's VarScan results (run SRR1705859, per the file name)
# and preview the first five rows.
# NOTE(review): the 'Unnamed: 0' column in the preview looks like a row index that was
# written into the CSV; index_col=0 would probably absorb it — confirm before changing.
control_2 = pd.read_csv('VarScan_results_SRR1705859.csv')
control_2.head()

Unnamed: 0,position,reference_base,alternative_base,frequency
0,44,T,C,0.47%
1,158,A,G,0.24%
2,165,T,C,0.27%
3,183,A,G,0.22%
4,193,A,G,0.22%


In [None]:
# Read the third control's VarScan results (run SRR1705858, per the file name)
# and preview the first five rows.
# NOTE(review): the 'Unnamed: 0' column in the preview looks like a row index that was
# written into the CSV; index_col=0 would probably absorb it — confirm before changing.
control_3 = pd.read_csv('VarScan_results_SRR1705858.csv')
control_3.head()

Unnamed: 0,position,reference_base,alternative_base,frequency
0,38,T,C,0.7%
1,44,T,C,0.5%
2,95,A,G,0.24%
3,105,A,G,0.25%
4,133,A,G,0.22%


In [None]:
def prepare_df(df):
  """Convert the 'frequency' column to float in place and print its mean and std.

  String cells such as '0.66%' have their '%' characters stripped before the
  cast. Cells that are not strings (for example floats left behind by an
  earlier call on the same frame) are passed through unchanged, so running
  this function more than once on one frame is safe.

  Args:
    df: DataFrame with a 'frequency' column. The column is overwritten on
      the caller's object.

  Returns:
    None. The mean and standard deviation of the converted column are
    printed with the labels 'avg:' and 'std:'.
  """
  # Only strings carry a '%' suffix; guarding on the type keeps a re-run from
  # failing with "'float' object has no attribute 'strip'".
  df['frequency'] = (
      df['frequency']
      .map(lambda value: value.strip('%') if isinstance(value, str) else value)
      .astype(float)
  )
  avg = df['frequency'].mean()
  std = df['frequency'].std()
  print('avg:', avg)
  print('std:', std)

In [None]:
# Convert control_1's 'frequency' column to float in place and print its mean and std.
prepare_df(control_1)

avg: 0.25649122807017544
std: 0.07172594738880801


In [None]:
# Convert control_2's 'frequency' column to float in place and print its mean and std.
prepare_df(control_2)

avg: 0.2369230769230769
std: 0.05237640770866741


In [None]:
# Convert control_3's 'frequency' column to float in place and print its mean and std.
prepare_df(control_3)

avg: 0.250327868852459
std: 0.07803775182808968


## Combine all control samples

In [None]:
# Stack the three control frames into one table. Their 'frequency' columns have
# already been converted to float by the prepare_df calls in the cells above.
data = pd.concat(objs=[control_1, control_2, control_3])

# Mean and standard deviation of the pooled control frequencies; the filtering
# cell further down reads both names.
avg, std = data['frequency'].mean(), data['frequency'].std()

(avg, std)

(0.24829411764705883, 0.06898268773972734)

## Filter significant mutations

In [None]:
# Read the roommate sample's table, then strip '%' from every 'frequency' cell
# and cast the column to float (the same parsing prepare_df applies to the controls).
roommate = pd.read_csv('roommate.csv')
roommate['frequency'] = (
    roommate['frequency']
    .map(lambda cell: cell.strip('%'))
    .astype(float)
)

In [None]:
# Keep the rows whose frequency is strictly more than three control standard
# deviations below or above the control mean (avg and std come from the pooled controls).
filtered = roommate.loc[
    roommate['frequency'].lt(avg - 3 * std) | roommate['frequency'].gt(avg + 3 * std)
]
filtered

Unnamed: 0,position,reference_base,alternative_base,frequency
0,72,A,G,99.96
1,117,C,T,99.82
4,307,C,T,0.94
10,774,T,C,99.96
14,999,C,T,99.86
18,1260,A,C,99.94
20,1458,T,C,0.84


## Epitope mapping

In [None]:
# Lookup table: epitope label ('A'..'E') -> list of positions belonging to that epitope.
# find_epitope compares these against (nucleotide_position - 1) // 3 + 1, i.e. they are
# treated as 1-based codon / amino-acid positions.
# NOTE(review): the source and numbering scheme of these site lists is not stated in the
# notebook — confirm that it matches the reference the variants were called against.
epitopes = {
    'A': [122, 124, 126, 130, 131, 132, 133, 135, 137, 138, 140, 142, 143, 144, 145, 146, 150, 152, 168],
    'B': [128, 129, 155, 156, 157, 158, 159, 160, 163, 165, 186, 187, 188, 189, 190, 192, 193, 194, 196, 197, 198],
    'C': [44, 45, 46, 47, 48, 50, 51, 53, 54, 273, 275, 276, 278, 279, 280, 294, 297, 299, 300, 304, 305, 307, 308, 309, 310, 311, 312],
    'D': [
        96, 102, 103, 117, 121, 167, 170 , 171, 172, 173, 174, 175, 176, 177, 179, 182, 201, 203, 207, 208, 209, 
        212, 213, 214, 215, 216, 217, 218, 219, 226, 227, 228, 229, 230, 238, 240, 242, 244, 246, 247, 248,
    ],
    'E': [57, 59, 62, 63, 67, 75, 78, 80, 81, 82, 83, 86, 87, 88, 91, 92, 94, 109, 260, 261, 262, 265]
}

def find_epitope(x, epitope_map=None):
    """Return the label of the epitope that contains nucleotide position ``x``.

    The 1-based nucleotide position is first converted to a 1-based codon
    index with ``(x - 1) // 3 + 1`` (positions 1-3 -> 1, 4-6 -> 2, ...). The
    epitope table is then scanned in insertion order and the first label
    whose position collection contains that index is returned.

    Args:
        x: 1-based nucleotide position.
        epitope_map: Optional mapping of label -> collection of codon
            positions. When omitted, the module-level ``epitopes`` table is
            used, which is the behaviour of the original one-argument call.

    Returns:
        The matching label, or None when no epitope contains the position.
    """
    if epitope_map is None:
        # Resolved at call time so the function follows later edits to the global table.
        epitope_map = epitopes
    protein_pos = (x - 1) // 3 + 1
    for label, sites in epitope_map.items():
        if protein_pos in sites:
            return label
    return None

# Label each variant with the value find_epitope returns for its position
# (None when no epitope matches), then display only the labelled rows.
roommate['epitope'] = roommate['position'].apply(find_epitope)
roommate.loc[roommate['epitope'].notna()]

Unnamed: 0,position,reference_base,alternative_base,frequency,epitope
3,276,A,G,0.17,E
4,307,C,T,0.94,D
6,389,T,C,0.22,A
9,744,A,G,0.17,D
13,915,T,C,0.19,C
