# Mutational Clustering

Computes multichain mutational clustering
## Input files:
* Structure pdb file (cleaned or not)
* Cosmic *.csv* file from https://cancer.sanger.ac.uk/cosmic
>> *Gene page-> Variants-> Mutations -> Export: csv*
* Consurf *.txt* file from https://consurf.tau.ac.il/consurf_index.php
>> *Launch job-> View results summary (download)*

## Input variables
* Consurf file sequence reading option:
>* s for SEQRES derived sequence
>* a for ATOM derived sequence
* Output name
* Output directory pathname
* Required chain for plotting

## Output variables
* Count = number of total mutations from Cosmic *.csv* file
* Variants = number of variants from Cosmic *.csv* file
* CS Score = conservation score from Consurf *.txt* file
* W1 Score = Count * CS score /20
* Neighbs = number of neighboring residues within an 8 Å cutoff (Cα–Cα distance)
* M Neighbs = number of mutated neighbors
* Neigh Muts = number of total mutations on neighbors
* W2 Score = CS score / 9 * Neigh Muts (set to 0 for residues with Count = 0)
* W2 Norm = W2 Score min-max normalized to the 0–100 range










In [None]:
#import time
#start_time= time.time ()
import pandas as pd
import numpy as np
import re
from collections import OrderedDict
from scipy.spatial import distance

# Lift the column and row display limits so DataFrames are shown in full.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)
# Optional tweaks kept for reference:
#pd.set_option('display.width', 300)
#pd.reset_option('all')

In [None]:
filein = input('Enter PDB file pathname:\n')

# Character slices (start, end) used to cut each fixed-width line into fields;
# one slice per entry of `names` below.
colspecs = [
    (0, 6), (6, 11), (12, 16), (16, 17), (17, 20), (21, 22), (22, 26),
    (26, 27), (30, 38), (38, 46), (46, 54), (54, 60), (60, 66), (76, 78),
    (78, 80),
]

names = [
    'ATOM', 'serial', 'name', 'altloc', 'resname', 'chainid', 'resseq',
    'icode', 'x', 'y', 'z', 'occupancy', 'tempfactor', 'element', 'charge',
]

with open(filein, 'r') as fin:
    wholepdb = fin
    pdb = pd.read_fwf(wholepdb, names=names, colspecs=colspecs)

# Keep rows that are ATOM records, whose atom name is CA, and whose alternate
# location field is either empty or 'A'.
is_atom_record = pdb['ATOM'] == 'ATOM'
is_ca = pdb['name'] == 'CA'
is_primary_altloc = pdb['altloc'].isna() | (pdb['altloc'] == 'A')
pdb = pdb.loc[is_atom_record & is_ca & is_primary_altloc]

# Index the table by integer residue number.
pdb = pdb.set_index('resseq')
pdb.index = pdb.index.astype(int)

pdb.head()

In [None]:
""" .csv mutation export file in """

def opencsv(fin, chain, pdb_groups=None):
    """Summarise missense mutations per residue for one chain.

    Parameters
    ----------
    fin : str or file-like
        Mutation ``.csv`` export, passed straight to ``pandas.read_csv``.
        Its first column is used as the index and is grouped on under the
        name ``Position``; the columns ``CDS Mutation``, ``AA Mutation``,
        ``Legacy Mutation ID``, ``Type`` and ``Count`` must be present.
    chain : hashable
        Key of the chain to select from ``pdb_groups``.
    pdb_groups : pandas GroupBy, optional
        PDB table grouped by chain; each group must expose a ``chainid``
        column and be indexed by residue number. Defaults to the
        notebook-level ``grouped`` object.

    Returns
    -------
    tuple
        ``(cv, mut_)``. ``mut_`` holds the summed ``Count`` and number of
        ``Variants`` per mutated position. ``cv`` is ``mut_`` re-indexed on
        the residues of ``chain`` (zeros where no mutation was recorded),
        with a leading ``Chain`` column and an integer ``Residue`` index.
    """
    if pdb_groups is None:
        # Backward-compatible default: the groupby built elsewhere in the notebook.
        pdb_groups = grouped
    chain_rows = pdb_groups.get_group(chain)

    mut = pd.read_csv(fin, index_col=0)

    # Rows without a position cannot be mapped onto a residue.
    mut = mut[mut.index.notnull()]
    mut.index = mut.index.astype(int)
    mut = mut.drop(labels=['CDS Mutation', 'AA Mutation', 'Legacy Mutation ID'], axis=1)

    # Keep missense entries only. na=False treats an empty Type cell as
    # "not missense" instead of producing a NaN mask that .loc rejects.
    mut = mut.loc[mut['Type'].str.contains('Missense', na=False)]
    mut = mut.drop(labels=['Type'], axis=1)

    # One row is one variant, so summing this column counts variants.
    mut['Variants'] = 1
    mut_ = mut.groupby(['Position'])[['Count', 'Variants']].sum()

    # Expand to every residue of the chain, filling unmutated residues with 0.
    cv = mut_.reindex(chain_rows.index, fill_value=0)
    cv.index.names = ['Residue']

    cv.insert(loc=0, column='Chain', value=chain_rows['chainid'])
    cv.index = cv.index.astype(int)

    return (cv, mut_)

#cv, mut_=opencsv(fin)

In [None]:
# Split the CA table by chain identifier; sort=False leaves the group keys unsorted.
grouped = pdb.groupby(pdb['chainid'], sort=False)
# Scratch checks kept for reference:
#grouped.get_group('L')
#pdb['chainid'].unique()
#print(len(grouped))

# Collect the chain identifiers in the order the groupby yields them.
chains = [chain_name for chain_name, _ in grouped]

chains

In [None]:
# Ask which residue numbering of the consurf file should be used.
opt = input(
    'Select consurf file sequence reading option:\n\ts for SEQRES derived sequence\n\ta for ATOM derived sequence\n\t'
)
# Anything other than the two supported single-letter codes is rejected.
if opt not in ('s', 'a'):
    raise ValueError('Please type "s" (SEQRES) or "a" (ATOM)')

def openconsurf(filein, option=None):
    """Parse a consurf text file into a per-residue conservation table.

    Everything up to and including the (last) line containing
    ``(normalized)`` is treated as header, and the final four lines as
    footer. Each remaining line is split on tabs; lines whose third field
    contains ``-`` are skipped.

    Parameters
    ----------
    filein : iterable of str
        Open text file (or any iterable of lines) in consurf format.
    option : {'s', 'a'}, optional
        ``'s'`` takes the residue number from the first field of each row,
        ``'a'`` takes the first run of digits found in the third field.
        Defaults to the notebook-level ``opt`` value.

    Returns
    -------
    pandas.DataFrame
        Single integer column ``CS score`` (sixth field, stripped of spaces
        and ``*``), indexed by integer ``Residue``.

    Raises
    ------
    ValueError
        If ``option`` is not ``'s'`` or ``'a'``, or if no line containing
        ``(normalized)`` is found.
    """
    if option is None:
        # Backward-compatible default: the choice entered earlier in the notebook.
        option = opt
    if option not in ('s', 'a'):
        raise ValueError('Please type "s" (SEQRES) or "a" (ATOM)')

    lines = [line.rstrip('\n') for line in filein]

    # Locate the end of the header block.
    cut = None
    for position, line in enumerate(lines):
        if '(normalized)' in line:
            cut = position + 1
    if cut is None:
        raise ValueError('No "(normalized)" header line found in consurf file')

    # Drop the header and the four trailing footer lines.
    rows = lines[cut:]
    del rows[-4:]

    records = []
    for row in rows:
        fields = row.split('\t')
        # A '-' in the third field marks a row that must be skipped.
        if '-' in fields[2]:
            continue
        score = fields[5].strip(' *')
        if option == 's':
            residue = fields[0].strip(' ')
        else:
            # Raw string: '\d' in a plain literal is an invalid escape sequence.
            residue = int(re.search(r'\d+', fields[2]).group())
        records.append((residue, score))

    mc = pd.DataFrame(records, columns=['Residue', 'CS score'])
    mc['CS score'] = mc['CS score'].astype(int)
    mc = mc.set_index("Residue")
    mc.index = mc.index.astype(int)

    return (mc)

#mc= openconsurf (fin)

In [None]:
# Per-chain result tables and per-chain grouped mutation tables, in chain order.
df_dict = OrderedDict()
mut_dict = OrderedDict()

for ele in chains:
    # Mutation export for this chain.
    csvin = input('Enter mutation .csv file path for chain {}:\n'.format(ele))
    with open(csvin, 'r') as fin:
        cv, mut_ = opencsv(fin, ele)

    # Conservation scores for this chain.
    consurfin = input('Enter consurf .txt file path for chain {}:\n'.format(ele))
    with open(consurfin, 'r') as fin:
        mc = openconsurf(fin)

    # Put mutation counts and conservation scores side by side, matched on residue.
    cvc = pd.concat([cv, mc], axis=1)
    cvc.index.name = None

    w1_numerator = cvc['Count'] * cvc['CS score']
    cvc['W1 score'] = w1_numerator / 20

    df_dict[ele] = cvc
    mut_dict[ele] = mut_



In [None]:
""" concatenate all dataframes """

# Stack the per-chain tables on top of each other.
compw1 = pd.concat(df_dict.values())
#compw1


In [None]:
# Discard every row that has a missing value in any column.
compw1 = compw1.dropna(axis=0, how='any')
compw1

In [None]:
# Two-level index: the 'Chain' column first, the existing row index second.
_chain_residue_keys = ['Chain', compw1.index]
multins = compw1.set_index(_chain_residue_keys, drop=True)

# Sorted copy used by the later cells.
multis = multins.sort_index()
multins

In [None]:
# Same two-level (chain, residue) indexing for the PDB table, then sorted.
_pdb_keys = ['chainid', pdb.index]
pdbs = pdb.set_index(_pdb_keys, drop=True).sort_index()
pdbs

In [None]:
# Base name and target directory used to build every output file path.
outname = input('enter output name:\n')
outdir = input('enter output directory pathname:\n')

In [None]:
""" write vmd W1 score visualization state """

def writevmd():
    """Write ``<outdir>/<outname>_w1.vmd`` with one sphere line per residue.

    Reads the notebook-level ``pdbs`` (coordinates), ``multis`` (``W1 score``
    column), ``outdir`` and ``outname``. Each output row has the form
    ``draw sphere " x y z " radius <W1 score> resolution 20``; rows with a
    missing coordinate or score are left out. Returns nothing.
    """
    data = {
        'form1': ' draw sphere "',
        'x': pdbs['x'],
        'y': pdbs['y'],
        'z': pdbs['z'],
        'form2': ' " ',
        'form3': 'radius',
        'W score': multis['W1 score'],
        'form4': 'resolution 20'
    }

    vmd = pd.DataFrame(data=data)
    vmd = vmd.dropna()

    hd = ' molecule new\n display resetview\n draw color orange\n'
    # Render first, so a formatting error cannot leave a truncated file behind.
    body = hd + vmd.to_string(header=False, index=False)

    # The context manager closes the file even if the write raises.
    with open('{}/{}_w1.vmd'.format(outdir, outname), 'w') as vmdout:
        vmdout.write(body)

writevmd()

In [None]:
""" neighboring residues """

# Chain identifier and coordinates of every row of the PDB table.
data = {column: pdb[column] for column in ('chainid', 'x', 'y', 'z')}

coo = pd.DataFrame(data=data, index=pdb.index)
# Key the coordinates by (chain, integer residue number) and sort them.
_coo_keys = ['chainid', pdb.index.astype(int)]
coo = coo.set_index(_coo_keys, drop=True)
coo = coo.sort_index()
coo

In [None]:
#mfa= coo.loc[('G', 368):('L', 4), :]
#mic= mfa.to_numpy()
#mfa

In [None]:
""" matrix parsing function """

def ret(din, cutoff=8, counts=None):
    """Collect neighbor statistics from a square distance table.

    For every column label ``key`` of ``din``, each row label ``n`` whose
    distance is neither 0 nor greater than ``cutoff`` counts as a neighbor
    of ``key``.

    Parameters
    ----------
    din : pandas.DataFrame
        Pairwise distances, with residue labels on both rows and columns.
    cutoff : float, optional
        Largest distance still considered a neighbor (default 8).
    counts : pandas.Series, optional
        Mutation count per residue label. Defaults to the ``Count`` column
        of the notebook-level ``multis`` table. A neighbor missing from
        this series raises ``KeyError``.

    Returns
    -------
    tuple
        ``(d, l, dn, df)``: ``d`` maps each label to its number of
        neighbors, ``l`` lists ``(neighbor, label)`` pairs, ``dn`` maps each
        label to the number of neighbors with a non-zero count, and ``df``
        maps each label to the sum of its neighbors' counts.
    """
    if counts is None:
        # Look the column up once instead of on every neighbor.
        counts = multis['Count']

    d = {}   # number of neighbors
    dn = {}  # number of mutated neighbors
    l = []   # neighbor pairs
    df = {}  # sum of mutation counts on neighbors

    for key, column in din.items():
        d[key] = 0
        dn[key] = 0
        df[key] = 0
        for n, dist in column.items():
            # Too far away, or zero distance (the residue itself).
            if dist > cutoff or dist == 0:
                continue
            d[key] += 1
            l.append((n, key))

            neighbor_count = counts.loc[n]
            if neighbor_count != 0:
                dn[key] += 1
                df[key] += neighbor_count

    return (d, l, dn, df)

#ret(fos)

In [None]:
""" building neighboring matrix """
# Coordinates as a plain float array, one row per entry of coo.
coo = coo.astype(float)
boom = coo[['x', 'y', 'z']].to_numpy()
#dbr=pd.DataFrame(boom, columns=['x', 'y', 'z'], index=pdb.index)

# All-against-all Euclidean distances, labelled with coo's index on both axes.
mat = distance.cdist(boom, boom, metric='euclidean')

matr = pd.DataFrame(data=mat, index=coo.index, columns=coo.index)

In [None]:
# Run the neighbor scan on the distance table and unpack its four results.
_neighbour_stats = ret(matr)
neighcount, neigh, mutneighcount, neighfreqs = _neighbour_stats

In [None]:
# Attach the per-residue neighbor statistics. Going through a Series keyed by
# the dict keys makes pandas match values to rows by label, so the result does
# not rely on the dicts and multis happening to share the same row order.
multis['Neighbs'] = pd.Series(neighcount)          # number of neighbors
multis['M Neighbs'] = pd.Series(mutneighcount)     # number of mutated neighbors
multis['Neigh Muts'] = pd.Series(neighfreqs)       # sum of mutation counts on neighbors
multis['W2 Score'] = ((multis['CS score'] / 9) * multis['Neigh Muts']).round(2)

# Residues without any mutation of their own get no W2 score.
multis.loc[multis['Count'] == 0, 'W2 Score'] = 0

# Min-max scale the W2 score to 0-100. When every score is identical the range
# is 0 and the division is undefined, so the normalized score is set to 0.
w2 = multis['W2 Score']
w2_range = w2.max() - w2.min()
if w2_range == 0:
    multis['W2 Norm'] = 0.0
else:
    multis['W2 Norm'] = ((w2 - w2.min()) / w2_range * 100).round(2)

In [None]:
""" export clustering text file """

def multiout():
    """Write the whole ``multis`` table to ``<outdir>/<outname>_clustering.txt``.

    Reads the notebook-level ``multis``, ``outdir`` and ``outname``; sets
    ``multis.index.name`` to ``'Res'`` before rendering. Returns nothing.
    """
    multis.index.name = 'Res'
    # Render first, so a formatting error cannot leave a truncated file behind.
    out = multis.to_string()
    # The context manager closes the file even if the write raises.
    with open('{}/{}_clustering.txt'.format(outdir, outname), 'w') as handle:
        handle.write(out)

multiout()

In [None]:
""" export clustering sort by w1 """
def sortw1out():
    """Write ``multis`` sorted by descending ``W1 score``.

    The table goes to ``<outdir>/<outname>_sort_w1.txt``; ``multis``,
    ``outdir`` and ``outname`` are read from the notebook namespace.
    Returns nothing.
    """
    sortw1 = multis.sort_values(by=['W1 score'], ascending=False)
    out = sortw1.to_string()
    # The context manager closes the file even if the write raises.
    with open('{}/{}_sort_w1.txt'.format(outdir, outname), 'w') as handle:
        handle.write(out)

sortw1out()

In [None]:
def sortw2out():
    """Write ``multis`` sorted by descending ``W2 Score``.

    The table goes to ``<outdir>/<outname>_sort_w2.txt``; ``multis``,
    ``outdir`` and ``outname`` are read from the notebook namespace.
    Returns nothing.
    """
    sortw2 = multis.sort_values(by=['W2 Score'], ascending=False)
    out = sortw2.to_string()
    # The context manager closes the file even if the write raises.
    with open('{}/{}_sort_w2.txt'.format(outdir, outname), 'w') as handle:
        handle.write(out)

sortw2out()

In [None]:
def sortcount():
    """Write ``multis`` sorted by descending ``Count``.

    The table goes to ``<outdir>/<outname>_sort_count.txt``; ``multis``,
    ``outdir`` and ``outname`` are read from the notebook namespace.
    Returns nothing.
    """
    sortc = multis.sort_values(by=['Count'], ascending=False)
    out = sortc.to_string()
    # The context manager closes the file even if the write raises.
    with open('{}/{}_sort_count.txt'.format(outdir, outname), 'w') as handle:
        handle.write(out)

sortcount()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
pchain = input('Input required chain for plotting\n')
# Turn both index levels into ordinary columns, then rename the column called
# 'level_1' to 'Residue' (no effect if no such column exists).
pmultis = multis.reset_index(level=[0, 1])
pmultis = pmultis.rename(columns={'level_1': 'Residue'})


In [None]:
# Rows of the requested chain only.
mgj = pmultis[pmultis['Chain'] == pchain]

# One line plot per score column, each saved under its own file name.
_plot_specs = (
    ('Count', '{}/{}_plot_count_chain{}.png'),
    ('W1 score', '{}/{}_plot_w1_chain{}.png'),
    ('W2 Norm', '{}/{}_plot_w2_chain{}.png'),
)
_axes = []
for _column, _template in _plot_specs:
    _axes.append(mgj.plot(y=_column, x='Residue'))
    fout = _template.format(outdir, outname, pchain)
    plt.savefig(fout, dpi=300, facecolor='white', bbox_inches='tight')

pcount, pw1, pw2 = _axes

In [None]:
# Build a PDB-like table whose 'tempfactor' column carries the normalized W2 score.
pdbw2= pdb
# Index by (chain, residue number); drop=False keeps 'chainid' as a regular column too.
pdbw2= pdbw2.set_index(['chainid', pdb.index], drop=False)
pdbw2= pdbw2.sort_index()
# Copy the second index level back into a regular column named 'res', placed at
# column position 6.
mf=pdbw2.index.get_level_values(1)
pdbw2.insert(loc=6, column='res', value=mf)
pdbw2=pdbw2.drop(labels=['element', 'charge', 'altloc'], axis=1)
# Series assignment matches rows by index label, so rows absent from multis get NaN.
pdbw2['tempfactor']=multis['W2 Norm']
# Constant placeholder strings for the remaining two columns.
pdbw2['icode']='  '
pdbw2['occupancy']='1.00'
# Drop rows whose score is exactly 0 (NaN rows, if any, pass this filter).
pdbw2= pdbw2.loc[pdbw2['tempfactor'] != 0]
# Order the remaining rows by integer atom serial number.
pdbw2['serial']=pdbw2['serial'].astype(int)
pdbw2=pdbw2.sort_values('serial')
pdbw2.head()

In [None]:
def pdbw2out():
    """Write the ``pdbw2`` table to ``<outdir>/<outname>_w2.pdb``.

    The table is rendered with ``DataFrame.to_string`` without header or
    index, using minimum column widths for ``serial``, ``name``,
    ``resname``, ``occupancy`` and ``tempfactor``. ``pdbw2``, ``outdir``
    and ``outname`` are read from the notebook namespace. Returns nothing.
    """
    col_space = {'serial': 6, 'name': 3, 'resname': 4, 'occupancy': 5, 'tempfactor': 5}
    # Render first, so a formatting error cannot leave a truncated file behind.
    out = pdbw2.to_string(col_space=col_space, header=False, index=False)
    # The context manager closes the file even if the write raises.
    with open('{}/{}_w2.pdb'.format(outdir, outname), 'w') as pdbwout:
        pdbwout.write(out)

pdbw2out()