# From Genotype data to IMI, Breakpts, and Haplotypes

###### From genotype data, calculate the intermarker intervals (IMI) for each sample, find genotype changes or break points (breakpts), and construct haplotypes. Then filter these data to generate new genotype data for each sample and visualize the effect of the filter.

#### Import needed modules

In [1]:
import numpy as np, pandas as pd, scipy.stats as ss, os
from matplotlib import pyplot as plt
%matplotlib inline

## Set local variables 

In [2]:
## Vertical layout of the two genotype tracks in each figure:
## genotypes are scaled by `mod`, and the filtered track is lifted by `ymod`.
mod, ymod = 0.1, 1
yla, ylb = -0.5, 1.5            # y-axis limits
## Horizontal layout: left limit, padding added to the chromosome length on the
## right, and the divisor used to place the haplotype-count annotations.
xlim0, xlim1 = -100000, 500000
xdi = 4.0
## Marker and centromere appearance
alvl = 0.3                      # transparency
centcolor, snpcolor = 'red', 'black'
## Haplotype construction: boundary method and size cutoff passed to the filter
Method = 'min'
k1 = 6000
## Not read by any code visible here; it is rebound by the sample loops below.
s = 0

### Load Genotype files, centromere locations, and chromosome length

In [3]:
#genodf = pd.read_csv('/Users/croth/Desktop/Hypermutator/FILES/Bt65_prog_QTL_info.csv',index_col=0)
#genodf.drop('ANN',axis=1,inplace=True)
#genodf['Chrom'] = [int(a.split('_')[-1]) for a in genodf.Chrom.tolist()]

In [4]:
## Read the genotype table: one row per marker, with the per-sample genotype
## calls held in the columns whose names start with 'PMY'.
geno_path = '../../GENOTYPE/Bt22xFtc555-1_loci_cor.csv.gz'
genodf = pd.read_csv(geno_path)
## (markers, columns)
genodf.shape

(46760, 331)

In [5]:
## Preview the first rows of the genotype table
genodf.head()

Unnamed: 0,Oldindex,Chrom,Pos,Alleles,Type,Newpos,Newchrom,Oldchrom,Maf,PMY2556,...,PMY2931,PMY2932,PMY2933,PMY2934,PMY2935,PMY2936,PMY2937,PMY2938,PMY2939,PMY2940
0,5,1,25980,C.T,snp,25980,1.0,1.0,0.491857,1,...,0,1,1,0,1,0,1,1,0,0
1,7,1,26119,ATT.GTT,snp,26119,1.0,1.0,0.491857,1,...,0,1,1,0,1,0,1,1,0,0
2,12,1,26587,ACCTT.TCCAT.TCCTT.GCCTT,complex,26587,1.0,1.0,0.491857,1,...,0,1,1,0,1,0,1,1,0,0
3,15,1,27290,TCCC.CCCC,snp,27290,1.0,1.0,0.491857,1,...,0,1,1,0,1,0,1,1,0,0
4,17,1,28274,GCGCT.ACGCT,snp,28274,1.0,1.0,0.491857,1,...,0,1,1,0,1,0,1,1,0,0


In [6]:
## Centromere coordinates, one (start, end) row per chromosome in chromosome
## order; column 0 is the start and column 1 the end.
## This was taken from Vikas Yadav's paper in PNAS 2018.
centromere = pd.DataFrame([
    (970169, 1006931),
    (835384, 889427),
    (1378288, 1409632),  ## Chromosome 3 was edited here to include a 1 at beginning.
    (708804, 752337),
    (1559983, 1587231),
    (780649, 821756),
    (525714, 584338),
    (451162, 512653),
    (801830, 839446),
    (199434, 243741),
    (868824, 933658),
    (139633, 171048),
    (579772, 632362),
    (441845, 477986),
])

In [7]:
## Chromosome map with the last row of the file left out, indexed by chromosome
## number while keeping 'Chrom' available as a regular column.
h99clen = (pd.read_csv('../../NOTES/H99_chrommap.csv')
             .iloc[:-1]
             .set_index('Chrom', drop=False))
h99clen

Unnamed: 0_level_0,Chrom,Seqid,Length,Cumpos,Midpts
Chrom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,CP003820.1,2291499,0,1145749.5
2,2,CP003821.1,1621675,2291499,3102336.5
3,3,CP003822.1,1575141,3913174,4700744.5
4,4,CP003823.1,1084805,5488315,6030717.5
5,5,CP003824.1,1814975,6573120,7480607.5
6,6,CP003825.1,1422463,8388095,9099326.5
7,7,CP003826.1,1399503,9810558,10510309.5
8,8,CP003827.1,1398693,11210061,11909407.5
9,9,CP003828.1,1186808,12608754,13202158.0
10,10,CP003829.1,1059964,13795562,14325544.0


In [8]:
## Chromosome lengths as an array; the loops below index it by position (clens[ch])
clens = h99clen.Length.values

### Find Chromosome and gather samples

In [9]:
## Sorted chromosome identifiers present in the genotype table
chrlist = sorted(genodf.Chrom.unique())
chrlist

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]

In [10]:
## Sample columns are the ones whose header begins with 'PMY'
segs = [col for col in genodf.columns if col.startswith('PMY')]
len(segs)

322

### Set folder paths

In [11]:
## Sub-folders made under every chromosome folder. Later cells pick them by
## position: 0 IMI tables, 1 break points, 2 haplotypes, 3 filtered genotypes,
## 4 figures.
folders = ['IMI/', 'BPS/', 'HAP/', 'FGT/', 'FIG/']

## One root folder per chromosome, numbered from 1
CHRPATHS = ['./FILES/CHR%d/' % (n + 1) for n in range(len(clens))]

## FILECHRPATHS[chromosome index][folder index] -> path prefix
FILECHRPATHS = [[root + sub for sub in folders] for root in CHRPATHS]
FILECHRPATHS[0][:]

['./FILES/CHR1/IMI/',
 './FILES/CHR1/BPS/',
 './FILES/CHR1/HAP/',
 './FILES/CHR1/FGT/',
 './FILES/CHR1/FIG/']

#### Make directory

In [12]:
## Create one folder per sample inside every output folder of every chromosome.
## exist_ok=True makes re-running the cell harmless and avoids the
## check-then-create race of testing os.path.exists first.
for chrom_dirs in FILECHRPATHS:
    for directory in chrom_dirs:
        for seg in segs:
            director = directory + seg  ## kept as a notebook-level name: the next cell displays it
            os.makedirs(director, exist_ok=True)

In [13]:
## Show the last directory path built by the loop above
director

'./FILES/CHR14/FIG/PMY2940'

## Calculate IMI, Recombination Breakpts, and Haplotypes

In [14]:
def haplotypes(imi,clen,method='min'):
    """
    Build the haplotype table of one chromosome from its inter-marker intervals.

    A haplotype is a run of markers between two consecutive genotype changes,
    i.e. a crossover or non-crossover gene conversion tract.

    Inputs: IMI, a DataFrame of inter-marker intervals, one row per pair of
            adjacent markers. The columns read here are:
            v: starting marker position of the interval
            w: ending marker position of the interval
            X: number of detected genotype changes (non-zero marks a break point)
            It must hold at least one row.

            CLEN, the length of the chromosome.

            METHOD, how the 5' and 3' ends of each haplotype are placed:
            'min'  first and last marker inside the haplotype (default)
            'max'  one base inside the flanking markers (or chromosome ends)
            any other value: midpoint between the inner and flanking marker

    Output: HAPDF, a DataFrame containing haplotype information with 3 columns:
            v: 5' starting position of the haplotype.
            w: 3' ending position of the haplotype.
            nb: The size of the haplotype, int(w - v + 1).

    Raises: IndexError if IMI has no rows.
    """
    ## Intervals across which the genotype changes are the break points
    bp = imi[imi.X != 0].dropna(axis=0,how='any')
    ## 1st and last marker positions
    pos1 = int(imi.v.iloc[0])
    pose = int(imi.w.iloc[-1])
    ## Every haplotype starts at the first marker or just after a break point ...
    start = [pos1] + bp.w.tolist()
    ## ... and stops just before a break point or at the last marker
    stops = bp.v.tolist() + [pose]
    ## Sorted, the list reads: 0, start, stop, start, stop, ..., chromosome length,
    ## so the odd positions are haplotype starts and the following entry their stop.
    markers = sorted([0] + start + stops + [clen])
    first_markers = range(1,len(markers)-1,2)
    hapdf = pd.DataFrame()
    ## Compare by value: `is` tests object identity and only matched interned literals
    if method == 'min':
        hapdf['v'] = [markers[i] for i in first_markers]
        hapdf['w'] = [markers[i+1] for i in first_markers]
    elif method == 'max':
        hapdf['v'] = [markers[i-1]+1 for i in first_markers]
        hapdf['w'] = [markers[i+2]-1 for i in first_markers]
    else: ## Method is midpoint
        hapdf['v'] = [(markers[i-1]+markers[i])*.5 for i in first_markers]
        hapdf['w'] = [(markers[i+1]+markers[i+2])*.5 for i in first_markers]
    hapdf['nb'] = [int(w - v + 1) for v,w in zip(hapdf.v,hapdf.w)]
    return hapdf

def haplotype_filter(hapdf,k1):
    """
    Filters haplotype dataframe by size.

    The smallest haplotype is repeatedly absorbed into its neighbour(s) until
    every remaining haplotype is at least K1 long, or only one is left.
    The merged haplotype spans from the start of the left neighbour to the end
    of the right neighbour and takes the flipped genotype, abs(genotype - 1),
    which for 0/1-coded genotypes is the genotype of the neighbours.

    Inputs: HAPDF, a dataframe with chromosomal haplotype information, ordered
            left to right along the chromosome. It is not modified.
            This includes columns named,
            "v" beginning of haplotypes (left to right) chromosomal position in bases.
            "w" ending of haplotypes in chromosomal positions.
            "nb" The size of the haplotype.
            "genotype" the genotype that occurs most often for the markers within a haplotype.

            K1, the cutoff size for the haplotypes. Haplotypes < K1 are filtered out.

    Output: HAP, a dataframe with filtered chromosomal haplotypes and the same columns,
            start, stop, size, and genotype. An empty table, or a single haplotype
            (which has no neighbour to merge with), is returned unchanged even
            when it is shorter than K1.
    """
    fields = ['v','w','nb','genotype']
    ## Work on a copy with a 0..n-1 index; the neighbour look-ups below rely on it
    hap = hapdf.copy().reset_index(drop=True)
    ## Stop once one haplotype remains: there is nothing left to merge it with
    while len(hap) > 1 and hap.nb.min() < k1:
        ri = hap.nb.idxmin() ## first of the smallest haplotypes
        last = len(hap) - 1
        flipped = abs(hap.genotype[ri] - 1)
        if ri == 0: ## First haplotype: only a right neighbour
            assert hap.genotype[ri] != hap.genotype[ri+1]
            left,right,absorbed = ri,ri+1,[ri+1]
        elif ri == last: ## Last haplotype: only a left neighbour
            assert hap.genotype[ri] != hap.genotype[ri-1]
            left,right,absorbed = ri-1,ri,[ri-1]
        else: ## Any haplotype between 1st and last haplotype
            left,right,absorbed = ri-1,ri+1,[ri-1,ri+1]
        nv = hap.v[left]
        nw = hap.w[right]
        ## Size is stored the way haplotypes() computes it
        nsize = int(nw - nv + 1)
        ## Write by column name so the update does not depend on column order
        hap.loc[ri,fields] = [nv,nw,nsize,flipped]
        ## Drop the absorbed neighbours and renumber before the next pass
        hap = hap.drop(absorbed).reset_index(drop=True)
    return hap

In [15]:
## For every chromosome and sample: write the inter-marker intervals, the break
## points, the raw and size-filtered haplotypes, the filtered genotypes, and a
## figure comparing raw and filtered genotypes.
for ch,chrom in enumerate(chrlist):
    clen = clens[ch]
    geno_ch = genodf[genodf.Chrom==chrom]
    segsc = segs#[s for s in segs if s not in baddies[ch]]
    for s,seg in enumerate(segsc):
        ## Drop empty markers
        sub = geno_ch[['Pos',seg]].dropna(axis=0,how='any').reset_index(drop=True)
        ## An inter-marker interval needs two markers; with fewer the interval
        ## table is empty and haplotypes() cannot read its first and last row.
        if len(sub) < 2:
            continue
        subpos = sub.Pos.tolist()
        subgt = sub[seg].tolist()

        ## Make IMI dataframe: one row per pair of adjacent markers
        subimi = pd.DataFrame()
        subimi['v'] = subpos[:-1]
        subimi['w'] = subpos[1:]
        subimi['X'] = np.abs(np.diff(subgt)) ## non-zero where the genotype changes
        subimi['M'] = (subimi.v + subimi.w) / 2.0 ## Midpoint Method
        subimi.to_csv(FILECHRPATHS[ch][0] +  seg + '/' + seg + '-IMI' + '.csv',
                      index=False)

        ## Find break pts dataframe.
        bp = subimi[subimi.X != 0].dropna(axis=0,how='any')
        bp.to_csv(FILECHRPATHS[ch][1]+ seg + '/' + seg  + '-breakpt' + '.csv',
                  index=False)

        ## Make haplotype dataframe.
        hapdf = haplotypes(subimi,clen,method=Method)
        ## Most common genotype among the markers inside each haplotype.
        ## np.atleast_1d accepts both return shapes of scipy.stats.mode:
        ## a length-1 array (older SciPy) and a scalar (SciPy >= 1.11).
        hapdf['genotype'] = [
            np.atleast_1d(ss.mode(sub[seg][(sub.Pos >= hv) & (sub.Pos <= hw)])[0])[0]
            for hv,hw in zip(hapdf.v,hapdf.w)]
        hapdf.to_csv(FILECHRPATHS[ch][2] +  seg + '/' + seg +'-'+ Method +'-haplotype' +'-k%s'%0+ '.csv',
                     index=False)

        ## Filter Haplotype dataframe.
        hap = haplotype_filter(hapdf,k1)
        hap.to_csv(FILECHRPATHS[ch][2]+ seg + '/' + seg +'-'+ Method +'-haplotype' +'-k%s'%k1+ '.csv',
                   index=False)

        ## Generate filtered chromosomal positions and genotypes:
        ## every marker inside a filtered haplotype takes that haplotype's genotype
        tempnewpos = [[pos for pos in subpos if pos <= hap.w[i] and
                       pos >= hap.v[i]] for i in hap.index.tolist()]
        newgt = np.concatenate([[hap.genotype[K] for p in tempnewpos[K]]
                                for K in hap.index.tolist()])
        newpos = np.concatenate(tempnewpos)
        assert len(newgt) == len(newpos)

        ## Save the filtered genotype data
        newseg = pd.DataFrame(columns=['Pos',seg])
        newseg.Pos = newpos
        newseg[seg] = newgt
        newseg.to_csv(FILECHRPATHS[ch][3]+ seg + '/' + seg +'-'+ Method + '-Geno' + '-k%s'%k1+ '.csv',
                      index=False)

        ## Visualize haplotypes: raw genotypes on the lower track, filtered
        ## genotypes on the upper track, centromere shaded on both
        plt.plot(subpos,np.array(subgt)*mod,'.',color=snpcolor,alpha=alvl)
        plt.plot(newpos,newgt*mod + ymod,'.',color=snpcolor,alpha=alvl)
        plt.annotate('Raw: %s'%len(hapdf),xy=(clen+(xlim1/xdi),0))
        plt.annotate('Filterd: %s'%len(hap),xy=(clen+(xlim1/xdi),ymod))
        cen_span = [(centromere[0][ch],centromere[1][ch]-centromere[0][ch])]
        plt.broken_barh(cen_span,(ymod,mod),facecolors=centcolor,alpha =alvl)
        plt.broken_barh(cen_span,(0,mod),facecolors=centcolor,alpha =alvl)
        plt.ylim(yla,ylb)
        plt.xlim(xlim0,clen+xlim1)
        plt.title(seg + ' ' +str(chrom)+'-haplotype ' + '\nMethod: '+ Method +'\nk1: ' + '%s'%k1)
        plt.xlabel('Chromosomal Coordinates')
        plt.ylabel('Genotype')
        cur_axes = plt.gca()
        cur_axes.axes.get_yaxis().set_visible(False)
        plt.tight_layout()
        plt.savefig(FILECHRPATHS[ch][4] + seg + '/'+ seg + '-' +str(chrom)+'-hap-'+ Method +'-k1-%s'%k1+'.png',
                    dpi=100,bbox_inches='tight', facecolor='w')
        ## Close the figure so the next sample starts from an empty canvas
        plt.close()

## Make haplotypes for other filter sizes

In [None]:
## This assertion always fails; presumably a deliberate stop so that running the
## whole notebook halts here and the cells below are only run by hand.
assert 1 == 0

In [None]:
## Haplotype size cutoffs to try: 1000 up to 11000 in steps of 500
K = list(range(1000, 11500, 500))

In [None]:
## Re-filter the haplotypes of every chromosome and sample with each cutoff in K,
## re-using the inter-marker interval tables already written to disk.
for ch,chrom in enumerate(chrlist):
    clen = clens[ch]
    geno_ch = genodf[genodf.Chrom==chrom]
    ## `baddies` is not defined anywhere in this notebook, so the exclusion is
    ## left disabled here exactly as in the first pass.
    segsc = segs#[s for s in segs if s not in baddies[ch]]
    for s,seg in enumerate(segsc):
        ## Drop empty markers
        sub = geno_ch[['Pos',seg]].dropna(axis=0,how='any').reset_index(drop=True)
        ## Fewer than two markers give no inter-marker interval to build from
        if len(sub) < 2:
            continue
        ## Bring in imi dataframe
        subimi = pd.read_csv(FILECHRPATHS[ch][0] +  seg + '/' + seg + '-IMI' + '.csv')
        ## Make the unfiltered haplotype dataframe once; it is shared by all cutoffs
        hapdf = haplotypes(subimi,clen,method=Method)
        ## Most common genotype among the markers inside each haplotype.
        ## np.atleast_1d accepts both return shapes of scipy.stats.mode:
        ## a length-1 array (older SciPy) and a scalar (SciPy >= 1.11).
        hapdf['genotype'] = [
            np.atleast_1d(ss.mode(sub[seg][(sub.Pos >= hv) & (sub.Pos <= hw)])[0])[0]
            for hv,hw in zip(hapdf.v,hapdf.w)]
        ## Filter Haplotype dataframe for each filter size.
        ## NOTE: this rebinds the notebook-level k1; it ends as the last value in K.
        for k1 in K:
            hap = haplotype_filter(hapdf,k1)
            hap.to_csv(FILECHRPATHS[ch][2]+ seg + '/' + seg +'-'+ Method +'-haplotype' +'-k%s'%k1+ '.csv',
                       index=False)