In [1]:
## Set mods
import numpy as np, pandas as pd, glob
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
## Load the H99 chromosome map (columns shown below: Chrom, Seqid, Length, Cumpos, Midpts)
## NOTE(review): the final row of the file is discarded here -- presumably a
## non-chromosomal / summary entry; confirm against the CSV.
chrommappath = '../../NOTES/H99_chrommap.csv'
chrommap = pd.read_csv(chrommappath).iloc[:-1]
chrommap.head()

Unnamed: 0,Chrom,Seqid,Length,Cumpos,Midpts
0,1,CP003820.1,2291499,0,1145749.5
1,2,CP003821.1,1621675,2291499,3102336.5
2,3,CP003822.1,1575141,3913174,4700744.5
3,4,CP003823.1,1084805,5488315,6030717.5
4,5,CP003824.1,1814975,6573120,7480607.5


In [3]:
## Read the crossover table produced for the previous crosses
oldpath = '../../NOTES/CnCd-GTC-co-anova-df-min-haplotype-k6000-Jan2020.csv'
cncdco = pd.read_csv(oldpath)

## List the distinct cross labels already present, in ascending order
crosses = sorted(cncdco['Cross'].unique())
crosses

[1, 2, 3]

In [4]:
## View the first rows of the previous crossover table
## (columns: Chrom, Chrlen, Cross, N, Seg)
cncdco.head()

Unnamed: 0,Chrom,Chrlen,Cross,N,Seg
0,1,2291717,3,2.0,S11
1,1,2291717,3,3.0,S17
2,1,2291717,3,2.0,S18
3,1,2291717,3,2.0,S19
4,1,2291717,3,3.0,S2


In [5]:
## View the last rows of the previous crossover table
cncdco.tail()

Unnamed: 0,Chrom,Chrlen,Cross,N,Seg
1591,14,762695,2,3.0,SS-C242
1592,14,762695,2,4.0,SS-C271
1593,14,762695,2,1.0,SS-C272
1594,14,762695,2,4.0,SS-C290
1595,14,762695,2,2.0,SS-C291


In [6]:
## Collect the k6000 haplotype CSVs (one per chromosome directory and PMY sample),
## then report how many were found and show the first two
file_path_name = './FILES/CHR*/HAP/PMY*/*-k6000.csv'
file_paths = glob.glob(file_path_name)
file_paths.sort()
len(file_paths), file_paths[:2]

(4508,
 ['./FILES/CHR1/HAP/PMY2556/PMY2556-min-haplotype-k6000.csv',
  './FILES/CHR1/HAP/PMY2558/PMY2558-min-haplotype-k6000.csv'])

In [7]:
## Label for the new cross: one more than the largest existing cross label
newcross = max(crosses) + 1

In [8]:
## Gather info for the new cross: one record per haplotype file
co_records = []
for hap_path in file_paths:

    ## Paths look like ./FILES/CHR<n>/HAP/<sample>/<file>.csv
    path_parts = hap_path.split('/')

    ## Chromosome number parsed from the CHR<n> directory name
    chrom = int(path_parts[2].split('CHR')[-1])

    ## Length is read from map row chrom-1
    ## (assumes the map rows are in chromosome order -- TODO confirm)
    chrlen = chrommap.loc[chrom-1,'Length']

    ## Sample name is the directory that holds the file
    seg = path_parts[4]

    ## N is the number of rows in the haplotype table minus one
    n_co = float(len(pd.read_csv(hap_path)) - 1)

    co_records.append((chrom,chrlen,newcross,n_co,seg))

## Same column layout as the previous crossover table
coinfo = pd.DataFrame(co_records,columns=cncdco.columns)
coinfo.tail()

Unnamed: 0,Chrom,Chrlen,Cross,N,Seg
4503,9,1186808,4,1.0,PMY2936
4504,9,1186808,4,1.0,PMY2937
4505,9,1186808,4,1.0,PMY2938
4506,9,1186808,4,1.0,PMY2939
4507,9,1186808,4,3.0,PMY2940


In [9]:
## Check the parents, Bt22 and Ftc555-1
parent_pmy = ['PMY2649', 'PMY2650']
is_parent = coinfo.Seg.isin(parent_pmy)
parent_ix = coinfo.loc[is_parent].index

## The N values of the parental rows must sum to zero
parent_rows = coinfo.loc[parent_ix,:]
assert parent_rows.N.sum() == 0

parent_rows.sort_values(['Chrom','Seg']).head()

Unnamed: 0,Chrom,Chrlen,Cross,N,Seg
65,1,2291499,4,0.0,PMY2649
66,1,2291499,4,0.0,PMY2650
1997,2,1621675,4,0.0,PMY2649
1998,2,1621675,4,0.0,PMY2650
2319,3,1575141,4,0.0,PMY2649


In [10]:
## Drop parents
## errors='ignore' skips labels that are already gone, so re-running this cell
## is a no-op. This replaces the earlier guard, which checked only the smallest
## parent label and would drop nothing if just that one row were missing.
coinfo = coinfo.drop(index=parent_ix, errors='ignore')

In [11]:
## Stack the new cross underneath the previous crossover table
## (row labels from both frames are kept as-is, so labels may repeat)
## NOTE(review): the displayed old rows list Chrlen 2291717 for chromosome 1,
## while the new rows list 2291499 -- confirm the differing lengths are intended.
newco = pd.concat([cncdco, coinfo])

## Exactly one cross label should have been added
assert newco['Cross'].unique().size == 1 + len(crosses)

In [12]:
## Add species vector
def _species_from_seg(seg):
    """Return the species code for a sample name.

    Names starting with 'S' map to 'cd' when they contain exactly one 'S-'
    and to 'cn' when they contain none; names starting with 'PMY' map to
    'cn'. Anything else returns the sentinel -1.
    """
    n_sep = seg.count('S-')
    if seg[0] == 'S':
        if n_sep == 1:
            return 'cd'
        if n_sep == 0:
            return 'cn'
    if seg.startswith('PMY'):
        return 'cn'
    return -1

## One label per row, in row order
species = [_species_from_seg(seg) for seg in newco.Seg]

newco['Species'] = species
newco.head()

Unnamed: 0,Chrom,Chrlen,Cross,N,Seg,Species
0,1,2291717,3,2.0,S11,cn
1,1,2291717,3,3.0,S17,cn
2,1,2291717,3,2.0,S18,cn
3,1,2291717,3,2.0,S19,cn
4,1,2291717,3,3.0,S2,cn


In [13]:
## Check work, view tail
## Exactly two species labels, and no -1 sentinel left over
assert newco.Species.unique().size == 2
assert not (newco.Species == -1).any()

## Row count = old rows + one per haplotype file - dropped parent rows
assert len(newco) == len(cncdco) + len(file_paths) - len(parent_ix)
newco.tail()

Unnamed: 0,Chrom,Chrlen,Cross,N,Seg,Species
4503,9,1186808,4,1.0,PMY2936,cn
4504,9,1186808,4,1.0,PMY2937,cn
4505,9,1186808,4,1.0,PMY2938,cn
4506,9,1186808,4,1.0,PMY2939,cn
4507,9,1186808,4,3.0,PMY2940,cn


In [15]:
## Write the combined table to disk without the row labels
newpath = '../../NOTES/CnCd-GTC-co-anova-df-min-haplotype-k6000-Jan2021.csv'
newco.to_csv(newpath, index=False)