## Making a file that connects the Kruglyak-style segregant names to the names used in Jerison et al. 2017 and beyond (and checking it twice)

In [4]:
import pandas as pd
import numpy as np

In [20]:
# Kruglyak-style BYxRM genotype table, loaded as-is from the accessory files.
krug_geno_path = '../accessory_files/BYxRM_GenoData.csv'
krug_genos = pd.read_csv(krug_geno_path)
krug_genos

Unnamed: 0,marker,A01_01,A01_02,A01_03,A01_04,A01_05,A01_06,A01_07,A01_08,A01_09,...,A11_87,A11_88,A11_89,A11_90,A11_91,A11_92,A11_93,A11_94,A11_95,A11_96
0,27915_chr01_27915_T_C,R,B,R,R,B,B,B,B,B,...,R,R,R,B,R,R,B,B,R,B
1,28323_chr01_28323_G_A,R,B,R,R,B,B,B,B,B,...,R,R,R,B,R,R,B,B,R,B
2,28652_chr01_28652_G_T,R,B,R,R,B,B,B,B,B,...,R,R,R,B,R,R,B,B,R,B
3,29667_chr01_29667_C_A,R,B,R,R,B,B,B,B,B,...,R,R,R,B,R,R,B,B,R,B
4,30756_chr01_30756_C_G,R,B,R,R,B,B,B,B,B,...,R,R,R,B,R,R,B,B,R,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11618,12052353_chr16_929518_C_T,B,R,B,R,R,B,R,B,R,...,B,R,R,B,R,B,B,R,R,R
11619,12052559_chr16_929724_A_G,B,R,B,R,R,B,R,B,R,...,B,R,R,B,R,B,B,R,R,R
11620,12053380_chr16_930545_A_T,B,R,R,R,R,B,R,B,R,...,B,R,R,B,R,B,B,R,R,R
11621,12054124_chr16_931289_T_C,B,R,R,R,R,B,R,B,R,...,B,R,R,B,R,B,B,R,R,R


In [40]:
def parse_line(line):
    """Split one line of the Jerison CSV into a flat list of string fields.

    The line is stripped and split on commas; the first comma-separated
    field is further split on ';' and the remaining fields are appended
    unchanged.
    """
    first_field, *remaining_fields = line.strip().split(',')
    return first_field.split(';') + remaining_fields

# Read the Jerison et al. 2017 supplementary genotype file by hand: the first
# line supplies the column names, and every later line is one row whose first
# field is kept as a string and whose remaining fields are converted to int.
with open('./elife-27167-supp1-v2_fromJerison2017.csv', 'r') as infile:
    header_line = parse_line(infile.readline())
    mat = []
    for fields in map(parse_line, infile):
        row_label, *calls = fields
        mat.append([row_label, *map(int, calls)])

jerison_genos = pd.DataFrame(mat, columns=header_line)
jerison_genos

Unnamed: 0,segregant,27915_chr01_27915_T_C,28652_chr01_28652_G_T,31059_chr01_31059_G_A,31636_chr01_31636_T_C,31756_chr01_31756_C_T,32884_chr01_32884_C_T,34171_chr01_34171_T_A,34944_chr01_34944_C_G,35566_chr01_35566_A_G,...,12042496_chr16_919661_G_A,12044470_chr16_921635_T_C,12044676_chr16_921841_G_A,12045128_chr16_922293_T_C,12045308_chr16_922473_T_C,12046925_chr16_924090_G_A,12049012_chr16_926177_C_T,12050738_chr16_927903_A_C,12050938_chr16_928103_C_T,12052353_chr16_929518_C_T
0,LK1-A02,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
1,LK1-A05,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
2,LK1-A06,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,LK1-A07,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
4,LK1-A09,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,LK6-E01,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
226,LK6-E02,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
227,LK6-E04,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
228,LK6-E11,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [15]:
def change_well_format(w):
    """Convert a Kruglyak-style well name to the 'LK<plate>-<row><col>' form.

    For a name such as 'A01_02', characters 1-2 give the plate number and the
    part after the underscore gives a 1-based well number. Wells are laid out
    12 per row, with rows lettered A-H, so 'A01_02' becomes 'LK1-A02' and
    'A06_49' becomes 'LK6-E01'. Names without an underscore are returned
    unchanged.
    """
    if '_' not in w:
        return w
    plate_number = int(w[1:3])
    well_number = int(w.split('_')[1])
    # Zero-based row and column of the well within a 12-column plate.
    row_index, col_index = divmod(well_number - 1, 12)
    row_letter = 'ABCDEFGH'[row_index]
    return 'LK' + str(plate_number) + '-' + row_letter + str(col_index + 1).zfill(2)

# Every column of the Kruglyak table except 'marker' is a segregant name.
kruglyak_seg_names = [i for i in krug_genos if i != 'marker']
# Not all of these segregants were actually used in the Jerison experiment, so
# any converted name that is absent from the Jerison table is recorded as the
# string 'NA'. The lookup set is built once rather than once per segregant.
jerison_segregants_used = set(jerison_genos['segregant'])
jerison_seg_names = [
    converted if converted in jerison_segregants_used else 'NA'
    for converted in map(change_well_format, kruglyak_seg_names)
]
name_df = pd.DataFrame({'Kruglyak_name': kruglyak_seg_names, 'Jerison_name': jerison_seg_names})
name_df

Unnamed: 0,Kruglyak_name,Jerison_name
0,A01_01,
1,A01_02,LK1-A02
2,A01_03,
3,A01_04,
4,A01_05,LK1-A05
...,...,...
1003,A11_92,
1004,A11_93,
1005,A11_94,
1006,A11_95,


In [41]:
# Transpose the Kruglyak table so each row is a segregant and each column a
# marker, then attach the Jerison-style name for every segregant.
krug_by_segregant = (
    krug_genos.set_index('marker')
    .T
    .reset_index()
    .rename(columns={'index': 'Kruglyak_name'})
)
merger = krug_by_segregant.merge(name_df, on='Kruglyak_name', how='left')
# Recode the B/R genotype calls as 0/1 to match the Jerison table's encoding.
merger = merger.replace({'B': 0, 'R': 1})
# Default (inner) merge: only segregants present in the Jerison table remain.
# Marker columns found in both tables get pandas' '_x' / '_y' suffixes.
merger = merger.merge(jerison_genos, left_on='Jerison_name', right_on='segregant')
merger

Unnamed: 0,Kruglyak_name,27915_chr01_27915_T_C_x,28323_chr01_28323_G_A,28652_chr01_28652_G_T_x,29667_chr01_29667_C_A,30756_chr01_30756_C_G,31059_chr01_31059_G_A_x,31213_chr01_31213_G_A,31636_chr01_31636_T_C_x,31756_chr01_31756_C_T_x,...,12042496_chr16_919661_G_A_y,12044470_chr16_921635_T_C_y,12044676_chr16_921841_G_A_y,12045128_chr16_922293_T_C_y,12045308_chr16_922473_T_C_y,12046925_chr16_924090_G_A_y,12049012_chr16_926177_C_T_y,12050738_chr16_927903_A_C_y,12050938_chr16_928103_C_T_y,12052353_chr16_929518_C_T_y
0,A01_02,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
1,A01_05,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
2,A01_06,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,A01_07,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
4,A01_09,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,A06_49,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
226,A06_50,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
227,A06_52,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
228,A06_59,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [42]:
# Shared columns: markers present in both genotype tables. The merge suffixes
# them with '_x' (left, Kruglyak) and '_y' (right, Jerison).
x_cols = [i for i in merger if i.endswith('_x')]
y_cols = [i for i in merger if i.endswith('_y')]
# Strip the two-character suffix to recover the bare marker names.
xs = [i[:-2] for i in x_cols]
# Derive ys from y_cols (not x_cols) so the overlap count below is a real check.
ys = [i[:-2] for i in y_cols]
y_markers = set(ys)
len(x_cols), len(y_cols), len([i for i in xs if i in y_markers])

(4561, 4561, 4561)

In [43]:
# Check that the shared columns are the same: count the markers whose '_x' and
# '_y' columns agree in every row of the merged table.
same = sum(
    bool((merger[marker + '_x'] == merger[marker + '_y']).all())
    for marker in xs
)

print(same)

4561


### Great, so when we use my old well-renaming code to rename the Kruglyak wells, the result agrees with the already-renamed Jerison data

In [47]:
# Save the Kruglyak-to-Jerison name table. With pandas' default index=True the
# row index is written as an unnamed first column of the CSV.
name_df.to_csv('Segregant_renaming.csv')