# Generate arlequin input file

Input files are structured into two main sections with additional subsections that must
appear in the following order:

1) Profile section (mandatory)

2) Data section (mandatory)

    2a) Haplotype list (optional)

    2b) Distance matrices (optional)
    
    2c) Samples (mandatory)
    
    2d) Genetic structure (optional)
    
    2e) Mantel tests (optional)

In [1]:
entry=''

## Profile section

In [53]:
header="[Profile]\n"
title=''
nsamples=41
GenotypicData=0
LocusSeparator='WHITESPACE'
DataType='FREQUENCY'

################################
lines=[header,'\t\t'+'Title="'+title+'"','\t\tNbSamples='+str(nsamples),'\t\tGenotypicData='+str(GenotypicData),'\t\tLocusSeparator='+str(LocusSeparator),'\t\tDataType='+DataType]


profile_section='\n'.join(lines)
entry+=profile_section+'\n'
print(profile_section)

[Profile]

		Title=""
		NbSamples=41
		GenotypicData=0
		LocusSeparator=WHITESPACE
		DataType=FREQUENCY


## Data section

In [54]:
header_dat='[Data]\n'
entry+=header_dat+'\n'


### Distance matrix

In [55]:
import pandas as pd
import numpy as np

file='arrays_with_at_least_5percent_in_reads'
dist=pd.read_csv(file+'.dist',header=0,sep='\t')
   
header_dist='\t[[DistanceMatrix]]\n'
MatrixName="A given distance matrix"
MatrixSize= len(dist)
LabelPosition='LINE'
MatrixDatafile=file+'.dis'

### write triangular distance matrix
dist = dist.where(np.tril(np.ones(dist.shape)).astype(np.bool))
dist_str = ['\t'.join(map(str,list(filter(lambda x: np.isfinite(x), np_array))+[0])) for np_array in dist.values]
dist_str=['\t'.join(map(str,dist.columns))]+dist_str
dist_str
with open(MatrixDatafile, 'w') as the_file:
    the_file.write('\n'.join(dist_str))
    
#########################

lines=[header_dist,'\t\tMatrixName="'+MatrixName+'"','\t\tMatrixSize='+str(MatrixSize),'\t\tLabelPosition='+LabelPosition,'\t\tMatrixData=EXTERN "'+MatrixDatafile+'"']

dist_section='\n'.join(lines)
print(dist_section)
entry+=dist_section+'\n'


	[[DistanceMatrix]]

		MatrixName="A given distance matrix"
		MatrixSize=10
		LabelPosition=LINE
		MatrixData=EXTERN "arrays_with_at_least_5percent_in_reads.dis"


### Samples

In [67]:
file='arrays_with_at_least_5percent_in_reads'
pop=pd.read_csv(file+'.popcounts',header=0,sep='\t',index_col=0)

header_dist='\t[[Samples]]\n'

sample_section=header_dist+'\n'
for col in pop.columns:
    sample_section+='\t\tSampleName="'+col+'"'+'\n'+'\t\tSampleSize='+str(sum(pop[col][pop[col]>0]))+'\n\t\tSampleData= {\n'
#     values=(pop[col][pop[col]>0].to_string())+'\n}\n\n'
    subset=pop[col][pop[col]>0]
    values='\t\t\t'+'\n\t\t\t'.join([' '.join(map(str,[pop.index[i],subset.iloc[i]])) for i in range(len(subset))])

    sample_section+=values+'\n\t\t}\n\n'
    
print(sample_section)
entry+=sample_section+'\n'

	[[Samples]]

		SampleName="R01L-10-T"
		SampleSize=212
		SampleData= {
			0 208
			1 2
			2 1
			3 1
		}

		SampleName="R01L-12-T"
		SampleSize=48
		SampleData= {
			0 39
			1 2
			2 7
		}

		SampleName="R02H-7-T"
		SampleSize=254
		SampleData= {
			0 1
			1 240
			2 2
			3 2
			4 1
			5 3
			6 5
		}

		SampleName="R02H-8-T"
		SampleSize=87
		SampleData= {
			0 84
			1 1
			2 1
			3 1
		}

		SampleName="R02H-8-T_bis"
		SampleSize=170
		SampleData= {
			0 143
			1 22
			2 5
		}

		SampleName="R02H-9-T"
		SampleSize=127
		SampleData= {
			0 57
			1 1
			2 1
			3 3
			4 62
			5 3
		}

		SampleName="R07B-1-Ta"
		SampleSize=195
		SampleData= {
			0 125
			1 69
			2 1
		}

		SampleName="R07B-1-Tb"
		SampleSize=251
		SampleData= {
			0 159
			1 1
			2 88
			3 3
		}

		SampleName="R07B-1-Tc"
		SampleSize=306
		SampleData= {
			0 303
			1 3
		}

		SampleName="R07B-1-Td"
		SampleSize=263
		SampleData= {
			0 217
			1 28
			2 18
		}

		SampleName="R07B-2-Tb"
		SampleSize=128
		SampleData= {
			0

In [68]:
entry=profile_section+'\n'+header_dat+'\n'+dist_section+'\n\n'+sample_section+'\n'
with open(file+'.arptemplate', 'w') as the_file:
    the_file.write(entry)

## Structure

In [69]:
### Structure by Site
import shutil

meta=pd.read_csv('../CRISPR_metadata.txt',sep='\t')
meta['Sample']=['_'.join(val.split('_')[1:]) for val in meta['Sample'].values]
meta

what='Site'
header_structure='\t[[Structure]]\n'
StructureName='Structured by '+what
NbGroups=len(set(meta[what].values))
structure_section=header_structure+'\n'+'StructureName="'+StructureName+'"\n'+'NbGroups='+str(NbGroups)+'\n'

for grp in set(meta[what].values):
    list_of_samples=meta[meta[what]==grp].Sample.values
    group='Group={\n'+'\n'.join(['"'+sam+'"' for sam in list_of_samples])+'\n}\n'
    structure_section+=group
print(structure_section)

shutil.copy(file+'.arptemplate', file+'_'+what+'.arp')
with open(file+'_'+what+'.arp', 'a') as the_file:
    the_file.write(structure_section+'\n')

	[[Structure]]

StructureName="Structured by Site"
NbGroups=9
Group={
"R13L-3-T"
"R13L-4-T"
"R13L-5-T"
}
Group={
"R15H-1-T"
"R15H-2-T"
"R15H-2-T_bis"
"R15H-3-T"
"R15H-3-T_old"
"R15H-6-T"
"R15H-7-T"
}
Group={
"R14B-2-T"
"R14B-4-T"
}
Group={
"R01L-10-T"
"R01L-12-T"
}
Group={
"R16L-1-T"
"R16L-2-T"
"R16L-3-T"
"R16L-4-T"
"R16L-5-T"
}
Group={
"R02H-7-T"
"R02H-8-T_bis"
"R02H-8-T"
"R02H-9-T"
}
Group={
"R07B-1-Ta"
"R07B-1-Tb"
"R07B-1-Tc"
"R07B-1-Td"
"R07B-2-Tb"
"R07B-2-Tc"
"R07B-3-T"
}
Group={
"R09L-2-Ta"
"R09L-2-Tb"
"R09L-2-Tc"
"R09L-3-T"
"R09L-4-Ta"
"R09L-4-Tb"
"R09L-4-Tc"
}
Group={
"R08H-1-T"
"R08H-2-Ta"
"R08H-2-Tb"
"R08H-2-Tc"
"R08H-3-Ta"
"R08H-3-Tb"
"R08H-3-Tc"
}



In [64]:
### Structure by Region
import shutil

meta=pd.read_csv('../CRISPR_metadata.txt',sep='\t')
meta['Sample']=['_'.join(val.split('_')[1:]) for val in meta['Sample'].values]
meta

what='Region'
header_structure='\t[[Structure]]\n'
StructureName='Structured by '+what
NbGroups=len(set(meta[what].values))
structure_section=header_structure+'\n'+'StructureName="'+StructureName+'"\n'+'NbGroups='+str(NbGroups)+'\n'

for grp in set(meta[what].values):
    list_of_samples=meta[meta[what]==grp].Sample.values
    group='Group={\n'+'\n'.join(['"'+sam+'"' for sam in list_of_samples])+'\n}\n'
    structure_section+=group
print(structure_section)

shutil.copy(file+'.arptemplate', file+'_'+what+'.arp')
with open(file+'_'+what+'.arp', 'a') as the_file:
    the_file.write(structure_section+'\n')

	[[Structure]]

StructureName="Structured by Region"
NbGroups=3
Group={
"R13L-3-T"
"R13L-4-T"
"R13L-5-T"
"R14B-2-T"
"R14B-4-T"
}
Group={
"R01L-10-T"
"R01L-12-T"
"R02H-7-T"
"R02H-8-T_bis"
"R02H-8-T"
"R02H-9-T"
"R07B-1-Ta"
"R07B-1-Tb"
"R07B-1-Tc"
"R07B-1-Td"
"R07B-2-Tb"
"R07B-2-Tc"
"R07B-3-T"
"R08H-1-T"
"R08H-2-Ta"
"R08H-2-Tb"
"R08H-2-Tc"
"R08H-3-Ta"
"R08H-3-Tb"
"R08H-3-Tc"
"R09L-2-Ta"
"R09L-2-Tb"
"R09L-2-Tc"
"R09L-3-T"
"R09L-4-Ta"
"R09L-4-Tb"
"R09L-4-Tc"
}
Group={
"R15H-1-T"
"R15H-2-T"
"R15H-2-T_bis"
"R15H-3-T"
"R15H-3-T_old"
"R15H-6-T"
"R15H-7-T"
"R16L-1-T"
"R16L-2-T"
"R16L-3-T"
"R16L-4-T"
"R16L-5-T"
}



In [63]:
### Structure by Region
import shutil

meta=pd.read_csv('../CRISPR_metadata.txt',sep='\t')
meta['Sample']=['_'.join(val.split('_')[1:]) for val in meta['Sample'].values]
meta

what='Flow'
header_structure='\t[[Structure]]\n'
StructureName='Structured by '+what
NbGroups=len(set(meta[what].values))
structure_section=header_structure+'\n'+'StructureName="'+StructureName+'"\n'+'NbGroups='+str(NbGroups)+'\n'

for grp in set(meta[what].values):
    list_of_samples=meta[meta[what]==grp].Sample.values
    group='Group={\n'+'\n'.join(['"'+sam+'"' for sam in list_of_samples])+'\n}\n'
    structure_section+=group
print(structure_section)

shutil.copy(file+'.arptemplate', file+'_'+what+'.arp')
with open(file+'_'+what+'.arp', 'a') as the_file:
    the_file.write(structure_section+'\n')

	[[Structure]]

StructureName="Structured by Flow"
NbGroups=3
Group={
"R01L-10-T"
"R01L-12-T"
"R09L-2-Ta"
"R09L-2-Tb"
"R09L-2-Tc"
"R09L-3-T"
"R09L-4-Ta"
"R09L-4-Tb"
"R09L-4-Tc"
"R16L-1-T"
"R16L-2-T"
"R16L-3-T"
"R16L-4-T"
"R16L-5-T"
}
Group={
"R02H-7-T"
"R02H-8-T_bis"
"R02H-8-T"
"R02H-9-T"
"R08H-1-T"
"R08H-2-Ta"
"R08H-2-Tb"
"R08H-2-Tc"
"R08H-3-Ta"
"R08H-3-Tb"
"R08H-3-Tc"
"R13L-3-T"
"R13L-4-T"
"R13L-5-T"
"R15H-1-T"
"R15H-2-T"
"R15H-2-T_bis"
"R15H-3-T"
"R15H-3-T_old"
"R15H-6-T"
"R15H-7-T"
}
Group={
"R07B-1-Ta"
"R07B-1-Tb"
"R07B-1-Tc"
"R07B-1-Td"
"R07B-2-Tb"
"R07B-2-Tc"
"R07B-3-T"
"R14B-2-T"
"R14B-4-T"
}

