In [2]:
import numpy as np, pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
## Load the phenotype spreadsheet for the B3502 progeny
data = '../PHENOTYPE/202011_WGS_B3502Progeny_for_QTL.xlsx'
pheno = pd.read_excel(data)

## Strain: the first three characters of the first column, underscores removed
pheno['Strain'] = [name.replace('_', '')[:3]
                   for name in pheno[pheno.columns[0]]]

## Parent: the text before the first underscore, with everything up to and
## including the last 'B3502' stripped off
pheno['Parent'] = [label.partition('_')[0].rpartition('B3502')[-1]
                   for label in pheno['Cross_Progeny']]

## Phenotype: copy the fourth column under a fixed name, then discard the original
raw_pheno_col = pheno.columns[3]
pheno['Phenotype'] = pheno[raw_pheno_col]
pheno = pheno.drop(columns=raw_pheno_col)

## Keep only the columns used below and preview the first rows
olddf = pheno.loc[:, ['Strain', 'Phenotype', 'Parent', 'Basidium']]
olddf.head()

Unnamed: 0,Strain,Phenotype,Parent,Basidium
0,A01,N,A1,1.0
1,A02,Y,A1,1.0
2,A03,N,A1,1.0
3,A04,L,A1,1.0
4,A05,N (with rare spots of T),A1,2.0


In [4]:
## List the distinct phenotype labels present in the old data, in sorted order
sorted(olddf['Phenotype'].unique())

['L', 'N', 'N (with rare spots of T)', 'T', 'Y']

In [5]:
## Read the spreadsheet of new samples
## NOTE(review): this is an absolute, user-specific path; the cell only runs
## where the file exists at exactly this location.
newpath = '/home/croth/Self-Filamentation_B3502_progeny/202105_Samples_for_Illumina_ForCullen.xlsx'
pheno_df = pd.read_excel(newpath)

## Copy the last column of the sheet into a column named 'Phenotype'
pheno_df = pheno_df.assign(Phenotype=pheno_df[pheno_df.columns[-1]])

## Keep the strain and phenotype columns and preview the first rows
newdf = pheno_df.loc[:, ['Strain', 'Phenotype']]
newdf.head()

Unnamed: 0,Strain,Phenotype
0,NIH12,N
1,NIH433,N
2,I641,N
3,I643,N
4,I645,N


In [6]:
## List the distinct phenotype labels present in the new data, in sorted order
sorted(newdf['Phenotype'].unique())

['L', 'N', 'N (with rare spots of T)', 'T']

In [7]:
## Hand-entered phenotypes for the parental strains, one row per strain
parents = pd.DataFrame({
    'Strain': ['B3502_A1', 'B3502_B1', 'B3502_B7', 'CF830'],
    'Phenotype': ['Y', 'N', 'Y', 'N'],
})
parents

Unnamed: 0,Strain,Phenotype
0,B3502_A1,Y
1,B3502_B1,N
2,B3502_B7,Y
3,CF830,N


In [8]:
## Stack the old progeny, new samples, and parents into one table with a
## fresh 0..n-1 index; columns missing from a source are filled with NaN
newpheno = pd.concat([olddf, newdf, parents], ignore_index=True)
newpheno.head()

Unnamed: 0,Strain,Phenotype,Parent,Basidium
0,A01,N,A1,1.0
1,A02,Y,A1,1.0
2,A03,N,A1,1.0
3,A04,L,A1,1.0
4,A05,N (with rare spots of T),A1,2.0


In [9]:
## Preview the last five rows of the combined table
newpheno.tail(n=5)

Unnamed: 0,Strain,Phenotype,Parent,Basidium
81,B_33,L,,
82,B3502_A1,Y,,
83,B3502_B1,N,,
84,B3502_B7,Y,,
85,CF830,N,,


In [10]:
## Shape of the array of distinct strain names in the combined table
newpheno['Strain'].unique().shape

(86,)

In [11]:
## Write the combined phenotype table to CSV, leaving out the row index
newpheno.to_csv(
    '../PHENOTYPE/Cellsize_shape_phenotype.csv',
    index=False,
)

In [10]:
## Collect the distinct phenotype labels in sorted order; these become the
## keys of the label -> numeric code dictionary
unipheno = sorted(newpheno['Phenotype'].unique())
unipheno

['L', 'N', 'N (with rare spots of T)', 'T', 'Y']

In [11]:
## Make dictionary mapping each phenotype label to its numeric code.
## The codes are keyed by label explicitly instead of being zipped against the
## sorted list of labels: with a positional array, adding or removing a single
## phenotype would silently shift codes onto the wrong labels (zip also
## truncates to the shorter input without any error).
phenotype_codes = {'L': 3.0,
                   'N': 0.0,
                   'N (with rare spots of T)': 0.0,
                   'T': 2.0,
                   'Y': 1.0}

## Fail loudly if the data contain a phenotype that has no numeric code
unmapped = [p for p in unipheno if p not in phenotype_codes]
if unmapped:
    raise KeyError('No numeric code defined for phenotype(s): %s' % unmapped)

## Numeric codes in the same order as unipheno, then the label -> code map
npheno = np.array([phenotype_codes[p] for p in unipheno], dtype=float)
pdict = dict(zip(unipheno, npheno))
pdict

{'L': 3.0, 'N': 0.0, 'N (with rare spots of T)': 0.0, 'T': 2.0, 'Y': 1.0}