# Source: Soybase 
Data for 20,087 G. max and G. soja accessions genotyped with 42,509 SNPs (Wm82.a2)

scikit-allel is a Python package intended to enable exploratory analysis of large-scale genetic variation

# Import the libraries

In [1]:
import numpy as np
import scipy
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('white')
sns.set_style('ticks')
sns.set_context('notebook')
import h5py  # hdf5 file 
import allel;  # # import scikit-allel
print('scikit-allel', allel.__version__) # check which version is installed

scikit-allel 1.3.5


# Read vcf file using scikit-allel read_vcf()

In [2]:
# import scikit-allel
import allel
# check which version is installed
print(allel.__version__)

1.3.5


In [3]:
callset = allel.read_vcf('soysnp50k_wm82.a2_41317.vcf.gz')

In [4]:
sorted(callset.keys())

['calldata/GT',
 'samples',
 'variants/ALT',
 'variants/CHROM',
 'variants/FILTER_PASS',
 'variants/ID',
 'variants/POS',
 'variants/QUAL',
 'variants/REF']

In [7]:
# Access callset

#for key, value in callset.items():
#    print(key, value)

Arrays with keys beginning ‘calldata/’ come from the sample fields; keys beginning 'variants/' are coming from SNP fileds

# What's the datatype of each key (1d or 2d or 3d array)?

In [8]:
print('Genotypes:',callset['calldata/GT'].shape)
print('samples:',callset['samples'].shape)
print('Alt_allele:',callset['variants/ALT'].shape)
print('Chr #:',callset['variants/CHROM'].shape)
print('Filter:',callset['variants/FILTER_PASS'].shape)
print('SNP:',callset['variants/ID'].shape)
print('PhyPos:',callset['variants/POS'].shape)
print('Quality:','Qual:',callset['variants/QUAL'].shape)
print('Ref_allele:',callset['variants/REF'].shape)

Genotypes: (42195, 20087, 2)
samples: (20087,)
Alt_allele: (42195, 3)
Chr #: (42195,)
Filter: (42195,)
SNP: (42195,)
PhyPos: (42195,)
Quality: Qual: (42195,)
Ref_allele: (42195,)


# What values each key contains?

In [9]:
geno=callset['calldata/GT'] 
geno[:1]

array([[[ 1,  1],
        [ 0,  0],
        [-1, -1],
        ...,
        [ 0,  0],
        [ 0,  0],
        [ 0,  1]]], dtype=int8)

In [10]:
# ‘samples’ array contains samples (extracted from the header line in the VCF file)
samples=callset['samples']
samples

array(['PI86046', 'PI90208', 'PI219698', ..., 'PI587906', 'PI587946',
       'PI603516'], dtype=object)

In [11]:
Alt_allele=callset['variants/ALT']
Alt_allele

array([['G', '', ''],
       ['T', '', ''],
       ['G', '', ''],
       ...,
       ['T', '', ''],
       ['T', '', ''],
       ['A', '', '']], dtype=object)

In [12]:
Chr_num=callset['variants/CHROM']
Chr_num

array(['Chr01', 'Chr01', 'Chr01', ..., 'scaffold_759', 'scaffold_843',
       'scaffold_91'], dtype=object)

In [13]:
Filter=callset['variants/FILTER_PASS']
Filter

array([False, False, False, ..., False, False, False])

In [14]:
SNP=callset['variants/ID']
SNP

array(['ss715578788', 'ss715578818', 'ss715578923', ..., 'ss715623959',
       'ss715584606', 'ss715633065'], dtype=object)

In [15]:
Phy_pos=callset['variants/POS']
Phy_pos

array([24952, 26003, 29671, ...,  4974,  3015, 24874])

In [16]:
Quality=callset['variants/QUAL']
Quality

array([nan, nan, nan, ..., nan, nan, nan], dtype=float32)

In [17]:
Ref_allele=callset['variants/REF']
Ref_allele

array(['A', 'C', 'A', ..., 'C', 'C', 'G'], dtype=object)

# vcf_to_dataframe

The vcf_to_dataframe() function extracts all data except samples and genotyping calls from a VCF and loads into a df.

In [97]:
df1 = allel.vcf_to_dataframe('soysnp50k_wm82.a2_41317.vcf.gz')
df1.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT_1,ALT_2,ALT_3,QUAL,FILTER_PASS
0,Chr01,24952,ss715578788,A,G,,,,False
1,Chr01,26003,ss715578818,C,T,,,,False
2,Chr01,29671,ss715578923,A,G,,,,False
3,Chr01,30712,ss715578960,G,A,,,,False
4,Chr01,37018,ss715579193,C,T,,,,False


In [98]:
df1.columns

Index(['CHROM', 'POS', 'ID', 'REF', 'ALT_1', 'ALT_2', 'ALT_3', 'QUAL',
       'FILTER_PASS'],
      dtype='object')

# Required columns from df1

In [106]:
df1_data=df1[['ID','CHROM', 'POS', 'REF', 'ALT_1']]
df1_data.head()

Unnamed: 0,ID,CHROM,POS,REF,ALT_1
0,ss715578788,Chr01,24952,A,G
1,ss715578818,Chr01,26003,C,T
2,ss715578923,Chr01,29671,A,G
3,ss715578960,Chr01,30712,G,A
4,ss715579193,Chr01,37018,C,T


## Data Wrangling or modification

Since data type of each key values are different (1d, 2d, 3d arrays); converting all into 2D arrays for easy concatenation

### Convert Genotypes 3D to 2D array

scikit-allel has GenotypeArray() class, which adds some convenient functionality to an array of genotype calls. 

In [18]:
# Genotype array
genotypes=allel.GenotypeArray(geno)
genotypes

Unnamed: 0,0,1,2,3,4,...,20082,20083,20084,20085,20086,Unnamed: 12
0,1/1,0/0,./.,0/0,0/0,...,0/0,0/0,0/0,0/0,0/1,
1,1/1,0/0,./.,0/0,0/0,...,0/0,0/0,0/0,0/0,0/1,
2,1/1,0/0,0/1,0/0,0/0,...,0/0,0/0,0/0,0/0,0/1,
...,...,...,...,...,...,...,...,...,...,...,...,...
42192,0/0,0/0,0/0,0/0,1/1,...,0/0,0/0,0/0,1/1,./.,
42193,0/0,1/1,1/1,1/1,1/1,...,1/1,0/0,1/1,0/1,1/1,
42194,0/0,0/0,0/0,0/0,1/1,...,0/0,0/0,0/0,./.,0/0,


##### Allele count, i.e., count the number times each allele (0=reference, 1=first alternate, 2=second alternate, etc.) is observed for each variant

Note: notice that we have only 1 alternate allele

In [19]:
ac = genotypes.count_alleles()
ac

Unnamed: 0,0,1,Unnamed: 3
0,30113,9903,
1,29867,9835,
2,27974,11918,
...,...,...,...
42192,32071,7905,
42193,10712,29000,
42194,33638,4566,


### Reshape genotype array to view it as haplotypes by dropping the ploidy dimension

seems to_haplotypes() generates hapotype arrays with doubled the number of samples (from 20087 to 40174); each value (1/1) is split in to two columns (1 and 1). 

In [20]:
haps=genotypes.to_haplotypes() 
haps

Unnamed: 0,0,1,2,3,4,...,40169,40170,40171,40172,40173,Unnamed: 12
0,1,1,0,0,.,...,0,0,0,0,1,
1,1,1,0,0,.,...,0,0,0,0,1,
2,1,1,0,0,0,...,0,0,0,0,1,
...,...,...,...,...,...,...,...,...,...,...,...,...
42192,0,0,0,0,0,...,0,1,1,.,.,
42193,0,0,1,1,1,...,1,0,1,1,1,
42194,0,0,0,0,0,...,0,.,.,0,0,


In [76]:
# create dataframe of haplotypes by transposing the array (with out transposing, memory is too much and getting error)
haps_df=pd.DataFrame(haps.T)
haps_df.head()

# rows are samples, columns are variants/SNPs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,42185,42186,42187,42188,42189,42190,42191,42192,42193,42194
0,1,1,1,0,0,0,1,0,1,1,...,0,0,0,1,1,1,1,0,0,0
1,1,1,1,0,0,0,1,0,1,1,...,0,0,0,1,1,1,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,0,1,0
4,-1,-1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,0,1,0


In [25]:
# combining allele call for every two rows to reduce samples from 40174 to 20087
result = haps_df.groupby(np.arange(len(haps_df))//2).sum()
result.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,42185,42186,42187,42188,42189,42190,42191,42192,42193,42194
0,2,2,2,0,0,0,2,0,2,2,...,0,0,0,2,2,2,2,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,2,2,0,2,0
2,-2,-2,1,0,1,0,1,1,1,1,...,0,0,0,2,0,2,2,0,2,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,2,2,0,2,0
4,0,0,0,0,0,0,0,0,-2,0,...,0,0,0,2,0,2,2,2,2,2


Genotyping data calls: 0 reference allele, 2 alternate allele, 1 het, -2 consider as missing

In [88]:
# transpose the dataframe to keep samples as columns and SNPs as rows
result_df=result.T
result_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20077,20078,20079,20080,20081,20082,20083,20084,20085,20086
0,2,0,-2,0,0,0,0,0,2,0,...,0,0,2,0,0,0,0,0,0,1
1,2,0,-2,0,0,0,0,0,2,0,...,0,0,2,0,0,0,0,0,0,1
2,2,0,1,0,0,0,0,0,2,0,...,0,0,2,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,2,0,...,0,0,2,0,0,0,0,0,0,-2


# Add sample names as column headers to result_df

### samples array (header line from VCF file)

In [92]:
# sampels array
samples=callset['samples']
print(samples.size)
samples

20087


array(['PI86046', 'PI90208', 'PI219698', ..., 'PI587906', 'PI587946',
       'PI603516'], dtype=object)

In [125]:
# convert samples array to dataframe
df2 = pd.DataFrame(data = samples)
#df2.columns=['GEName']
df2.head()

Unnamed: 0,0
0,PI86046
1,PI90208
2,PI219698
3,PI253651A
4,PI347550A


In [126]:
# transpose samples dataframe
samples_df=df2.T
samples_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20077,20078,20079,20080,20081,20082,20083,20084,20085,20086
0,PI86046,PI90208,PI219698,PI253651A,PI347550A,PI398807,PI408055A,PI408069,PI408169A,PI408169B,...,PI574480B,PI578360,PI578362,PI639693,PI657626,PI634759,PI423967,PI587906,PI587946,PI603516


In [127]:
# column headers of dataframe
samples_df.columns = samples_df.iloc[0]
samples_df

Unnamed: 0,PI86046,PI90208,PI219698,PI253651A,PI347550A,PI398807,PI408055A,PI408069,PI408169A,PI408169B,...,PI574480B,PI578360,PI578362,PI639693,PI657626,PI634759,PI423967,PI587906,PI587946,PI603516
0,PI86046,PI90208,PI219698,PI253651A,PI347550A,PI398807,PI408055A,PI408069,PI408169A,PI408169B,...,PI574480B,PI578360,PI578362,PI639693,PI657626,PI634759,PI423967,PI587906,PI587946,PI603516


### Add sample names as column headers

In [129]:
genotypes_df=pd.DataFrame(data=result_df.values, columns=samples_df.iloc[0])
genotypes_df.head()

Unnamed: 0,PI86046,PI90208,PI219698,PI253651A,PI347550A,PI398807,PI408055A,PI408069,PI408169A,PI408169B,...,PI574480B,PI578360,PI578362,PI639693,PI657626,PI634759,PI423967,PI587906,PI587946,PI603516
0,2,0,-2,0,0,0,0,0,2,0,...,0,0,2,0,0,0,0,0,0,1
1,2,0,-2,0,0,0,0,0,2,0,...,0,0,2,0,0,0,0,0,0,1
2,2,0,1,0,0,0,0,0,2,0,...,0,0,2,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,2,0,...,0,0,2,0,0,0,0,0,0,-2


#### Genotyping data calls: 0 reference allele, 2 alternate allele, 1 het, -2 consider as missing



In [132]:
# count unique values of row
genotypes_df.iloc[1,:].value_counts()

 0    14761
 2     4745
 1      345
-2      236
Name: 1, dtype: int64

# QC genotyping data

### Do value_counts on each row, make some columns whose values are counts of each value

In [162]:
counts=genotypes_df.apply(pd.Series.value_counts, axis=1).fillna(0)
counts.head()

Unnamed: 0,-2,0,1,2
0,79.0,14963.0,187.0,4858.0
1,236.0,14761.0,345.0,4745.0
2,141.0,13915.0,144.0,5887.0
3,155.0,18053.0,54.0,1825.0
4,59.0,17246.0,174.0,2608.0


In [161]:
#counts=counts.rename(columns=data.rename(columns={'gdp':'log(gdp)'}, inplace=True))

AttributeError: 'DataFrame' object has no attribute 'dtype'

In [158]:
#counts_df=pd.DataFrame(counts, columns=['Missing', 'Ref_allele', 'Het', 'Alt_allele'])
#counts.head()

Unnamed: 0,-2,0,1,2
0,79.0,14963.0,187.0,4858.0
1,236.0,14761.0,345.0,4745.0
2,141.0,13915.0,144.0,5887.0
3,155.0,18053.0,54.0,1825.0
4,59.0,17246.0,174.0,2608.0


In [159]:
counts_df.columns

Index(['Missing', 'Ref_allele', 'Het', 'Alt_allele'], dtype='object')

In [160]:
counts_df.head()

Unnamed: 0,Missing,Ref_allele,Het,Alt_allele
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,


In [None]:
So far, I have samples as columns and SNPs as rows, then add other features. 

# Bring Minor allele frequency dataset

In [115]:
MAF=pd.read_csv('MAF_soy50K_SNPs.csv')
MAF.drop('maf_value.1', axis=1, inplace=True)
MAF=MAF.iloc[:,1:]
MAF.head()

Unnamed: 0,snp_name,dbSNP_ID,maf_type,maf_value,minor_allele
0,BARC_1.01_Gm01_2033_G_A,ss715578672,MAF in landrace,nd,nd
1,BARC_1.01_Gm01_2033_G_A,ss715578672,MAF in elite,nd,nd
2,BARC_1.01_Gm01_2033_G_A,ss715578672,MAF in G. max,nd,nd
3,BARC_1.01_Gm01_2033_G_A,ss715578672,MAF in G. soja,nd,nd
4,BARC_1.01_Gm01_2033_G_A,ss715578672,"MAF in landrace, elite & G. soja",nd,nd


In [118]:
# Rename columns names
MAF.rename(columns={'dbSNP_ID':'ID', 'snp_name':'SNP_Name', 'maf_type':'MAF_type','maf_value':'MAF_value', 'minor_allele':'Minor_Allele'}, inplace=True)
MAF.columns

Index(['SNP_Name', 'ID', 'MAF_type', 'MAF_value', 'Minor_Allele'], dtype='object')

In [119]:
MAF=MAF[['ID','SNP_Name', 'MAF_type', 'MAF_value', 'Minor_Allele']]
MAF.head()

Unnamed: 0,ID,SNP_Name,MAF_type,MAF_value,Minor_Allele
0,ss715578672,BARC_1.01_Gm01_2033_G_A,MAF in landrace,nd,nd
1,ss715578672,BARC_1.01_Gm01_2033_G_A,MAF in elite,nd,nd
2,ss715578672,BARC_1.01_Gm01_2033_G_A,MAF in G. max,nd,nd
3,ss715578672,BARC_1.01_Gm01_2033_G_A,MAF in G. soja,nd,nd
4,ss715578672,BARC_1.01_Gm01_2033_G_A,"MAF in landrace, elite & G. soja",nd,nd


# Combine df1_data and MAF dataframes

In [120]:
merge=df1_data.merge(MAF, how='inner', left_on='ID', right_on='ID')
merge.head()

Unnamed: 0,ID,CHROM,POS,REF,ALT_1,SNP_Name,MAF_type,MAF_value,Minor_Allele
0,ss715578788,Chr01,24952,A,G,BARC_1.01_Gm01_24939_A_G,MAF in landrace,0.335,G
1,ss715578788,Chr01,24952,A,G,BARC_1.01_Gm01_24939_A_G,MAF in elite,0.078,G
2,ss715578788,Chr01,24952,A,G,BARC_1.01_Gm01_24939_A_G,MAF in G. max,0.199,G
3,ss715578788,Chr01,24952,A,G,BARC_1.01_Gm01_24939_A_G,MAF in G. soja,0.167,G
4,ss715578788,Chr01,24952,A,G,BARC_1.01_Gm01_24939_A_G,"MAF in landrace, elite & G. soja",0.196,G


In [None]:
# concatenate genotypes_df, merge dataframes as final dataset............
dataset=pd.concat([df1_data, genotypes_df])

# Convert genotypes dataframe to 2D array

In [28]:
genotype_array=result_df.values
genotype_array

array([[ 2,  0, -2, ...,  0,  0,  1],
       [ 2,  0, -2, ...,  0,  0,  1],
       [ 2,  0,  1, ...,  0,  0,  1],
       ...,
       [ 0,  0,  0, ...,  0,  2, -2],
       [ 0,  2,  2, ...,  2,  1,  2],
       [ 0,  0,  0, ...,  0, -2,  0]], dtype=int8)

In [30]:
print('Genotypes array:',genotype_array.shape)

Genotypes array: (42195, 20087)


In [33]:
new_hap_array=map(sum, haps)
new_hap_array

<map at 0x1d147cbb2c8>

In [12]:
print('Alt_allele:',callset['variants/ALT'][:,0])

Alt_allele: ['G' 'T' 'G' ... 'T' 'T' 'A']


# genotype array from VCF file

-1 to indicate a missing value; 0=reference, 1=alternate allele

In [10]:
geno=callset['calldata/GT'] 
geno[:1]

array([[[ 1,  1],
        [ 0,  0],
        [-1, -1],
        ...,
        [ 0,  0],
        [ 0,  0],
        [ 0,  1]]], dtype=int8)

In [11]:
# Genotype array
genotypes=allel.GenotypeArray(geno)
genotypes

Unnamed: 0,0,1,2,3,4,...,20082,20083,20084,20085,20086,Unnamed: 12
0,1/1,0/0,./.,0/0,0/0,...,0/0,0/0,0/0,0/0,0/1,
1,1/1,0/0,./.,0/0,0/0,...,0/0,0/0,0/0,0/0,0/1,
2,1/1,0/0,0/1,0/0,0/0,...,0/0,0/0,0/0,0/0,0/1,
...,...,...,...,...,...,...,...,...,...,...,...,...
42192,0/0,0/0,0/0,0/0,1/1,...,0/0,0/0,0/0,1/1,./.,
42193,0/0,1/1,1/1,1/1,1/1,...,1/1,0/0,1/1,0/1,1/1,
42194,0/0,0/0,0/0,0/0,1/1,...,0/0,0/0,0/0,./.,0/0,


### Reshape genotype array to view it as haplotypes by dropping the ploidy dimension

seems to_haplotypes() generates hapotype arrays with doubled the number of samples (from 20087 to 40174); each value (1/1) is split in to two columns (1 and 1). 

In [12]:
haps=genotypes.to_haplotypes() 
haps

Unnamed: 0,0,1,2,3,4,...,40169,40170,40171,40172,40173,Unnamed: 12
0,1,1,0,0,.,...,0,0,0,0,1,
1,1,1,0,0,.,...,0,0,0,0,1,
2,1,1,0,0,0,...,0,0,0,0,1,
...,...,...,...,...,...,...,...,...,...,...,...,...
42192,0,0,0,0,0,...,0,1,1,.,.,
42193,0,0,1,1,1,...,1,0,1,1,1,
42194,0,0,0,0,0,...,0,.,.,0,0,


In [34]:
# create dataframe of haplotypes by transposing the array (with out transposing, memory is too much and getting error)
haps_df=pd.DataFrame(haps.T)
haps_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,42185,42186,42187,42188,42189,42190,42191,42192,42193,42194
0,1,1,1,0,0,0,1,0,1,1,...,0,0,0,1,1,1,1,0,0,0
1,1,1,1,0,0,0,1,0,1,1,...,0,0,0,1,1,1,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,0,1,0
4,-1,-1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,0,1,0


In [13]:
#print('dtype:', haps.dtype)
#print('shape:',haps.shape)
#print('No of assays/rows:',haps.n_variants)
#print('No of samples/columns:', haps.n_haplotypes)

In [14]:
# Allele calls for a single variant at all haplotypes/samples
#haps[1]

In [15]:
#A single haplotype/sample can be obtained by indexing the second dimension
#haps[:, 1]

In [16]:
# allele call for a single haplotype/sample at a single variant 
#haps[1, 0]

In [17]:
# Reshape a haplotype array as diploid genotypes
#haps.to_genotypes(ploidy=2) # i.e reverting back to GenotypeArray

In [None]:
# create dataframe of haplotypes by transposing the array (with out transposing, memory is too much and getting error)
haps_df=pd.DataFrame(haps.T)
haps_df.head()  # so markers turned as columns; GE's as rows

In [None]:
result = haps_df1.groupby(np.arange(len(haps_df1))//2).sum()
result

In [19]:
haps_df.shape

(40174, 42195)

In [None]:
haps_df.to_csv('input_data.csv.gz')

In [None]:
df_iterator = pd.read_csv('input_data.csv.gz', chunksize=1000, compression='gzip')

for i, df_chunk in enumerate(df_iterator):

    df_chunk.groupby(np.arange(len(df_chunk))//2).sum()
    #do_something(df_chunk)
    
    # Set writing mode to append after first chunk
    mode = 'w' if i == 0 else 'a'
    
    # Add header if it is the first chunk
    header = i == 0

    df_chunk.to_csv(
        "dst_data.csv.gz",
        index=False,  # Skip index column
        header=header, 
        mode=mode,
        compression='gzip')

In [None]:
dataset=pd.read_csv("dst_data.csv.gz")
dataset.head()

In [None]:
## because of memory issue, I am reading chunks of 1000 rows from haps_df a

In [None]:
from more_itertools import sliced
CHUNK_SIZE = 1000

index_slices = sliced(range(len(haps_df)), CHUNK_SIZE)

#data=[]
for index_slice in index_slices:
    chunk = haps_df.iloc[index_slice] # your dataframe chunk ready for use
    result = chunk.groupby(np.arange(len(chunk))//2).sum() 
    #data.append(result)
    result.to_csv('modified.csv')
    


In [None]:
chunked_df = pd.DataFrame(data)
chunked_df.head()

In [None]:
haps_df1=haps_df.head()
haps_df1

In [None]:
haps_df1.shape

In [None]:
result = haps_df1.groupby(np.arange(len(haps_df1))//2).sum()
result

In [None]:
combined_df=pd.concat([df1, samples_df])

# vcf_to_hdf5()

For large datasets, vcf to hdf5 is good; HDF5 file stored on disk.

In [None]:
#vcf_path='soysnp50k_wm82.a2_41317.vcf.gz'

In [None]:
allel.vcf_to_hdf5('soysnp50k_wm82.a2_41317.vcf.gz', 'soysnp50k_wm82.a2.h5', fields='*', overwrite=True)

In [None]:
df1.shape

In [None]:
callset_fn = 'soysnp50k_wm82.a2.h5'
callset = h5py.File(callset_fn, mode='r')
callset

In [None]:
callset.keys()

In [None]:
chrom = callset['variants/CHROM']
chrom[1:5]

In [None]:
pos = callset['variants/POS']
pos

In [None]:
# load all items into NumPy array
pos[1:3]

In [None]:
# load genotype calls into memory for second to fourth variants, all samples
gt = callset['calldata/GT']
gt


In [None]:
genotypes=allel.GenotypeArray(gt)
genotypes

In [None]:
geno_array=genotypes.reshape(genotypes.shape[0], genotypes.shape[1], genotypes.shape[2])
geno_array.shape

In [None]:
import pandas as pd
df = pd.DataFrame(geno_array)
#df.to_csv('50k_geno_calls.csv')
df.head()

# pick a chromosome to work

In [None]:
chrom = 'scaffold_759'

# Visualize variant density

Plot shows how many SNPs are there and how they are distributed along the chromosome

# Filtering

Drop any polymorphic SNP with rate of missing & het alleles >0.1 among the 19,648 soybean and wild soybean accessions. The het allele calls in the remaining loci were set as missing in the subsequent analysis


# Similarity analysis
Genetic similarity between pairs of genotypes among the 18,480 cultivated and among the 1168 wild accessions was calculated as the ratio of
the number of identical SNP allele calls and the total number of SNPs for
which allele calls were made for the pair

# Cluster analysis
Pair-wise distance among the accessions of 806 wild and 5396 landrace
soybeans was obtained based on the allelic dissimilarity of the 42,509
SNPs; the neighbor-joining tree was constructed

# LD analysis
LD was analyzed within the wild, landrace, and N. Am. cultivar
populations with 806, 5396, and 562 accessions, respectively. Only
the SNPs with minor allele frequency $5% were included for LD calculation and construction of haplotype blocks. Calculation of pairwise
LD (r2
) among SNPs and identification of haplotype blocks was based
upon SNPs within 1-Mb windows using the software PLINK (Purcell
et al. 2007).

In [None]:
# Dendrogam of wild and landrace genotypes from different countries.