# 3 Wrangling, EDA

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 3.1 Load genotyping data file

In [2]:
sample_data= pd.read_csv('Soy50K_SNPs_filtered.csv', index_col=[0])
sample_data.head()

Unnamed: 0,ID,PI86046,PI90208,PI219698,PI253651A,PI347550A,PI398807,PI408055A,PI408069,PI408169A,...,PI423967,PI587906,PI587946,PI603516,Ref_allele,Alt_allele,Missing,perc_miss,MAF_Ref,MAF_Alt
0,ss715578788,2,0,-2,0,0,0,0,0,2,...,0,0,0,-2,14963,4858,266,1.324174,0.754906,0.245094
1,ss715578818,2,0,-2,0,0,0,0,0,2,...,0,0,0,-2,14761,4745,581,2.892274,0.756742,0.243258
2,ss715578923,2,0,-2,0,0,0,0,0,2,...,0,0,0,-2,13915,5887,285,1.418757,0.702707,0.297293
3,ss715578960,0,0,0,0,0,0,0,0,0,...,0,0,0,0,18053,1825,209,1.040422,0.90819,0.09181
4,ss715579193,0,0,-2,0,0,0,0,0,2,...,0,0,0,-2,17246,2608,233,1.159896,0.868641,0.131359


In [3]:
sample_data.shape

(36729, 20094)

In [4]:
samples= sample_data.drop(['Ref_allele','Alt_allele','Missing','perc_miss','MAF_Ref','MAF_Alt'], axis=1)
#samples.head()

In [5]:
# Transpose the dataframe
samples=samples.T
samples.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,42184,42185,42186,42187,42188,42190,42191,42192,42193,42194
ID,ss715578788,ss715578818,ss715578923,ss715578960,ss715579193,ss715579265,ss715579576,ss715579942,ss715580755,ss715580822,...,ss715608834,ss715608835,ss715608836,ss715603654,ss715584705,ss715639176,ss715586962,ss715623959,ss715584606,ss715633065
PI86046,2,2,2,0,0,0,2,0,2,2,...,0,0,0,0,2,2,2,0,0,0
PI90208,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,2,2,0,2,0
PI219698,-2,-2,-2,0,-2,0,-2,-2,-2,-2,...,0,0,0,0,2,2,2,0,2,0
PI253651A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,2,2,0,2,0


In [6]:
# convert row 1 as header
samples.columns = samples.iloc[0]
samples = samples[1:]
samples.head()

ID,ss715578788,ss715578818,ss715578923,ss715578960,ss715579193,ss715579265,ss715579576,ss715579942,ss715580755,ss715580822,...,ss715608834,ss715608835,ss715608836,ss715603654,ss715584705,ss715639176,ss715586962,ss715623959,ss715584606,ss715633065
PI86046,2,2,2,0,0,0,2,0,2,2,...,0,0,0,0,2,2,2,0,0,0
PI90208,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,2,2,0,2,0
PI219698,-2,-2,-2,0,-2,0,-2,-2,-2,-2,...,0,0,0,0,2,2,2,0,2,0
PI253651A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,2,2,0,2,0
PI347550A,0,0,0,0,0,0,0,0,-2,0,...,0,0,0,0,2,2,2,2,2,2


In [7]:
samples.shape # 20087 samples are there

(20087, 36729)

In [8]:
# First filter out those rows which does not contain any data
samples_df = samples.dropna(how = 'all')
samples_df.shape

# so, there are no empty rows

(20087, 36729)

In [9]:
samples_df['Ref_allele'] = samples_df.apply(lambda row: sum(row[1:36729]==0) ,axis=1)
samples_df['Alt_allele'] = samples_df.apply(lambda row: sum(row[1:36729]==2) ,axis=1)
samples_df['Missing'] = samples_df.apply(lambda row: sum(row[1:36729]==-2) ,axis=1)
# create percent missing column by dividing missing value with total samples#
samples_df['perc_miss']=(samples_df['Missing']/20088)*100
samples_df.head()

ID,ss715578788,ss715578818,ss715578923,ss715578960,ss715579193,ss715579265,ss715579576,ss715579942,ss715580755,ss715580822,...,ss715584705,ss715639176,ss715586962,ss715623959,ss715584606,ss715633065,Ref_allele,Alt_allele,Missing,perc_miss
PI86046,2,2,2,0,0,0,2,0,2,2,...,2,2,2,0,0,0,17455,19005,268,1.33413
PI90208,0,0,0,0,0,0,0,0,0,0,...,2,2,2,0,2,0,21204,15452,72,0.358423
PI219698,-2,-2,-2,0,-2,0,-2,-2,-2,-2,...,2,2,2,0,2,0,15979,13053,7696,38.31143
PI253651A,0,0,0,0,0,0,0,0,0,0,...,2,2,2,0,2,0,20141,16513,74,0.368379
PI347550A,0,0,0,0,0,0,0,0,-2,0,...,2,2,2,2,2,2,19659,14670,2399,11.942453


In [11]:
# sort dataframe by Missing
samples_df.sort_values(by='Missing', ascending=False)

ID,ss715578788,ss715578818,ss715578923,ss715578960,ss715579193,ss715579265,ss715579576,ss715579942,ss715580755,ss715580822,...,ss715584705,ss715639176,ss715586962,ss715623959,ss715584606,ss715633065,Ref_allele,Alt_allele,Missing,perc_miss
PI189900,-2,-2,-2,0,-2,0,-2,-2,-2,-2,...,2,0,2,2,-2,0,15943,7617,13168,65.551573
PI567264A,-2,0,-2,0,-2,0,-2,-2,-2,-2,...,2,0,-2,-2,2,0,14225,9938,12565,62.549781
PI506737,0,0,-2,0,-2,0,0,0,2,-2,...,2,2,0,-2,-2,0,14398,10004,12326,61.360016
PI653862,-2,2,2,2,0,2,-2,0,2,2,...,-2,2,0,0,2,0,14066,10805,11857,59.025289
PI615446,-2,0,0,0,0,0,-2,-2,-2,-2,...,2,-2,2,-2,-2,0,14799,10474,11455,57.024094
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PI547847,0,0,0,0,0,0,0,0,0,0,...,2,0,2,0,0,0,35968,754,6,0.029869
PI73585,0,0,0,0,0,0,0,0,0,0,...,2,2,2,0,0,2,23557,13166,5,0.024890
PI548242,0,0,0,0,0,0,0,0,0,0,...,2,0,2,0,0,0,35889,835,4,0.019912
PI612719,2,2,2,0,2,0,2,2,2,2,...,2,2,0,0,2,0,25655,11069,4,0.019912


In [13]:
samples_df.shape

(20087, 36733)

## Sample filtering by number of missing values

 ## Find how many samles having >10% missing data?

In [17]:
miss_samples=samples_df.loc[samples_df['Missing']>3673]
miss_samples.shape

# 691 samples having >10% missing 
# 138 samples with 15% missing
# 20 samples having >20% missing data

(691, 36733)

## Filter samples with <10% missing (i.e dropping samples wiht >10% missing)

In [18]:
df1=samples_df.loc[samples_df['Missing']<=3673] # 2008 is 10% of 20087
df1.shape

# the 12 SNPs with >10% missing data are dropped

(19396, 36733)

In [20]:
filtered_data=df1.drop(columns=['Ref_allele','Alt_allele','Missing','perc_miss'])
filtered_data.shape

(19396, 36729)

In [21]:
filtered_data.head()

ID,ss715578788,ss715578818,ss715578923,ss715578960,ss715579193,ss715579265,ss715579576,ss715579942,ss715580755,ss715580822,...,ss715608834,ss715608835,ss715608836,ss715603654,ss715584705,ss715639176,ss715586962,ss715623959,ss715584606,ss715633065
PI86046,2,2,2,0,0,0,2,0,2,2,...,0,0,0,0,2,2,2,0,0,0
PI90208,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,2,2,0,2,0
PI253651A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,2,2,0,2,0
PI347550A,0,0,0,0,0,0,0,0,-2,0,...,0,0,0,0,2,2,2,2,2,2
PI398807,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,2,2,0,2,0


In [22]:
filtered_data.to_csv('samples_SNP_filtered.csv')

Summary:

Samples filtering: 
There 691 samples having >10% missing (138 samples with 15% missing, 20 samples having >20% missing data). Since there aren't many samples with higher missign data, I dropped samples with >10% missing data. 
After dropping samples with >10% missing data, 19396 samples are left.