In [258]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import chart_studio.plotly as py
import plotly.tools as tls

In [259]:
dna_data = "raw_data/DUVALGENOME.csv"

In [260]:
dna_data_df = pd.read_csv(dna_data, low_memory=False)
dna_data_df.head()

Unnamed: 0,rsid,chromosome,position,genotype,Unnamed: 4
0,rs4477212,1,82154,AA,
1,rs3094315,1,752566,AG,
2,rs3131972,1,752721,AG,
3,rs12124819,1,776546,--,
4,rs11240777,1,798959,AG,


In [261]:
clean_dna_df = dna_data_df[["rsid", "chromosome", "position", "genotype"]]
clean_dna_df.head()

Unnamed: 0,rsid,chromosome,position,genotype
0,rs4477212,1,82154,AA
1,rs3094315,1,752566,AG
2,rs3131972,1,752721,AG
3,rs12124819,1,776546,--
4,rs11240777,1,798959,AG


In [262]:
drop_na_df = clean_dna_df.dropna()

In [263]:
drop_na_df.count()

rsid          963038
chromosome    963038
position      963038
genotype      963038
dtype: int64

In [264]:
genotype = drop_na_df["genotype"].unique()
genotype

array(['AA', 'AG', '--', 'CC', 'CT', 'GG', 'GT', 'TT', 'AC', 'AT', 'CG',
       '5546709', '5546761', '5546825', '6112795', '6373124', '6378215',
       '380196', '7312592', 'II', 'DI', 'DD', 'G', 'A', 'C', 'T', 'I',
       'D'], dtype=object)

In [265]:
geno_count = drop_na_df["genotype"].value_counts()
geno_count

CC         172478
GG         172024
TT         147626
AA         147250
CT         109380
AG         108923
AC          25501
GT          25206
--          23220
C            7492
G            7347
T            6952
A            6931
CG           1067
II            696
AT            633
DD            138
I             106
D              34
DI             26
7312592         1
5546825         1
6373124         1
5546709         1
380196          1
6378215         1
6112795         1
5546761         1
Name: genotype, dtype: int64

In [266]:
select_geno = ['CC','GG', 'TT', 'AA', 'CT', 'AG', 'AC', 'GT', '--', 'C', 'G', 'T', 'A', 'CG', 'II', 'AT', 'DD', 'I', 'D', 'DI']
select_df = drop_na_df[drop_na_df['genotype'].isin(select_geno)]
select_df.head()

Unnamed: 0,rsid,chromosome,position,genotype
0,rs4477212,1,82154,AA
1,rs3094315,1,752566,AG
2,rs3131972,1,752721,AG
3,rs12124819,1,776546,--
4,rs11240777,1,798959,AG


In [267]:
geno_count = select_df["genotype"].value_counts()
geno_count

CC    172478
GG    172024
TT    147626
AA    147250
CT    109380
AG    108923
AC     25501
GT     25206
--     23220
C       7492
G       7347
T       6952
A       6931
CG      1067
II       696
AT       633
DD       138
I        106
D         34
DI        26
Name: genotype, dtype: int64

In [268]:
geno_full_count_df = pd.DataFrame({"Each Geno Count":geno_count, "GENOTYPE":select_geno})
geno_full_count_df

Unnamed: 0,Each Geno Count,GENOTYPE
CC,172478,CC
GG,172024,GG
TT,147626,TT
AA,147250,AA
CT,109380,CT
AG,108923,AG
AC,25501,AC
GT,25206,GT
--,23220,--
C,7492,C


In [269]:
top_8_geno = geno_full_count_df.iloc[0:8, :].index.to_list()
top_8_geno

['CC', 'GG', 'TT', 'AA', 'CT', 'AG', 'AC', 'GT']

In [270]:
# TOP 8 GENO DATAFRAME

top_eight_df = pd.DataFrame({"Each Geno Count":geno_count})
top_eight_df

Unnamed: 0,Each Geno Count
CC,172478
GG,172024
TT,147626
AA,147250
CT,109380
AG,108923
AC,25501
GT,25206
--,23220
C,7492


In [271]:
only_8_df = geno_full_count_df.iloc[0:8,:]
only_8_df

Unnamed: 0,Each Geno Count,GENOTYPE
CC,172478,CC
GG,172024,GG
TT,147626,TT
AA,147250,AA
CT,109380,CT
AG,108923,AG
AC,25501,AC
GT,25206,GT


In [280]:
# CREATE BAR CHART

ax = only_8_df.plot.bar(
    x='GENOTYPE', 
    y='Each Geno Count', 
    rot=35,
    color=['teal', 'darkviolet', 'palegreen', 'lime', 'gold', 'turquoise', 'plum', 'yellow'])

alpha=0.5

plt.title("TOP 8 GENOTYPES")
plt.xlabel("GENOTYPE")
plt.ylabel("OCCURANCE")

plt.show()

<IPython.core.display.Javascript object>

In [273]:
geno_full_count_df.dtypes

Each Geno Count     int64
GENOTYPE           object
dtype: object

In [274]:
# CREATE PIE CHART

#labels = ['CC', 'GG', 'TT', 'AA', 'CT', 'AG', 'AC', 'GT']

#sizes = [172478, 172024, 147626, 147250, 109380, 108923, 25501, 25206]

#colors = ['teal', 'darkviolet', 'palegreen', 'lime', 'gold', 'turquoise', 'plum', 'yellow']

#explode = (0, 0.1, 0, 0, 0.1, 0, 0, 0.1)

In [275]:
#plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct="%1.1f%%", shadow=True, startangle=140)
          
#plt.show()