## Imports

In [29]:
import os
import pandas as pd
import sqlite3
from snps import SNPs

In [35]:
dfs = {}
for i in os.listdir("ADHD"):
    if i in [".DS_store", ".ipynb_checkpoints"]:
        continue
    try:
        s = SNPs(f"ADHD/{i}")
    except UnicodeDecodeError:
        continue
    dfs[i] = s.snps


In [36]:
FILE = "db/ADHD_variant_genes.tsv"
df = pd.read_csv(FILE, sep='\t')
df.head()

Unnamed: 0,Chrom,Position,Ref_Base,Alt_Base,Note,Coding,Hugo,Sequence_Ontology,Protein_Change,Samples,Chrom.1,Position.1,Disease_Names,Phenotype,Global_AF,Chemical,Zygosity
0,chr1,43708410,G,A,,,ST3GAL3,intron,,60.23andme.27;616.23andme.288;8.23andme.2;865....,chr1,43946668,,,0.435,,het;hom;het;het
1,chr1,43712399,T,G,,,ST3GAL3,intron,,60.23andme.27;616.23andme.288;8.23andme.2,chr1,43950657,,,0.784,,hom;het;het
2,chr1,43729682,G,C,,,ST3GAL3,intron,,60.23andme.27;616.23andme.288;8.23andme.2;865....,chr1,43967940,,,,,hom;hom;het;het
3,chr1,43734985,C,A,,,ST3GAL3,intron,,60.23andme.27;616.23andme.288;8.23andme.2;865....,chr1,43973243,,,,,het;het;het;het
4,chr1,43734985,C,G,,,ST3GAL3,intron,,60.23andme.27;616.23andme.288;8.23andme.2;865....,chr1,43973243,,,,,het;het;het;het


## Variants

In [106]:
bigdf = pd.concat(dfs.values(), keys=dfs.keys())
rsids = """rs11420276
rs1222063
rs4858241
rs28411770
rs4916723
rs5886709
rs74760947
rs11591402
rs1427829
rs281324
rs212178""".split("\n")
keys = list(dfs.keys())

In [91]:
from collections import defaultdict

In [94]:
c = defaultdict(int)
for j in keys:
    for i in rsids:
        try:
            c[i] += 1
            print(bigdf.loc[j, i])
        except KeyError:
            c[i] = 0
            continue

chrom              5
pos         87854395
genotype          CC
Name: (678.23andme.txt, rs4916723), dtype: object
chrom              5
pos         87854395
genotype          CC
Name: (561.23andme.txt, rs4916723), dtype: object
chrom              5
pos         87854395
genotype          CC
Name: (genome_Susan_Loftus_v3_Full_20200212185613 2.txt, rs4916723), dtype: object
chrom              5
pos         87890151
genotype          AC
Name: (8.23andme.txt, rs4916723), dtype: object
chrom              5
pos         87890151
genotype          AA
Name: (60.23andme.txt, rs4916723), dtype: object
chrom              5
pos         87854395
genotype          AC
Name: (genome_Emma_Loftus_v3_Full_20200324162732.txt, rs4916723), dtype: object
chrom              5
pos         87890151
genotype          AA
Name: (865.23andme.txt, rs4916723), dtype: object
chrom              5
pos         87890151
genotype          AC
Name: (616.23andme.txt, rs4916723), dtype: object


In [95]:
c

defaultdict(int,
            {'rs11420276': 0,
             'rs1222063': 0,
             'rs4858241': 0,
             'rs28411770': 0,
             'rs4916723': 4,
             'rs5886709': 0,
             'rs74760947': 0,
             'rs11591402': 0,
             'rs1427829': 0,
             'rs281324': 0,
             'rs212178': 0})

## Genes

In [115]:
genes = """ST3GAL3
SPAG16
PCDH7
LINC00461
FOXP2
LINC01288
SORCS3
DUSP6
SEMA6D
DRD4
DAT1
DBH
MAOA
DRD5""".split("\n")
# df[df["Hugo"] == "FOXP2"]
df[df["Hugo"] == "FOXP2"].Samples.str.split(";", expand=True)
gdfs = {g: df[df["Hugo"] == g] for g in genes}

In [139]:
gdfs.keys()

dict_keys(['ST3GAL3', 'SPAG16', 'PCDH7', 'LINC00461', 'FOXP2', 'LINC01288', 'SORCS3', 'DUSP6', 'SEMA6D', 'DRD4', 'DAT1', 'DBH', 'MAOA', 'DRD5'])

In [142]:
for k, v in gdfs.items():
    _df = v[gdfs[k].Chrom == "chr7"]
    print(_df[(_df['Position.1'] >= 114446081) & (_df['Position.1'] <= 114446081)])

Empty DataFrame
Columns: [Chrom, Position, Ref_Base, Alt_Base, Note, Coding, Hugo, Sequence_Ontology, Protein_Change, Samples, Chrom.1, Position.1, Disease_Names, Phenotype, Global_AF, Chemical, Zygosity]
Index: []
Empty DataFrame
Columns: [Chrom, Position, Ref_Base, Alt_Base, Note, Coding, Hugo, Sequence_Ontology, Protein_Change, Samples, Chrom.1, Position.1, Disease_Names, Phenotype, Global_AF, Chemical, Zygosity]
Index: []
Empty DataFrame
Columns: [Chrom, Position, Ref_Base, Alt_Base, Note, Coding, Hugo, Sequence_Ontology, Protein_Change, Samples, Chrom.1, Position.1, Disease_Names, Phenotype, Global_AF, Chemical, Zygosity]
Index: []
Empty DataFrame
Columns: [Chrom, Position, Ref_Base, Alt_Base, Note, Coding, Hugo, Sequence_Ontology, Protein_Change, Samples, Chrom.1, Position.1, Disease_Names, Phenotype, Global_AF, Chemical, Zygosity]
Index: []
Empty DataFrame
Columns: [Chrom, Position, Ref_Base, Alt_Base, Note, Coding, Hugo, Sequence_Ontology, Protein_Change, Samples, Chrom.1, Posi

### variants in coding regions

In [157]:
coding = df[df.Coding.notna()]

In [158]:
len(coding)

58

In [173]:
count_df = coding.groupby("Hugo").count()
count_df.sort_values(by="Coding", ascending=False)

Unnamed: 0_level_0,Chrom,Position,Ref_Base,Alt_Base,Note,Coding,Sequence_Ontology,Protein_Change,Samples,Chrom.1,Position.1,Disease_Names,Phenotype,Global_AF,Chemical,Zygosity
Hugo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
DBH,13,13,13,13,0,13,13,13,13,13,13,1,0,1,0,13
MAOA,11,11,11,11,0,11,11,11,11,11,11,0,0,0,0,11
DRD4,8,8,8,8,0,8,8,8,8,8,8,0,0,1,0,8
DUSP6,6,6,6,6,0,6,6,6,6,6,6,0,0,0,0,6
SEMA6D,6,6,6,6,0,6,6,6,6,6,6,0,0,1,0,6
SPAG16,4,4,4,4,0,4,4,4,4,4,4,0,0,0,0,4
SORCS3,3,3,3,3,0,3,3,3,3,3,3,0,0,0,0,3
DRD5,2,2,2,2,0,2,2,2,2,2,2,0,0,0,0,2
PCDH7,2,2,2,2,0,2,2,2,2,2,2,0,0,0,0,2
ST3GAL3,2,2,2,2,0,2,2,2,2,2,2,0,0,0,0,2
