# **Advanced variant filtering**

In [1]:
from pyoskar.core import Oskar
from pyoskar.sql import *
from pyoskar.analysis import *
from pyspark.sql.functions import col, udf, count, explode, concat, when, expr
from pyspark.sql.functions import *

import os

# Location of the Platinum chr22 sample dataset (Parquet).
# The default is the original author's local checkout; set the
# OSKAR_PLATINUM_PARQUET environment variable to point at your own copy so the
# notebook can be re-run on another machine without editing this cell.
PLATINUM_PARQUET_PATH = os.environ.get(
    "OSKAR_PLATINUM_PARQUET",
    "/home/roldanx/appl/oskar/oskar-spark/src/test/resources/platinum_chr22.small.parquet",
)

# NOTE(review): `spark` is not defined anywhere in this notebook; presumably it
# is the SparkSession injected by the PySpark kernel -- confirm before running
# in a plain Python kernel.
oskar = Oskar(spark)

# `df` is the variants DataFrame that every later cell starts from.
df = oskar.load(PLATINUM_PARQUET_PATH)

## Hardy-Weinberg equilibrium

In [2]:
# Run the Hardy-Weinberg computation for the Platinum study and print the
# variant id next to its HWE value for the first 10 rows.
# NOTE(review): HWE looks like a p-value in [0, 1] judging by the output below;
# confirm against the pyoskar documentation.
(
    oskar.hardyWeinberg(df, "hgvauser@platinum:illumina_platinum")
    .select("id", "HWE")
    .show(10)
)

+---------------+--------------------+
|             id|                 HWE|
+---------------+--------------------+
|22:16054454:C:T|                 1.0|
|22:16065809:T:C|                 1.0|
|22:16077310:T:A|  0.9254727474972191|
|22:16080499:A:G|                 1.0|
|22:16084621:T:C|                 1.0|
|22:16091610:G:T|                 1.0|
|22:16096040:G:A|  0.4746014089729329|
|22:16099957:C:T|0.016007636455477054|
|22:16100462:A:G|0.001011008618240...|
|22:16105660:G:A|  0.3037449017426771|
+---------------+--------------------+
only showing top 10 rows



## Inbreeding coefficient

In [3]:
# Compute statistics for the Platinum study first; the inbreeding step below
# takes this DataFrame as its input.
# NOTE(review): going by its name, missingAsReference=True treats missing
# genotypes as reference calls -- confirm against the pyoskar documentation.
df2 = oskar.stats(
    df,
    studyId="hgvauser@platinum:illumina_platinum",
    missingAsReference=True,
)

# Print the first 10 rows of the per-sample inbreeding table.
oskar.inbreedingCoefficient(df2).show(10)

+--------+-------------------+-----------+------------------+--------------+
|SampleId|                  F|ObservedHom|       ExpectedHom|GenotypesCount|
+--------+-------------------+-----------+------------------+--------------+
| NA12877|-1.0857581722788996|         70|233.97577702999115|           385|
| NA12878|-1.1024114888695444|         69|244.65916746854782|           404|
| NA12879|-1.1890914293957586|         69| 247.7093403339386|           398|
| NA12880|-1.1013660394101679|         71|248.15224742889404|           409|
| NA12881|-1.1560267972581504|         65| 252.6643579006195|           415|
| NA12882|-1.0112382612189488|         76| 224.8269881606102|           372|
| NA12883|-1.0602574055431329|         67|229.62110525369644|           383|
| NA12884|-1.0340014363992485|         74|224.47404664754868|           370|
| NA12885|-1.1105665251221366|         78| 254.8010356426239|           414|
| NA12886| -1.067867784696387|         72|244.48096668720245|           406|
+--------+-------------------+-----------+------------------+--------------+

## Mendelian error

In [4]:
# Run the Mendelian-error check on the trio of samples and list only the
# variants whose mendelianError value is not "0".
# NOTE(review): the three sample ids are presumably father, mother, child, in
# that order -- confirm against the pyoskar `mendel` signature.
# NOTE(review): the filter compares against the string "0"; the column's type
# is not visible here, so this may rely on Spark's implicit cast.
(
    oskar.mendel(df, "NA12877", "NA12878", "NA12879")
    .select("id", "mendelianError")
    .filter(col("mendelianError") != "0")
    .show()
)

+------------------+--------------+
|                id|mendelianError|
+------------------+--------------+
|22:19748211:CCCC:-|             1|
+------------------+--------------+

