From cb670eef5b6ae558afd8bca523907676e699d712 Mon Sep 17 00:00:00 2001 From: Mamonu Date: Wed, 4 Nov 2020 13:40:09 +0000 Subject: [PATCH] some code review issues worked on. only tests left --- splink/diagnostics.py | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/splink/diagnostics.py b/splink/diagnostics.py index 02c07e2856..d13df3df9c 100644 --- a/splink/diagnostics.py +++ b/splink/diagnostics.py @@ -3,6 +3,8 @@ from pyspark.context import SparkContext, SparkConf from pyspark.sql import SQLContext +from typing import Callable + from pyspark.ml.regression import LinearRegression from pyspark.ml.linalg import DenseVector from pyspark.ml.linalg import Vectors @@ -11,13 +13,40 @@ from pyspark.sql.types import StructType, StructField, StringType, DoubleType, Row import pyspark.sql.functions as f from pyspark.sql.functions import when - - -def vif_gammas(inputdata, sparksession, sampleratio=1.0): +from .check_types import check_types + +@check_types +def vif_gammas(inputdata:DataFrame, spark : SparkSession, sampleratio: float =1.0): + + """splink diagnostic of multicollinearity in gamma values + + We want to check if the gammas of the input variables of the models we are using suffer from multicollinearity. + Since the column information is transformed into gammas, and it is the information from the gammas + which the model 'sees', it is correlation between the gammas which is potentially problematic. + + + + Args: + inputdata (DataFrame): scored comparisons Spark DataFrame + + spark (SparkSession): SparkSession object + + sampleratio (float,optional) : Fraction of rows to sample, range [0.0, 1.0]. + If 1.0 no sampling takes place + + + + Returns: + + (DataFrame) : Spark dataframe of the gamma variables Variance Inflation Factors (VIFs) + + + """ + collist = [] viflist = [] - sc = sparksession.sparkContext + sc = spark.sparkContext sqlContext = SQLContext(sc) dfvariables = inputdata.columns