Skip to content

Commit

Permalink
some code review issues worked on. only tests left
Browse files Browse the repository at this point in the history
  • Loading branch information
mamonu committed Nov 4, 2020
1 parent 6a54b16 commit cb670ee
Showing 1 changed file with 33 additions and 4 deletions.
37 changes: 33 additions & 4 deletions splink/diagnostics.py
Expand Up @@ -3,6 +3,8 @@
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SQLContext

from typing import Callable

from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import DenseVector
from pyspark.ml.linalg import Vectors
Expand All @@ -11,13 +13,40 @@
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, Row
import pyspark.sql.functions as f
from pyspark.sql.functions import when


def vif_gammas(inputdata, sparksession, sampleratio=1.0):
from .check_types import check_types

@check_types
def vif_gammas(inputdata:DataFrame, spark : SparkSession, sampleratio: float =1.0):

"""splink diagnostic of multicollinearity in gamma values
We want to check whether the gammas of the input variables of the models we are using suffer from multicollinearity.
Since the column information is transformed into gammas, and it is the information from the gammas
which the model 'sees', it is correlation among the gammas which is potentially problematic.
Args:
inputdata (DataFrame): scored comparisons Spark DataFrame
spark (SparkSession): SparkSession object
sampleratio (float, optional): Fraction of rows to sample, range [0.0, 1.0].
If 1.0, no sampling takes place.
Returns:
(DataFrame) : Spark dataframe of the gamma variables Variance Inflation Factors (VIFs)
"""

collist = []
viflist = []

sc = sparksession.sparkContext
sc = spark.sparkContext
sqlContext = SQLContext(sc)

dfvariables = inputdata.columns
Expand Down

0 comments on commit cb670ee

Please sign in to comment.