In [15]:
SQLContext.newSession(sqlContext)
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler,StandardScaler,RFormula
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from pyspark.ml.linalg import VectorUDT,Vectors
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import Window
from pyspark.ml import Pipeline
from pyspark.ml.regression import GeneralizedLinearRegression

import pandas as pd
import re
from tabulate import tabulate
import random
import sys
import numpy as np

In [39]:
#import data and rename bad name rank into vaerdiSlope
#RAW DATA!!! 


#exclude some of the variables, and cast all variables to double
excludeCols = ["medArb_"+str(i) for i in range(1,16)] # we don't need the medarbejders 
includeCols = [i for i in df.columns if i not in excludeCols]

rankCols = [re.sub(pattern="rank_",repl="vaerdiSlope_",string=i) for i in includeCols]
finalCols = [F.col(i) for i in includeCols[:2]]+["kortBeskrivelse"]+[F.col(i).cast("double") for i in includeCols[2:] if i not in ["kortBeskrivelse"]]


df = sqlContext.read.parquet("/home/svanhmic/workspace/Python/Erhvervs/data/cdata/featureDataCvr")
df.select(["cvrNummer"])
rankCols = [re.sub(pattern="rank_",repl="vaerdiSlope_",string=i) for i in includeCols ]
renamedDf = (df
             .select(*finalCols)
             .select([F.col(val).alias(rankCols[idx]) for idx,val in enumerate(includeCols)])
             .filter((F.col("kortBeskrivelse") == "APS") | (F.col("kortBeskrivelse") == "AS"))
             )
renamedDf.show()

+---------+--------------------+-----+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+-----------+-----------------+------------------+------------------+------------------+------------------+-------------+-------------+-------------+-------------+----------------+---------------+
|cvrNummer|              status|label|AarsVaerk_1|AarsVaerk_2|AarsVaerk_3|AarsVaerk_4|AarsVaerk_5|AarsVaerk_6|AarsVaerk_7|AarsVaerk_8|AarsVaerk_9|AarsVaerk_10|AarsVaerk_11|AarsVaerk_12|AarsVaerk_13|AarsVaerk_14|AarsVaerk_15|avgVarighed|totalAabneEnheder|totalLukketEnheder|     vaerdiSlope_1|     vaerdiSlope_2|     vaerdiSlope_3|vaerdiSlope_4|vaerdiSlope_5|vaerdiSlope_6|vaerdiSlope_7|reklamebeskyttet|kortBeskrivelse|
+---------+--------------------+-----+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+-

In [40]:
windowSpecRank =(Window.partitionBy(F.col("cvrNummer"))).orderBy(F.col("periode_gyldigFra").desc())
groupCols = ["cvrNummer","vaerdi"]

companyNameDf = (sqlContext
                 .read
                 .parquet("/home/svanhmic/workspace/Python/Erhvervs/data/cdata/companyCvrData")
                 .withColumn(colName="rank",col=F.rank().over(windowSpecRank))
                 .filter((F.col("rank")==1) & (F.col("sekvensnr")==0))
                 .select([F.col(i) for i in groupCols])
                 .withColumnRenamed(existing="vaerdi",new="navn")
                 .orderBy(F.col("cvrNummer"))
                 .cache()
                )

In [41]:
labelCols = ["navn","cvrNummer","label","status","kortBeskrivelse"]
featCols = [i for i in companyNameDf.columns+renamedDf.columns if i not in labelCols]

#get minimum values from each column
minCols = [F.min(i).alias(i) for i in featCols]
minValsRdd = renamedDf.groupby().agg(*minCols).rdd
broadcastedmin = sc.broadcast(minValsRdd.first().asDict())

#create array that subtracts minimum value in the numeric columns.
logColsSelected = [F.col(i).alias(i) for i in labelCols]+[(F.col(i)-F.lit(broadcastedmin.value[i])).alias(i) for i in featCols]

#takes log(x+1) to the numeric columns and fills the blanks with 0.0 
logDf = (renamedDf
         .join(companyNameDf,(companyNameDf["cvrNummer"]==renamedDf["cvrNummer"]),"inner")
         .drop(companyNameDf["cvrNummer"])
         .select(*logColsSelected)
         #.select([F.col(i).alias(i) for i in labelCols]+[F.log1p(F.col(i)).alias(i) for i in featCols])
         .distinct()
         .na
         .fill(0.0,featCols)
         .cache()
        )
logDf.show(4)

+--------------------+---------+-----+--------------------+---------------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+-----------+-----------------+------------------+--------------------+-------------+-------------+-------------+-------------+-------------+-------------+----------------+
|                navn|cvrNummer|label|              status|kortBeskrivelse|AarsVaerk_1|AarsVaerk_2|AarsVaerk_3|AarsVaerk_4|AarsVaerk_5|AarsVaerk_6|AarsVaerk_7|AarsVaerk_8|AarsVaerk_9|AarsVaerk_10|AarsVaerk_11|AarsVaerk_12|AarsVaerk_13|AarsVaerk_14|AarsVaerk_15|avgVarighed|totalAabneEnheder|totalLukketEnheder|       vaerdiSlope_1|vaerdiSlope_2|vaerdiSlope_3|vaerdiSlope_4|vaerdiSlope_5|vaerdiSlope_6|vaerdiSlope_7|reklamebeskyttet|
+--------------------+---------+-----+--------------------+---------------+-----------+-----------+-----------+-----------+-----------

In [47]:
strs = ""
excludedCols = ["medArb_"+str(i) for i in range(1,16)]+["cvrNummer","label","status","navn","kortBeskrivelse"]
for i in logDf.columns:
    if i not in excludedCols:
        strs += i+" + "

#excludedCols    
imputedDf = logDf.fillna(value=0.0)
formula = RFormula(formula="label ~ "+strs[:-3],labelCol="label")

glr = GeneralizedLinearRegression(family="binomial", link="logit", maxIter=10, regParam=0.3)
standardScale = StandardScaler(withMean=True,withStd=True,inputCol=glr.getFeaturesCol(),outputCol="scaledFeatures")


pipeline = Pipeline(stages=[formula,standardScale,glr])

grid = (ParamGridBuilder()
        .baseOn({lr.predictionCol:"prediction"})
        .baseOn({lr.rawPredictionCol:"rawPrediction"})
        .baseOn({lr.probabilityCol:"probability"})
        .baseOn({lr.labelCol:"label"})
        .baseOn({lr.featuresCol:"features"})
        .addGrid(param=lr.elasticNetParam,values=[0.1,1.0])
        .addGrid(param=lr.getMaxIter,values=[10])
        .build()
       )

evaluate = BinaryClassificationEvaluator()

trainEvalModel = TrainValidationSplit(estimator=pipeline,estimatorParamMaps=grid,evaluator=evaluate,trainRatio=0.8)

In [48]:
cols = [i for i in logDf.columns if i not in excludedCols]+["label"]

model = pipeline.fit(imputedDf.select(*cols).filter(F.col("label") <= 1))

In [49]:
predict = model.transform(imputedDf.select(*cols).filter(F.col("label") <= 1))


In [50]:
p = model.stages[-1].summary

print("Coefficient Standard Errors: " + str(p.coefficientStandardErrors))
print("T Values: " + str(p.tValues))
print("P Values: " + str(p.pValues))
print("Dispersion: " + str(p.dispersion))
print("Null Deviance: " + str(p.nullDeviance))
print("Residual Degree Of Freedom Null: " + str(p.residualDegreeOfFreedomNull))
print("Deviance: " + str(p.deviance))
print("Residual Degree Of Freedom: " + str(p.residualDegreeOfFreedom))
print("AIC: " + str(p.aic))
print("Deviance Residuals: ")
p.residuals().show()

Coefficient Standard Errors: [0.0027153657861677017, 0.0033576183070386195, 0.0036618804887292426, 0.0035754531498949087, 0.004800442480010473, 0.0053070346268332775, 0.006044648209200047, 0.006661858012195807, 0.007406240336102599, 0.007567900680939748, 0.008093150133059149, 0.008512938676244109, 0.008348498511501063, 0.009076922598213258, 0.009345658737006969, 3.570481466497785e-06, 0.0077130328315039385, 0.01130608609950618, 2.2163928182556967e-10, 4.2738460918044414e-08, 1.7226360583964896e-08, 1.5410312395327676e-09, 1.1484656996653029e-07, 1.2769605473113265e-07, 3.5900730800140656e-07, 0.012775368232153424, 0.013846855355999527]
T Values: [-11.13953124016816, 19.273340427415945, -6.0592684049208865, 0.9099659049882795, -2.4512213897006436, -1.3093167681591893, -1.8914994368699147, -2.1100010055623724, -3.195706289762549, 0.34554268948276684, 0.7313830924481478, -1.3424645212472244, 2.7347575160967175, -2.395285013922758, -3.397615083227601, 19.49128313879151, 44.36488143421366, 

In [46]:
summary = {"Labels":cols,"coefficient Std Err":p.coefficientStandardErrors,"T Values":p.tValues,"P Values":p.pValues}

SyntaxError: invalid syntax (<ipython-input-46-f97013934ad1>, line 1)

In [None]:
pd.DataFrame(summary,columns=["Labels","coefficient Std Err","T Values","P Values"])



In [4]:
#check mean and stddev
descriptionCVR.filter((F.col("summary") =="mean") | (F.col("summary") =="stddev")).show()

+-------+-------------------+------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-----------------+------------------+------------------+-------------------+-----------------+------------------+-----------------+-------------------+------------------+------------------+------------------+-------------------+
|summary|          cvrNummer|             label|       AarsVaerk_1|       AarsVaerk_2|      AarsVaerk_3|      AarsVaerk_4|       AarsVaerk_5|       AarsVaerk_6|      AarsVaerk_7| 

In [5]:
windowSpecRank =(Window.partitionBy(F.col("cvrNummer"))).orderBy(F.col("gyldigFra").desc())

groupCols = ["cvrNummer","vaerdi"]

companyNameDf = (sqlContext
                 .read
                 .parquet("/home/svanhmic/workspace/Python/Erhvervs/data/cdata/"+"companyCvrData")
                 .withColumn(colName="rank",col=F.rank().over(windowSpecRank))
                 .filter((F.col("rank")==1) & (F.col("sekvensnr")==0))
                 .select([F.col(i) for i in groupCols])
                 .withColumnRenamed(existing="vaerdi",new="navn")
                 .orderBy(F.col("cvrNummer"))
                 .cache()
                )
companyNameDf.show(2)

+---------+--------------------+
|cvrNummer|                navn|
+---------+--------------------+
| 10000009|              YELLOW|
| 10000025|WATERFRONT CONNEC...|
+---------+--------------------+
only showing top 2 rows



In [6]:
#take ln(x+1) of features

labelCols = ["cvrNummer","label","status"]
logFeatCols = [i for i in renamedDf.columns if i not in labelCols]
#print(logFeatCols)
mininum = descriptionCVR.filter(F.col("summary")=="min").collect()[0]
#print(mininum)


logDf = (renamedDf
         .select([F.col("cvrNummer"),F.col("label")]+[F.log1p(F.col(i)-F.lit(mininum[i])).alias(i) for i in logFeatCols])
         .na
         .fill(0.0,logFeatCols)
         
        )
#logDf.show(2)


#First convert features to vetor
toDenseUDf = F.udf(lambda x: Vectors.dense(x.toArray()),VectorUDT())
vectorizer = VectorAssembler(inputCols=logFeatCols,outputCol="features")

rawVectorDataDf = (vectorizer.transform(renamedDf                                                                               
                                        .join(companyNameDf,(companyNameDf["cvrNummer"]==renamedDf["cvrNummer"]),"inner")
                                        .drop(companyNameDf["cvrNummer"])
                                        #.select(*logColsSelected) 
                                        .na
                                        .fill(0.0,logFeatCols)
                                        .distinct()
                                       )
                   .select(["navn"]+labelCols+[toDenseUDf(vectorizer.getOutputCol()).alias(vectorizer.getOutputCol())])
                  )

standardScale = StandardScaler(withMean=True,withStd=True,inputCol=vectorizer.getOutputCol(),outputCol="scaledFeatures")
standardScaleModel = standardScale.fit(rawVectorDataDf)
scaledFeaturesDf = (standardScaleModel
                    .transform(rawVectorDataDf)
                    .drop("features")
                    .withColumnRenamed(existing="scaledFeatures",new="features")
                   )

scaledFeaturesDf.show()

+--------------------+---------+-----+--------------------+--------------------+
|                navn|cvrNummer|label|              status|            features|
+--------------------+---------+-----+--------------------+--------------------+
|   EMPAKA KARTONNAGE| 10016533|    1|            [NORMAL]|[0.12721405809989...|
|      DET GAMLE GODS| 10016606|    1|            [NORMAL]|[-0.1483813281958...|
|               DIXEN| 10018064|    0|[OPLØST EFTER KON...|[-0.1483813281958...|
|        TRELBORG VVS| 10063760|    1|            [NORMAL]|[-0.1483813281958...|
|            CYBERSUN| 10065917|    0|[OPLØST EFTER KON...|[-0.0258944898421...|
|                HME2| 10080207|    1|            [NORMAL]|[-0.1483813281958...|
|TØMRERFIRMAET HER...| 10082528|    1|            [NORMAL]|[-0.1177596186074...|
|    MØRKHOLT VINDUER| 10096227|    1|            [NORMAL]|[-0.1483813281958...|
|                KISØ| 10108993|    1|            [NORMAL]|[-0.1177596186074...|
|               C-CUT| 10117

In [14]:
#put them into a feature vecto
vectorizedTestDf = scaledFeaturesDf.filter(F.col("label") <= 1).sampleBy("label", fractions={0: 0.2, 1: 0.2}, seed=42)
vectorizedTestDf.groupBy("label").count().show()

scaledCvrDf = scaledFeaturesDf.select(F.col("cvrNummer"))
cvrTestDf = vectorizedTestDf.select("cvrNummer")
cvrTrainDf = scaledCvrDf.subtract(cvrTestDf) #take the other partion as training set

vectorizedTrainDf = (scaledFeaturesDf
                     .filter(F.col("label") <= 1)
                     .join(cvrTrainDf,(scaledFeaturesDf["cvrNummer"] == cvrTrainDf["cvrNummer"]),"inner")
                     .drop(cvrTrainDf["cvrNummer"])
                    )
vectorizedTrainDf.groupBy("label").count().show()
print("Number of data points: "+str(scaledFeaturesDf.count()))
print("Number of data points train: "+str(vectorizedTrainDf.select("cvrNummer").count()))
print("Number of data points test: "+str(vectorizedTestDf.select("cvrNummer").count()))
#vectorizedTrainDf.printSchema()
#print(vectorizedTrainDf.first())

+-----+-----+
|label|count|
+-----+-----+
|    0| 6078|
|    1|25756|
+-----+-----+

+-----+------+
|label| count|
+-----+------+
|    0| 24465|
|    1|103472|
+-----+------+

Number of data points: 160648
Number of data points train: 127937
Number of data points test: 31834


In [15]:
vectorizedTrainDf.show()

+--------------------+-----+--------------------+--------------------+---------+
|                navn|label|              status|            features|cvrNummer|
+--------------------+-----+--------------------+--------------------+---------+
|SKYTTENS HANDEL O...|    1|[OPLØST EFTER FUS...|[-0.1483813281958...| 10019052|
|            DIKI.NET|    0|[OPLØST EFTER KON...|[-0.1483813281958...| 10026113|
|                CTEK|    1|[OPLØST EFTER FUS...|[-0.1177596186074...| 10040523|
|      VG ENTREPRENØR|    1|            [NORMAL]|[0.43343115398404...| 10057426|
|NORDBYENS OLIEFYR...|    1|            [NORMAL]|[-0.1483813281958...| 10089514|
|EXPRESS LABELLING...|    1|[TVANGSOPLØST, UN...|[-0.1483813281958...| 10091713|
|PSYKOLOGERNE VED ...|    1|[OPLØST EFTER FUS...|[-0.1790030377842...| 10108624|
|         RAH HOLDING|    1|            [NORMAL]|[-0.1483813281958...| 10127351|
|AMAGER BROLÆGGERF...|    0|[OPLØST EFTER KON...|[-0.1177596186074...| 10128587|
|        ART OF JEWEL|    1|

In [16]:
#Train the logistic regressionmodel
lr = LogisticRegression()
grid = (ParamGridBuilder()
        .baseOn({lr.predictionCol:"prediction"})
        .baseOn({lr.rawPredictionCol:"rawPrediction"})
        .baseOn({lr.probabilityCol:"probability"})
        .baseOn({lr.labelCol:"label"})
        .baseOn({lr.featuresCol:"features"})
        .addGrid(param=lr.elasticNetParam,values=[0.1,1.0])
        .addGrid(param=lr.getMaxIter,values=[10])
        .build()
       )
evaluate = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
crossVal = CrossValidator(estimator=lr,estimatorParamMaps=grid,evaluator=evaluate,numFolds=10)

crossValModel = crossVal.fit(dataset=vectorizedTrainDf)
evaluate.evaluate(crossValModel.transform(vectorizedTestDf))
#coef = lrModel.coefficients

0.8090805096973791

In [17]:
bestModel = crossValModel.bestModel

In [18]:
#test the values
result = bestModel.transform(vectorizedTestDf)

In [12]:
#

In [19]:
#result.orderBy("prediction").show(100)
confCols = [F.col(i) for i in ["TP","TN","FP","FN"]]


csCols = [F.when((F.col("label")==1) & (F.col("difference") == 0),"TP")
          ,F.when((F.col("label")==0) & (F.col("difference") == 0),"TN")
          ,F.when(F.col("difference") == 1,"FN")
          ,F.when(F.col("difference") == -1,"FP")
         ]

confusionDf = result.select(F.col("label"),F.col("prediction"),(F.col("label")-F.col("prediction")).alias("difference"))
(confusionDf
 .select(F.coalesce(*csCols).alias("cases")  
         #,.otherwise(0).alias("FP")
         #,.otherwise(0).alias("FN")
        )
 .groupBy("cases").count()
).show()

 

+-----+-----+
|cases|count|
+-----+-----+
|   TP|25710|
|   TN|   36|
|   FN|   46|
|   FP| 6042|
+-----+-----+



In [17]:
crossValModel.bestModel.hasSummary

True

In [20]:
summary = crossValModel.bestModel.summary

In [24]:
summary.predictions.show()

+--------------------+-----+--------------------+--------------------+---------+--------------------+--------------------+----------+
|                navn|label|              status|            features|cvrNummer|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+---------+--------------------+--------------------+----------+
|SKYTTENS HANDEL O...|    1|[OPLØST EFTER FUS...|[-0.1483813281958...| 10019052|[-0.1436615979228...|[0.46414624375980...|       1.0|
|            DIKI.NET|    0|[OPLØST EFTER KON...|[-0.1483813281958...| 10026113|[-0.2470744652013...|[0.43854370342953...|       1.0|
|                CTEK|    1|[OPLØST EFTER FUS...|[-0.1177596186074...| 10040523|[-0.6803097893868...|[0.33619216432178...|       1.0|
|      VG ENTREPRENØR|    1|            [NORMAL]|[0.43343115398404...| 10057426|[-3.7227933151943...|[0.02359613625108...|       1.0|
|NORDBYENS OLIEFYR...|    1|            [NORMAL]|[-0.148381328