In [41]:
SQLContext.newSession(sqlContext)
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler,StandardScaler,RFormula
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from pyspark.ml.linalg import VectorUDT,Vectors
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import Window
from pyspark.ml import Pipeline
from pyspark.ml.regression import GeneralizedLinearRegression


import re
from tabulate import tabulate
import random
import sys
import numpy as np

In [88]:
#import data and rename bad name rank into vaerdiSlope
df = sqlContext.read.parquet("/home/svanhmic/workspace/Python/Erhvervs/data/cdata/featureDataCvr")
df.select(["cvrNummer"])
rankCols = [re.sub(pattern="rank_",repl="vaerdiSlope_",string=i) for i in df.columns ]
renamedDf = (df.withColumn(colName="reklamebeskyttet",col=F.col("reklamebeskyttet").cast("integer"))
             .select([F.col(val).alias(rankCols[idx]) for idx,val in enumerate(df.columns)])
             .withColumn(col=F.col("totalAabneEnheder").cast("double"),colName="totalAabneEnheder")
             .withColumn(col=F.col("totalLukketEnheder").cast("double"),colName="totalLukketEnheder")
             .withColumn(col=F.col("reklamebeskyttet").cast("double"),colName="reklamebeskyttet")
             .withColumn(col=F.col("label").cast("double"),colName="label")
             
             )
renamedDf.show()

+---------+--------+-----+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+-----------+-----------------+------------------+-------------------+-------------------+-------------+-------------+-------------+-------------+-------------+----------------+
|cvrNummer|  status|label|AarsVaerk_1|AarsVaerk_2|AarsVaerk_3|AarsVaerk_4|AarsVaerk_5|AarsVaerk_6|AarsVaerk_7|AarsVaerk_8|AarsVaerk_9|AarsVaerk_10|AarsVaerk_11|AarsVaerk_12|AarsVaerk_13|AarsVaerk_14|AarsVaerk_15|medArb_1|medArb_2|medArb_3|medArb_4|medArb_5|medArb_6|medArb_7|medArb_8|medArb_9|medArb_10|medArb_11|medArb_12|medArb_13|medArb_14|medArb_15|avgVarighed|totalAabneEnheder|totalLukketEnheder|      vaerdiSlope_1|      vaerdiSlope_2|vaerdiSlope_3|vaerdiSlope_4|

In [99]:
strs = ""
excludedCols = ["medArb_"+str(i) for i in range(1,16)]+["cvrNummer","label","status"]
for i in renamedDf.columns:
    if i not in excludedCols:
        strs += i+" + "

#excludedCols    
imputedDf = renamedDf.fillna(value=0.0)
formula = RFormula(formula="label ~ "+strs[:-3],labelCol="label")

glr = GeneralizedLinearRegression(family="binomial", link="logit", maxIter=10, regParam=0.3)
print(glr.
lr = LogisticRegression()

pipeline = Pipeline(stages=[formula,glr])

grid = (ParamGridBuilder()
        .baseOn({lr.predictionCol:"prediction"})
        .baseOn({lr.rawPredictionCol:"rawPrediction"})
        .baseOn({lr.probabilityCol:"probability"})
        .baseOn({lr.labelCol:"label"})
        .baseOn({lr.featuresCol:"features"})
        .addGrid(param=lr.elasticNetParam,values=[0.1,1.0])
        .addGrid(param=lr.getMaxIter,values=[10])
        .build()
       )

evaluate = BinaryClassificationEvaluator()

trainEvalModel = TrainValidationSplit(estimator=pipeline,estimatorParamMaps=grid,evaluator=evaluate,trainRatio=0.8)

prediction


In [96]:
cols = [i for i in renamedDf.columns if i not in excludedCols]+["label"]

model = trainEvalModel.fit(imputedDf.select(*cols).filter(F.col("label") <= 1))

IllegalArgumentException: 'Field "rawPrediction" does not exist.'

In [91]:
predict = model..transform(imputedDf.select(*cols).filter(F.col("label") <= 1))
imputedDf.select(*cols).filter(F.col("label") <= 1).printSchema()

root
 |-- AarsVaerk_1: double (nullable = false)
 |-- AarsVaerk_2: double (nullable = false)
 |-- AarsVaerk_3: double (nullable = false)
 |-- AarsVaerk_4: double (nullable = false)
 |-- AarsVaerk_5: double (nullable = false)
 |-- AarsVaerk_6: double (nullable = false)
 |-- AarsVaerk_7: double (nullable = false)
 |-- AarsVaerk_8: double (nullable = false)
 |-- AarsVaerk_9: double (nullable = false)
 |-- AarsVaerk_10: double (nullable = false)
 |-- AarsVaerk_11: double (nullable = false)
 |-- AarsVaerk_12: double (nullable = false)
 |-- AarsVaerk_13: double (nullable = false)
 |-- AarsVaerk_14: double (nullable = false)
 |-- AarsVaerk_15: double (nullable = false)
 |-- avgVarighed: double (nullable = false)
 |-- totalAabneEnheder: double (nullable = false)
 |-- totalLukketEnheder: double (nullable = false)
 |-- vaerdiSlope_1: double (nullable = false)
 |-- vaerdiSlope_2: double (nullable = false)
 |-- vaerdiSlope_3: double (nullable = false)
 |-- vaerdiSlope_4: double (nullable = false)


In [95]:
p = model.stages[1].summary

print("Coefficient Standard Errors: " + str(p.coefficientStandardErrors))
print("T Values: " + str(p.tValues))
print("P Values: " + str(p.pValues))
print("Dispersion: " + str(p.dispersion))
print("Null Deviance: " + str(p.nullDeviance))
print("Residual Degree Of Freedom Null: " + str(p.residualDegreeOfFreedomNull))
print("Deviance: " + str(p.deviance))
print("Residual Degree Of Freedom: " + str(p.residualDegreeOfFreedom))
print("AIC: " + str(p.aic))
print("Deviance Residuals: ")
p.residuals().show()

Coefficient Standard Errors: [0.00060663308727155, 0.0007519666947785978, 0.000711341189202029, 0.0006439644831685794, 0.0008270035385413254, 0.0008046613897137305, 0.0010321426755108202, 0.0009805126061947754, 0.0011715362209909488, 0.0014315023877709083, 0.0016039269138056539, 0.0014605483890031716, 0.0014580786819210197, 0.0017614815929307924, 0.0014442858090305548, 1.4776939364887241e-06, 0.0024527979391702794, 0.005641754851351828, 2.1602125780588623e-08, 4.482988627440235e-08, 1.941301702726079e-07, 2.0998993359408759e-07, 6.902319351199482e-07, 1.4340377734315118e-05, 7.894013037571729e-06, 0.006566878379619086, 0.0063388762292734815]
T Values: [-283.03449896138227, 334.1714838200412, -109.68587851294157, -139.94269501038147, -179.80231389454045, 31.057119087710777, 48.42007280143863, 73.35819928413993, -101.56605125515793, -76.19650268271621, -83.18425599472717, 169.47191611885378, 109.72038472354501, -449.21664647282574, 377.14061210108883, 12.46005415872235, 68.8799591490644,

In [3]:
#check null values.
descriptionCVR = renamedDf.describe()
descriptionCVR.select("summary").show()
descriptionCVR.filter(F.col("summary")=="count").show()




+-------+
|summary|
+-------+
|  count|
|   mean|
| stddev|
|    min|
|    max|
+-------+

+-------+---------+------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+-----------+-----------------+------------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+----------------+
|summary|cvrNummer| label|AarsVaerk_1|AarsVaerk_2|AarsVaerk_3|AarsVaerk_4|AarsVaerk_5|AarsVaerk_6|AarsVaerk_7|AarsVaerk_8|AarsVaerk_9|AarsVaerk_10|AarsVaerk_11|AarsVaerk_12|AarsVaerk_13|AarsVaerk_14|AarsVaerk_15|medArb_1|medArb_2|medArb_3|medArb_4|medArb_5|medArb_6|medArb_7|medArb_8|medArb_9|medArb_10|medArb_11|medArb_12|medArb_13|medArb_14|medArb_15|avgVarighed|totalAabneEnheder|totalLuk

In [4]:
#check mean and stddev
descriptionCVR.filter((F.col("summary") =="mean") | (F.col("summary") =="stddev")).show()

+-------+-------------------+------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-----------------+------------------+------------------+-------------------+-----------------+------------------+-----------------+-------------------+------------------+------------------+------------------+-------------------+
|summary|          cvrNummer|             label|       AarsVaerk_1|       AarsVaerk_2|      AarsVaerk_3|      AarsVaerk_4|       AarsVaerk_5|       AarsVaerk_6|      AarsVaerk_7| 

In [5]:
windowSpecRank =(Window.partitionBy(F.col("cvrNummer"))).orderBy(F.col("gyldigFra").desc())

groupCols = ["cvrNummer","vaerdi"]

companyNameDf = (sqlContext
                 .read
                 .parquet("/home/svanhmic/workspace/Python/Erhvervs/data/cdata/"+"companyCvrData")
                 .withColumn(colName="rank",col=F.rank().over(windowSpecRank))
                 .filter((F.col("rank")==1) & (F.col("sekvensnr")==0))
                 .select([F.col(i) for i in groupCols])
                 .withColumnRenamed(existing="vaerdi",new="navn")
                 .orderBy(F.col("cvrNummer"))
                 .cache()
                )
companyNameDf.show(2)

+---------+--------------------+
|cvrNummer|                navn|
+---------+--------------------+
| 10000009|              YELLOW|
| 10000025|WATERFRONT CONNEC...|
+---------+--------------------+
only showing top 2 rows



In [6]:
#take ln(x+1) of features

labelCols = ["cvrNummer","label","status"]
logFeatCols = [i for i in renamedDf.columns if i not in labelCols]
#print(logFeatCols)
mininum = descriptionCVR.filter(F.col("summary")=="min").collect()[0]
#print(mininum)


logDf = (renamedDf
         .select([F.col("cvrNummer"),F.col("label")]+[F.log1p(F.col(i)-F.lit(mininum[i])).alias(i) for i in logFeatCols])
         .na
         .fill(0.0,logFeatCols)
         
        )
#logDf.show(2)


#First convert features to vetor
toDenseUDf = F.udf(lambda x: Vectors.dense(x.toArray()),VectorUDT())
vectorizer = VectorAssembler(inputCols=logFeatCols,outputCol="features")

rawVectorDataDf = (vectorizer.transform(renamedDf                                                                               
                                        .join(companyNameDf,(companyNameDf["cvrNummer"]==renamedDf["cvrNummer"]),"inner")
                                        .drop(companyNameDf["cvrNummer"])
                                        #.select(*logColsSelected) 
                                        .na
                                        .fill(0.0,logFeatCols)
                                        .distinct()
                                       )
                   .select(["navn"]+labelCols+[toDenseUDf(vectorizer.getOutputCol()).alias(vectorizer.getOutputCol())])
                  )

standardScale = StandardScaler(withMean=True,withStd=True,inputCol=vectorizer.getOutputCol(),outputCol="scaledFeatures")
standardScaleModel = standardScale.fit(rawVectorDataDf)
scaledFeaturesDf = (standardScaleModel
                    .transform(rawVectorDataDf)
                    .drop("features")
                    .withColumnRenamed(existing="scaledFeatures",new="features")
                   )

scaledFeaturesDf.show()

+--------------------+---------+-----+--------------------+--------------------+
|                navn|cvrNummer|label|              status|            features|
+--------------------+---------+-----+--------------------+--------------------+
|   EMPAKA KARTONNAGE| 10016533|    1|            [NORMAL]|[0.12721405809989...|
|      DET GAMLE GODS| 10016606|    1|            [NORMAL]|[-0.1483813281958...|
|               DIXEN| 10018064|    0|[OPLØST EFTER KON...|[-0.1483813281958...|
|        TRELBORG VVS| 10063760|    1|            [NORMAL]|[-0.1483813281958...|
|            CYBERSUN| 10065917|    0|[OPLØST EFTER KON...|[-0.0258944898421...|
|                HME2| 10080207|    1|            [NORMAL]|[-0.1483813281958...|
|TØMRERFIRMAET HER...| 10082528|    1|            [NORMAL]|[-0.1177596186074...|
|    MØRKHOLT VINDUER| 10096227|    1|            [NORMAL]|[-0.1483813281958...|
|                KISØ| 10108993|    1|            [NORMAL]|[-0.1177596186074...|
|               C-CUT| 10117

In [14]:
#put them into a feature vecto
vectorizedTestDf = scaledFeaturesDf.filter(F.col("label") <= 1).sampleBy("label", fractions={0: 0.2, 1: 0.2}, seed=42)
vectorizedTestDf.groupBy("label").count().show()

scaledCvrDf = scaledFeaturesDf.select(F.col("cvrNummer"))
cvrTestDf = vectorizedTestDf.select("cvrNummer")
cvrTrainDf = scaledCvrDf.subtract(cvrTestDf) #take the other partion as training set

vectorizedTrainDf = (scaledFeaturesDf
                     .filter(F.col("label") <= 1)
                     .join(cvrTrainDf,(scaledFeaturesDf["cvrNummer"] == cvrTrainDf["cvrNummer"]),"inner")
                     .drop(cvrTrainDf["cvrNummer"])
                    )
vectorizedTrainDf.groupBy("label").count().show()
print("Number of data points: "+str(scaledFeaturesDf.count()))
print("Number of data points train: "+str(vectorizedTrainDf.select("cvrNummer").count()))
print("Number of data points test: "+str(vectorizedTestDf.select("cvrNummer").count()))
#vectorizedTrainDf.printSchema()
#print(vectorizedTrainDf.first())

+-----+-----+
|label|count|
+-----+-----+
|    0| 6078|
|    1|25756|
+-----+-----+

+-----+------+
|label| count|
+-----+------+
|    0| 24465|
|    1|103472|
+-----+------+

Number of data points: 160648
Number of data points train: 127937
Number of data points test: 31834


In [15]:
vectorizedTrainDf.show()

+--------------------+-----+--------------------+--------------------+---------+
|                navn|label|              status|            features|cvrNummer|
+--------------------+-----+--------------------+--------------------+---------+
|SKYTTENS HANDEL O...|    1|[OPLØST EFTER FUS...|[-0.1483813281958...| 10019052|
|            DIKI.NET|    0|[OPLØST EFTER KON...|[-0.1483813281958...| 10026113|
|                CTEK|    1|[OPLØST EFTER FUS...|[-0.1177596186074...| 10040523|
|      VG ENTREPRENØR|    1|            [NORMAL]|[0.43343115398404...| 10057426|
|NORDBYENS OLIEFYR...|    1|            [NORMAL]|[-0.1483813281958...| 10089514|
|EXPRESS LABELLING...|    1|[TVANGSOPLØST, UN...|[-0.1483813281958...| 10091713|
|PSYKOLOGERNE VED ...|    1|[OPLØST EFTER FUS...|[-0.1790030377842...| 10108624|
|         RAH HOLDING|    1|            [NORMAL]|[-0.1483813281958...| 10127351|
|AMAGER BROLÆGGERF...|    0|[OPLØST EFTER KON...|[-0.1177596186074...| 10128587|
|        ART OF JEWEL|    1|

In [16]:
#Train the logistic regressionmodel
lr = LogisticRegression()
grid = (ParamGridBuilder()
        .baseOn({lr.predictionCol:"prediction"})
        .baseOn({lr.rawPredictionCol:"rawPrediction"})
        .baseOn({lr.probabilityCol:"probability"})
        .baseOn({lr.labelCol:"label"})
        .baseOn({lr.featuresCol:"features"})
        .addGrid(param=lr.elasticNetParam,values=[0.1,1.0])
        .addGrid(param=lr.getMaxIter,values=[10])
        .build()
       )
evaluate = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
crossVal = CrossValidator(estimator=lr,estimatorParamMaps=grid,evaluator=evaluate,numFolds=10)

crossValModel = crossVal.fit(dataset=vectorizedTrainDf)
evaluate.evaluate(crossValModel.transform(vectorizedTestDf))
#coef = lrModel.coefficients

0.8090805096973791

In [17]:
bestModel = crossValModel.bestModel

In [18]:
#test the values
result = bestModel.transform(vectorizedTestDf)

In [12]:
#

In [19]:
#result.orderBy("prediction").show(100)
confCols = [F.col(i) for i in ["TP","TN","FP","FN"]]


csCols = [F.when((F.col("label")==1) & (F.col("difference") == 0),"TP")
          ,F.when((F.col("label")==0) & (F.col("difference") == 0),"TN")
          ,F.when(F.col("difference") == 1,"FN")
          ,F.when(F.col("difference") == -1,"FP")
         ]

confusionDf = result.select(F.col("label"),F.col("prediction"),(F.col("label")-F.col("prediction")).alias("difference"))
(confusionDf
 .select(F.coalesce(*csCols).alias("cases")  
         #,.otherwise(0).alias("FP")
         #,.otherwise(0).alias("FN")
        )
 .groupBy("cases").count()
).show()

 

+-----+-----+
|cases|count|
+-----+-----+
|   TP|25710|
|   TN|   36|
|   FN|   46|
|   FP| 6042|
+-----+-----+



In [17]:
crossValModel.bestModel.hasSummary

True

In [20]:
summary = crossValModel.bestModel.summary

In [24]:
summary.predictions.show()

+--------------------+-----+--------------------+--------------------+---------+--------------------+--------------------+----------+
|                navn|label|              status|            features|cvrNummer|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+---------+--------------------+--------------------+----------+
|SKYTTENS HANDEL O...|    1|[OPLØST EFTER FUS...|[-0.1483813281958...| 10019052|[-0.1436615979228...|[0.46414624375980...|       1.0|
|            DIKI.NET|    0|[OPLØST EFTER KON...|[-0.1483813281958...| 10026113|[-0.2470744652013...|[0.43854370342953...|       1.0|
|                CTEK|    1|[OPLØST EFTER FUS...|[-0.1177596186074...| 10040523|[-0.6803097893868...|[0.33619216432178...|       1.0|
|      VG ENTREPRENØR|    1|            [NORMAL]|[0.43343115398404...| 10057426|[-3.7227933151943...|[0.02359613625108...|       1.0|
|NORDBYENS OLIEFYR...|    1|            [NORMAL]|[-0.148381328