In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql import functions as F
from functools import reduce
from pyspark.sql.window import Window

#Python imports
import sys

In [2]:
#spark = SparkSession(sc).builder.master("local[*]").appName("TestingCvr").getOrCreate()
conf = sc.getConf()
conf.setAppName("TestingCvr")
print(conf.getAll())
sqlContext = SQLContext(sc)

[('hive.metastore.warehouse.dir', 'file:/home/svanhmic/workspace/Python/Erhvervs/src/notebooks/cvr/spark-warehouse'), ('spark.driver.port', '36893'), ('spark.sql.catalogImplementation', 'hive'), ('spark.rdd.compress', 'True'), ('spark.app.id', 'local-1483625885233'), ('spark.app.name', 'TestingCvr'), ('spark.serializer.objectStreamReset', '100'), ('spark.master', 'local[*]'), ('spark.executor.id', 'driver'), ('spark.submit.deployMode', 'client'), ('spark.driver.host', '192.168.43.245'), ('spark.driver.memory', '6G')]


In [3]:
if __name__ == '__main__':
    cvrData = "/home/svanhmic/workspace/Python/Erhvervs/data/cdata/virksomhedersMetadata.json/"
    allCVRData = "/home/svanhmic/workspace/Python/Erhvervs/data/cdata/AlleDatavirksomheder.json/"
    cvrDf = sqlContext.read.json(allCVRData)
    #

In [4]:
cvrDf.printSchema()

root
 |-- virksomhed: struct (nullable = true)
 |    |-- aarsbeskaeftigelse: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- aar: long (nullable = true)
 |    |    |    |-- intervalKodeAntalAarsvaerk: string (nullable = true)
 |    |    |    |-- intervalKodeAntalAnsatte: string (nullable = true)
 |    |    |    |-- intervalKodeAntalInklusivEjere: string (nullable = true)
 |    |    |    |-- sidstOpdateret: string (nullable = true)
 |    |-- attributter: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- sekvensnr: long (nullable = true)
 |    |    |    |-- type: string (nullable = true)
 |    |    |    |-- vaerdier: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- periode: struct (nullable = true)
 |    |    |    |    |    |    |-- gyldigFra: string (nullable = true)
 |    |    |    |    |    |    |-- gyldigTil: string (nulla

In [5]:
def getNextSchemaLayer(schema,idx,name="name"):
    schemaDict = schema[idx].jsonValue()
    return list([i[name] for i in schemaDict["type"]["fields"][:]])

In [6]:
def createNextLayerTable(df,nonExplodedColumns,explodedColumn,*nonExplodedPrefix):
    '''
        The method takes a dataframe and explodes a column of choice such that the contence is more accessible
        
        input 
            - df: data frame with information
            - nonExplodedColumns: List with columns that are "static" the columns are not altered e.g. cvr 
            - explodedColumn: String with the name of the column that gets exploded, note that column must be of an arraytype
            - nonExplodedPrefix: String if values are nested inside a 
        output
            - output: data frame where nonExplodedColumns are unchanged, except for that the values are dupplicated with
            for the explodedColum
    '''
    if len(nonExplodedPrefix) == 1:
        prefixedStr = nonExplodedPrefix[0]+"."
    else:
        prefixedStr = ".".join(nonExplodedPrefix)
    
    if nonExplodedPrefix is None:
        relationsDf = df.select([df[v].alias(v) for v in nonExplodedColumns]+
                            [F.explode(df[explodedColumn]).alias(explodedColumn)])
    else:
        relationsDf = df.select([df[prefixedStr+v].alias(v) for v in nonExplodedColumns]+
                            [F.explode(df[prefixedStr+explodedColumn]).alias(explodedColumn)])
    
    dfSchema = getNextSchemaLayer(relationsDf.schema,explodedColumn)
    return (relationsDf
            .select([relationsDf[u] for u in nonExplodedColumns]
                    +[relationsDf[explodedColumn][v].alias(v) for v in dfSchema]))

In [7]:
def expandSubCols(df,mainColumn):
    '''
    The method expands all subcolumns in the next layer of mainColumn
    
    input:
        df - data frame with data
        mainColumn - the column that contains the subcolumns that should be flattent
    '''
    dfCols = df.columns
    dfCols.remove(mainColumn)
    return df.select([df[v] for v in dfCols]+
                     [df[mainColumn][u].alias(u) for u in getNextSchemaLayer(schema=df.schema,idx=mainColumn)])

In [8]:
secondLayerSchema = getNextSchemaLayer(cvrDf.schema,"virksomhed")
#cvrDf.select(cvrDf["virksomhed.attributter.sekvensnr"]).show()
print(secondLayerSchema)

['aarsbeskaeftigelse', 'attributter', 'beliggenhedsadresse', 'bibranche1', 'bibranche2', 'bibranche3', 'binavne', 'brancheAnsvarskode', 'cvrNummer', 'dataAdgang', 'deltagerRelation', 'elektroniskPost', 'enhedsNummer', 'enhedstype', 'fejlBeskrivelse', 'fejlRegistreret', 'fejlVedIndlaesning', 'fusioner', 'hjemmeside', 'hovedbranche', 'kvartalsbeskaeftigelse', 'livsforloeb', 'maanedsbeskaeftigelse', 'naermesteFremtidigeDato', 'navne', 'obligatoriskEmail', 'penheder', 'postadresse', 'regNummer', 'reklamebeskyttet', 'samtId', 'sidstIndlaest', 'sidstOpdateret', 'spaltninger', 'status', 'telefaxNummer', 'telefonNummer', 'virkningsAktoer', 'virksomhedMetadata', 'virksomhedsform', 'virksomhedsstatus']


In [32]:
#Aarsvaerk as a table
vaekstDf = createNextLayerTable(cvrDf,["cvrNummer"],"aarsbeskaeftigelse","virksomhed")
#vaekstDf.show()

regexedVaekstDf = (vaekstDf.select(vaekstDf["cvrNummer"],vaekstDf["aar"]
                                      ,F.split(F.regexp_extract("intervalKodeAntalAarsvaerk",r'(\d{1,4}_\d{1,4})',0),r"\_").alias("intervalKodeAntalAarsvaerk")
                                      ,F.split(F.regexp_extract("intervalKodeAntalAnsatte",r'(\d{1,4}_\d{1,4})',0),r"\_").alias("intervalKodeAntalAnsatte")
                                      ,F.split(F.regexp_extract("intervalKodeAntalInklusivEjere",r'(\d{1,4}_\d{1,4})',0),r"\_").alias("intervalKodeAntalInklusivEjere")
                                      ,vaekstDf["sidstOpdateret"]))
regexedVaekstDf.orderBy(["cvrNummer","aar"]).show(50)

+---------+----+--------------------------+------------------------+------------------------------+--------------------+
|cvrNummer| aar|intervalKodeAntalAarsvaerk|intervalKodeAntalAnsatte|intervalKodeAntalInklusivEjere|      sidstOpdateret|
+---------+----+--------------------------+------------------------+------------------------------+--------------------+
| 10000009|1999|                    [1, 1]|                  [1, 1]|                        [1, 1]|2001-07-24T15:50:...|
| 10000025|2000|                    [1, 1]|                  [0, 0]|                        [0, 0]|2002-05-24T17:47:...|
| 10000025|2001|                    [1, 1]|                  [1, 1]|                        [1, 1]|2003-05-12T17:30:...|
| 10000025|2002|                    [1, 1]|                  [1, 1]|                        [1, 1]|2004-06-25T17:50:...|
| 10000025|2003|                    [1, 1]|                  [1, 1]|                        [1, 1]|2005-12-02T17:06:...|
| 10000025|2004|                

In [30]:
#Kvartalsverk  as a tabel
maanedsVerkDf = createNextLayerTable(cvrDf,["cvrNummer"],"kvartalsbeskaeftigelse","virksomhed")
#maanedsVerkDf.show()

reMaanedsVerkDf = (maanedsVerkDf
                   .select(maanedsVerkDf["cvrNummer"],maanedsVerkDf["aar"],maanedsVerkDf["kvartal"]
                           ,F.split(F.regexp_extract("intervalKodeAntalAarsvaerk",r'(\d{1,4}_\d{1,4})',0),r"\_").alias("intervalKodeAntalAarsvaerk")
                           ,F.split(F.regexp_extract("intervalKodeAntalAnsatte",r'(\d{1,4}_\d{1,4})',0),r"\_").alias("intervalKodeAntalAnsatte")                           
                           ,maanedsVerkDf["sidstOpdateret"]))
reMaanedsVerkDf.show()
print(reMaanedsVerkDf.dtypes)

+---------+----+-------+--------------------------+------------------------+--------------------+
|cvrNummer| aar|kvartal|intervalKodeAntalAarsvaerk|intervalKodeAntalAnsatte|      sidstOpdateret|
+---------+----+-------+--------------------------+------------------------+--------------------+
| 27850367|2004|      3|                      null|                  [1, 1]|2005-04-19T17:38:...|
| 27850367|2004|      4|                      null|                  [1, 1]|2005-10-04T12:50:...|
| 27850367|2005|      1|                      null|                  [1, 1]|2005-10-04T17:32:...|
| 27850367|2005|      2|                      null|                  [1, 1]|2005-12-02T16:56:...|
| 27850367|2005|      3|                      null|                  [0, 0]|2006-04-11T17:32:...|
| 27850367|2005|      4|                      null|                  [2, 4]|2006-09-01T12:45:...|
| 27850367|2006|      1|                      null|                  [1, 1]|2007-01-11T16:33:...|
| 27850367|2006|    

In [10]:
#Get the attributes with values and periods as a table

attributesDf = createNextLayerTable(cvrDf,["cvrNummer"],"attributter","virksomhed")
#attributesDf.show(10)
#print(attributesDf.schema["cvrNummer"])
#get column names
attributesCols = attributesDf.columns
attributesCols.remove("vaerdier")

attributesWithValueDf = createNextLayerTable(attributesDf,attributesCols,"vaerdier")
#attributesWithValueDf.show(10)

attributesWithValueAndPeriodDf = expandSubCols(attributesWithValueDf,"periode")
attributesWithValueAndPeriodDf.orderBy(["cvrNummer","type"],ascending=[1, 1]).show(100)

+---------+---------+--------------------+----------+--------------------+--------------------+----------+----------+
|cvrNummer|sekvensnr|                type|vaerditype|      sidstOpdateret|              vaerdi| gyldigFra| gyldigTil|
+---------+---------+--------------------+----------+--------------------+--------------------+----------+----------+
| 10000009|        0|ARKIV_REGISTRERIN...|    string|2015-02-09T20:00:...|           ApS261654|1999-10-12|2001-12-11|
| 10000009|        0|              FORMÅL|    string|2015-02-09T20:00:...|Selskabets formål...|1999-10-12|2001-12-11|
| 10000009|        0|FØRSTE_REGNSKABSP...|      date|2015-02-09T20:00:...|          2000-09-30|1999-10-12|2001-12-11|
| 10000009|        0|FØRSTE_REGNSKABSP...|      date|2015-02-09T20:00:...|          1999-08-01|1999-10-12|2001-12-11|
| 10000009|        0|             KAPITAL|   decimal|2015-02-09T20:00:...|           125000.00|1999-10-12|2001-12-11|
| 10000009|        0|       KAPITALVALUTA|    string|201

In [11]:
showAttributes = attributesWithValueAndPeriodDf.groupBy(["type"]).count().orderBy("count").collect()
for v in showAttributes:
    print("type: {} count: {}".format(v[0],v[1]))

type: SØV_IEF_TILSYN_TEKST count: 2
type: SØV_IEF_TILSYN count: 13
type: REGNSKABSÅR_FRITEKST count: 55
type: MYNDIGHED_ANDEN count: 99
type: STIFTET_FØR_1900 count: 144
type: SOCIAL_ØKONOMISK_VIRKSOMHED count: 193
type: OFFENTLIG_EJERBOG count: 275
type: KONCESSIONSDATO count: 351
type: BØRSNOTERET count: 355
type: STADFÆSTELSESDATO count: 501
type: STADFÆSTET_AF count: 501
type: FINANSIEL_DELTYPE count: 755
type: TILSYN_KATEGORI count: 785
type: TILLADELSESDATO_FONDSMYNDIGHED count: 1963
type: FINANSIELT_FORMÅL count: 1976
type: REGISTRERING_LIKVIDATION count: 3264
type: OPLØSNINGSTRUSSEL_SENESTE count: 4265
type: OMFATTET_AF_LOV_OM_HVIDVASK_OG_TERRORFINANSIERING count: 8648
type: GENOPTAGELSE_TVANGSOPLØSNING count: 13496
type: STATSLIG_VIRK count: 39185
type: KAPITAL_DELVIST count: 39393
type: OMLÆGNINGSPERIODE_START count: 47807
type: OMLÆGNINGSPERIODE_SLUT count: 47810
type: KAPITALKLASSER count: 62855
type: EJERREGISTRERING_UNDER_5_PROCENT count: 110211
type: REVISION_FRAVALGT co

In [12]:
#udtrækning af kapital, gyldighedsdatoer og cvr
valuesFromAttributesDf = (attributesWithValueAndPeriodDf
                          .filter(F.col("type") == "KAPITAL")
                          .select("cvrNummer","vaerdi","gyldigFra",F.coalesce("gyldigTil",F.lit(F.current_date())).alias("gyldigTil"))
                          .withColumn(colName="varighed",col=F.datediff(end="gyldigTil",start="gyldigFra")))



In [13]:
print(valuesFromAttributesDf.select("cvrNummer").distinct().count())

499698


In [14]:
valuesFromAttributesDf.show(20)

+---------+----------+----------+----------+--------+
|cvrNummer|    vaerdi| gyldigFra| gyldigTil|varighed|
+---------+----------+----------+----------+--------+
| 26452635| 170011.00|2003-02-24|2010-01-21|    2523|
| 26452635| 140080.00|2002-01-08|2002-06-20|     163|
| 26452635| 145000.00|2002-06-21|2002-09-01|      72|
| 26452635| 155900.00|2002-09-02|2003-02-23|     174|
| 15157984|  80000.00|1991-05-08|1995-09-27|    1603|
| 15111038|1000000.00|1997-11-06|1998-06-30|     236|
| 15111038| 300000.00|1991-04-04|1991-10-08|     187|
| 15111038|7000000.00|1991-10-09|1997-11-05|    2219|
| 14731644| 300000.00|1990-11-20|1993-08-04|     988|
| 14728147| 500000.00|1992-06-30|1996-02-07|    1317|
| 14728147| 300000.00|1990-11-20|1992-06-29|     587|
| 15500387|  80000.00|1991-09-02|1995-01-10|    1226|
| 15501332|  80000.00|1991-09-02|1995-11-22|    1542|
| 15500972| 500000.00|1992-01-08|1995-02-22|    1141|
| 15500972| 300000.00|1991-09-02|1992-01-07|     127|
| 15276703| 300000.00|1991-0

In [15]:
#compute cummulative difference for each cvrnummer between vaerdi

windowSpec = (Window
              .partitionBy(valuesFromAttributesDf["cvrNummer"])
              .orderBy(valuesFromAttributesDf["gyldigFra"])
              )
windowSpec.rowsBetween(-1,0)
valuesAndCumDifDf = (valuesFromAttributesDf
                     .withColumn(colName="CumlativeDiff",col=F.col("vaerdi")-F.lag("vaerdi").over(windowSpec))).cache()


In [16]:
valuesAndCumDifDf.show(250)

+---------+------------+----------+----------+--------+-------------+
|cvrNummer|      vaerdi| gyldigFra| gyldigTil|varighed|CumlativeDiff|
+---------+------------+----------+----------+--------+-------------+
| 10017025|   500000.00|2000-02-01|2002-05-02|     821|         null|
| 10019052|   125000.00|2000-02-23|2002-05-27|     824|         null|
| 10026113|   125000.00|2000-06-23|2004-09-27|    1557|         null|
| 10027926|   600000.00|2000-06-08|2001-01-15|     221|         null|
| 10029325|100000000.00|2000-06-07|2000-12-17|     193|         null|
| 10029325|105000000.00|2000-12-18|2007-12-19|    2557|    5000000.0|
| 10039983|    80000.00|1987-08-30|1993-03-24|    2033|         null|
| 10040523|   125000.00|2000-07-01|2001-05-20|     323|         null|
| 10040523|   500000.00|2001-05-21|2005-10-14|    1607|     375000.0|
| 10040523|   500000.00|2005-10-15|2006-06-29|     257|          0.0|
| 10040523|  1000000.00|2006-06-30|2013-04-04|    2470|     500000.0|
| 10056535|    80000

In [35]:
#check who is væksting in year x
vaekst2015Df = (valuesAndCumDifDf
                .where(condition=F.year("gyldigFra") == 2015)
                .na.drop(subset="CumlativeDiff")
                .orderBy(["cvrNummer","CumlativeDiff"],ascending=[1,0]))
vaekst2015Df.show()

+---------+------------+----------+----------+--------+-------------+
|cvrNummer|      vaerdi| gyldigFra| gyldigTil|varighed|CumlativeDiff|
+---------+------------+----------+----------+--------+-------------+
| 10006252| 25000000.00|2015-05-22|2017-01-05|     594|        1.5E7|
| 10007127|626000000.00|2015-03-27|2016-02-23|     333|      -1.34E7|
| 10008328|  1100000.00|2015-05-20|2017-01-05|     596|     100000.0|
| 10011027|   301000.00|2015-11-30|2017-01-05|     402|       1000.0|
| 10011663|  5325048.00|2015-07-15|2017-01-05|     540|     179365.0|
| 10013267| 14882316.00|2015-02-02|2015-10-20|     260|    1868068.0|
| 10013267| 14903341.00|2015-10-21|2017-01-05|     442|      21025.0|
| 10019265| 66238265.00|2015-01-27|2017-01-05|     709|     700000.0|
| 10022223|  5010000.00|2015-08-26|2017-01-05|     498|      10000.0|
| 10035961|   700000.00|2015-09-19|2017-01-05|     474|    -300000.0|
| 10037468|   125000.00|2015-12-07|2017-01-05|     395|          0.0|
| 10042526|    93750

In [36]:
print(vaekst2015Df.select("cvrNummer").distinct().count().collect())

AttributeError: 'int' object has no attribute 'collect'

In [18]:
#creating deltager as a table
deltagerRelationDf = createNextLayerTable(cvrDf,["cvrNummer"],"deltagerRelation","virksomhed")
#deltagerRelationDf.printSchema()
deltagerRelationCol = deltagerRelationDf.columns
print(deltagerRelationCol)
deltagerRelationCol.remove("deltager")
#deltagerRelationDf.show()

deltagerDf = expandSubCols(deltagerRelationDf,"deltager")
deltagerDf.show(1)
#deltagerDf.printSchema()
deltagerDf = deltagerDf.drop("kontorsteder").drop("organisationer")
deltagerCols = deltagerDf.columns
deltagerCols.remove("navne")

deltagerNamesDf = createNextLayerTable(df=deltagerDf,nonExplodedColumns=deltagerCols,explodedColumn="navne")
deltagerNamesDf.show(10)

['cvrNummer', 'deltager', 'kontorsteder', 'organisationer']
+---------+------------+--------------------+--------------------+------------+----------+-----------------+--------------------+--------------------+
|cvrNummer|kontorsteder|      organisationer| beliggenhedsadresse|enhedsNummer|enhedstype|forretningsnoegle|               navne|       sidstIndlaest|
+---------+------------+--------------------+--------------------+------------+----------+-----------------+--------------------+--------------------+
| 27850367|          []|[[WrappedArray([0...|[[null,null,null,...|  4000218628|    PERSON|             null|[[Gitte Heegaard ...|2015-12-10T04:38:...|
+---------+------------+--------------------+--------------------+------------+----------+-----------------+--------------------+--------------------+
only showing top 1 row

+---------+--------------------+------------+----------+-----------------+--------------------+--------------------+--------------------+--------------------+
|c

In [19]:
deltagerNamesDf.printSchema()

root
 |-- cvrNummer: long (nullable = true)
 |-- beliggenhedsadresse: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- bogstavFra: string (nullable = true)
 |    |    |-- bogstavTil: string (nullable = true)
 |    |    |-- bynavn: string (nullable = true)
 |    |    |-- conavn: string (nullable = true)
 |    |    |-- etage: string (nullable = true)
 |    |    |-- fritekst: string (nullable = true)
 |    |    |-- husnummerFra: long (nullable = true)
 |    |    |-- husnummerTil: long (nullable = true)
 |    |    |-- kommune: struct (nullable = true)
 |    |    |    |-- kommuneKode: long (nullable = true)
 |    |    |    |-- kommuneNavn: string (nullable = true)
 |    |    |    |-- periode: struct (nullable = true)
 |    |    |    |    |-- gyldigFra: string (nullable = true)
 |    |    |    |    |-- gyldigTil: string (nullable = true)
 |    |    |    |-- sidstOpdateret: string (nullable = true)
 |    |    |-- landekode: string (nullable = true)
 |    

In [53]:
expandedHovedBrancheWithDateDf.columns

['cvrNummer',
 'branchekode',
 'branchetekst',
 'sidstOpdateret',
 'gyldigFra',
 'gyldigTil']

In [56]:
brancheList = ["cvrNummer","hovedbranche"]#"bibranche1","bibranche2","bibranche3"

companyBrancherDf = (cvrDf
                     .select([F.col("virksomhed."+f) for f in brancheList]))
#companyBrancherDf.show()
brancheList.remove("hovedbranche")
expandedHovedBrancheDf = createNextLayerTable(df=companyBrancherDf,nonExplodedColumns=brancheList,explodedColumn="hovedbranche")

expandedHovedBrancheWithDateDf = (expandSubCols(expandedHovedBrancheDf,"periode")
                                  .orderBy(["cvrNummer","gyldigFra"],ascending=[1,1]))
#expandedHovedBrancheWithDateDf.show(truncate=False)


hovedbrancher2015Df = (expandedHovedBrancheWithDateDf
                       .filter((F.year(F.col("gyldigFra"))==2015))
                       .withColumn(colName="DayOfYear",F.dayofyear(F.col("gyldigFra")))
                       .groupBy(expandedHovedBrancheWithDateDf.columns)
                       .max(F.col()))   
hovedbrancher2015Df.show()

hovedbrancher2015Cols = hovedbrancher2015Df.columns

prefixedHovedBranch2015Df = (ovedbrancher2015Df
                             .select([F.col(x).alias("branche"+x) for x in hovedbrancher2015Cols]))
#prefixedHovedBranch2015Df.show()


Py4JError: An error occurred while calling z:org.apache.spark.sql.functions.col. Trace:
py4j.Py4JException: Method col([class org.apache.spark.sql.Column]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:339)
	at py4j.Gateway.invoke(Gateway.java:274)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:128)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:211)
	at java.lang.Thread.run(Thread.java:745)



In [None]:
#investigate whether a cvrNumber is listed twice in 
prefixedHovedBranch2015Df 

In [21]:
#expandedHovedBrancheWithDateDf.select(F.col("cvrNummer")
#                                      ,F.year(F.col("gyldigFra")).alias("aar")
#                                      ,F.col("branchekode")
#                                      ,F.col("branchetekst")).groupBy(F.col("branchekode"),F.col("branchetekst"),F.col("aar")).count().show()

+-----------+--------------------+----+-----+
|branchekode|        branchetekst| aar|count|
+-----------+--------------------+----+-----+
|     741020|Kommunikationsdes...|2008| 1779|
|     514610|Engroshandel med ...|2000|   51|
|     741490|Anden virksomheds...|2003| 2909|
|     713400|Udlejning af mask...|2005|  167|
|     851210|Alment praktisere...|1986|  124|
|     912000|       Fagforeninger|1986|   24|
|     316290|Fremstilling af a...|2007|   39|
|     139210|Fremstilling af b...|2008|  182|
|     331030|Fremstilling af e...|2000|    4|
|     512200|             Rumfart|1985|   12|
|     522120|Parkering og vejh...|2008|  258|
|     711230|Opstilling og lev...|2008|   31|
|     452510|   Murerforretninger|1985|  110|
|     515400|Engroshandel med ...|2002|   66|
|     332090|Fremstilling af a...|2002|   15|
|     513900|Ikke-specialisere...|2003|   59|
|     282900|Fremstilling af a...|2008|  152|
|     602200|          Taxikørsel|1973|   49|
|     515100|Engroshandel med ...|

In [45]:
vaekst2015Cols = vaekst2015Df.columns
branche2015Cols = prefixedHovedBranch2015Df.columns
vaekstWithBrancher2015Df = (vaekst2015Df
                            .join(prefixedHovedBranch2015Df,vaekst2015Df["cvrNummer"]==prefixedHovedBranch2015Df["branchecvrNummer"],"outer")
                           )


root
 |-- cvrNummer: long (nullable = true)
 |-- vaerdi: string (nullable = true)
 |-- gyldigFra: string (nullable = true)
 |-- gyldigTil: string (nullable = true)
 |-- varighed: integer (nullable = true)
 |-- CumlativeDiff: double (nullable = true)
 |-- branchecvrNummer: long (nullable = true)
 |-- branchebranchekode: string (nullable = true)
 |-- branchebranchetekst: string (nullable = true)
 |-- branchesidstOpdateret: string (nullable = true)
 |-- branchegyldigFra: string (nullable = true)
 |-- branchegyldigTil: string (nullable = true)

+---------+------------+----------+----------+--------+-------------+----------------+------------------+--------------------+---------------------+----------------+----------------+
|cvrNummer|      vaerdi| gyldigFra| gyldigTil|varighed|CumlativeDiff|branchecvrNummer|branchebranchekode| branchebranchetekst|branchesidstOpdateret|branchegyldigFra|branchegyldigTil|
+---------+------------+----------+----------+--------+-------------+----------------+-

In [46]:
vaekstWithBrancher2015Df.show()

+---------+------------+----------+----------+--------+-------------+----------------+------------------+--------------------+---------------------+----------------+----------------+
|cvrNummer|      vaerdi| gyldigFra| gyldigTil|varighed|CumlativeDiff|branchecvrNummer|branchebranchekode| branchebranchetekst|branchesidstOpdateret|branchegyldigFra|branchegyldigTil|
+---------+------------+----------+----------+--------+-------------+----------------+------------------+--------------------+---------------------+----------------+----------------+
|     null|        null|      null|      null|    null|         null|        10194164|            981000|Private husholdni...| 2015-08-20T16:21:...|      2015-04-15|            null|
|     null|        null|      null|      null|    null|         null|        12920547|            683110| Ejendomsmæglere mv.| 2015-06-05T08:09:...|      2015-01-01|            null|
|     null|        null|      null|      null|    null|         null|        13054533