# RA data


TODO:

- prefix the names of all DataFrame variables with "df_"

- (optional) remove duplicated columns left over from the joins, e.g. the two equal key columns of $"lpk_ver_nr" === $"ver_nr"




In [1]:
// List every table the session catalog knows about; truncation is disabled so
// long table names are printed in full.
spark.catalog.listTables().show(truncate = false)

+------------------+--------+-----------+---------+-----------+
|name              |database|description|tableType|isTemporary|
+------------------+--------+-----------+---------+-----------+
|stg_aufenthaltsart|default |null       |EXTERNAL |false      |
|stg_deckung       |default |null       |EXTERNAL |false      |
|stg_ecdetail      |default |null       |EXTERNAL |false      |
|stg_eckopf        |default |null       |EXTERNAL |false      |
|stg_ecprodukt     |default |null       |EXTERNAL |false      |
|stg_familie       |default |null       |EXTERNAL |false      |
|stg_gemeinde      |default |null       |EXTERNAL |false      |
|stg_leipkopf      |default |null       |EXTERNAL |false      |
|stg_leippos       |default |null       |EXTERNAL |false      |
|stg_schadenart    |default |null       |EXTERNAL |false      |
|stg_tariffaktor   |default |null       |EXTERNAL |false      |
|stg_versdeckung   |default |null       |EXTERNAL |false      |
|stg_versicherter  |default |null       

In [2]:
// Preview the current content of the RA result table (default row limit of show()).
spark.read.table("default.btl_ra_data").show()

+------+------------+------+
|lpp_id|lpp_pos_betr|AHV_Nr|
+------+------------+------+
+------+------------+------+



In [3]:
// Print the column names, types and nullability of the ECProdukt staging table.
println(spark.table("default.stg_ECProdukt").schema.treeString)

root
 |-- ecp_id: integer (nullable = true)
 |-- ecp_ept_id: integer (nullable = true)
 |-- ecp_produkt_nr: string (nullable = true)
 |-- ecp_produkt_name: string (nullable = true)
 |-- ecp_iks_cd: string (nullable = true)
 |-- ecp_mut_uid: short (nullable = true)
 |-- ecp_mut_dat: timestamp (nullable = true)



In [4]:
// Staging tables that feed the RA extract. Only LeiPPos and LeiPKopf carry a
// DataFrame alias; the remaining tables are referenced by their column names alone.
val leiPPos   = spark.table("default.stg_LeiPPos").as("leippos")
val leiPKopf  = spark.table("default.stg_LeiPKopf").as("leipkopf")
val vers      = spark.table("default.stg_Versicherter")
val familie   = spark.table("default.stg_Familie")
val gemeinde  = spark.table("default.stg_Gemeinde")
val ecKopf    = spark.table("default.stg_ECKopf")
val ecDetail  = spark.table("default.stg_ECDetail")
val ecProdukt = spark.table("default.stg_ECProdukt")

// Join type used for every hop below. With "inner", rows without a partner on
// either side are dropped. ("left_semi" would also keep only matching rows, but
// it returns the left side's columns only.)
val join_type = "inner"

// One output row per joined LeiPPos position, reduced to the six RA columns
// Jahr, AHV_Nr, Wohnkanton, Geburtsjahr, Geschlecht and GTIN.
val df = {
  // Jahr (year): taken from the treatment start date of the position.
  // NOTE(review): this is a plain rename - the column still holds the full
  // timestamp, no year is extracted here.
  val positions = leiPPos.withColumnRenamed("LPP_BEH_BEG_DAT", "Jahr")

  // AHV_Nr (social security number):
  // LeiPPos.LPP_LPK_ID -> LeiPKopf.LPK_VER_NR -> Versicherter.VER_NNSS_NR
  val withInsured = positions
    .join(leiPKopf, $"lpp_lpk_id" === $"lpk_id", join_type)
    .join(vers, $"lpk_ver_nr" === $"ver_nr", join_type)
    .withColumnRenamed("ver_nnss_nr", "AHV_Nr")

  // Wohnkanton (canton of residence):
  // LeiPPos.LPP_LPK_ID -> LeiPKopf.LPK_VER_NR -> Versicherter.VER_FAM_NR
  //   -> Familie.FAM_RA_GDE_ID (period check) -> Gemeinde.GDE_KT
  // TODO: the period check on the Familie hop is not applied by this join.
  //
  // Geburtsjahr (year of birth): ... -> Versicherter.VER_GEB_DAT
  // NOTE(review): also a plain rename of a date column, see Jahr.
  //
  // Geschlecht (sex): ... -> Versicherter.VER_SEX_CD
  val withResidence = withInsured
    .join(familie, $"ver_fam_nr" === $"fam_nr", join_type)
    .join(gemeinde, $"fam_ra_gde_id" === $"gde_id", join_type)
    .withColumnRenamed("gde_kt", "Wohnkanton")
    .withColumnRenamed("ver_geb_dat", "Geburtsjahr")
    .withColumnRenamed("ver_sex_cd", "Geschlecht")

  // Hospital stay - TODO, not part of the result yet.
  //   Exact admission/discharge dates (hospital, rehab or similar), ECP:
  //     LeiPPos.LPP_LPK_ID -> LeiPKopf.LPK_RECH_NR -> ECKopf.ECK_RECH_NR, ~.ECK_ID
  //       -> ECKopfXtraCaseDetail.EXC_ECK_ID, ~.EXC_BEG_DAT, ~.EXC_END_DAT (correct summation)
  //   Exact admission/discharge dates (hospital, rehab or similar), Sumex/Secon:
  //     LeiPPos.LPP_LPK_ID -> LpkCaseDetail.LCD_BEG_DAT, ~.LCD_END_DAT

  // GTIN:
  // LeiPPos.LPP_LPK_ID -> LeiPKopf.LPK_RECH_NR -> ECKopf.ECK_RECH_NR, ~.ECK_ID
  //   -> ECDetail.ECD_ECP_ID -> ECProdukt.ECP_PRODUKT_NR (PharmaCode)
  val withProduct = withResidence
    .join(ecKopf, $"LPK_RECH_NR" === $"ECK_RECH_NR", join_type)
    .join(ecDetail, $"ECK_ID" === $"ECD_ECK_ID", join_type)
    .join(ecProdukt, $"ECD_ECP_ID" === $"ECP_ID", join_type)
    .withColumnRenamed("ecp_produkt_nr", "GTIN")

  // Keep only the RA columns, in their final order.
  withProduct.select("Jahr", "AHV_Nr", "Wohnkanton", "Geburtsjahr", "Geschlecht", "GTIN")
}

df.printSchema()

root
 |-- Jahr: timestamp (nullable = true)
 |-- AHV_Nr: string (nullable = true)
 |-- Wohnkanton: string (nullable = true)
 |-- Geburtsjahr: timestamp (nullable = true)
 |-- Geschlecht: string (nullable = true)
 |-- GTIN: string (nullable = true)

