<img src="media/logo_psa.jpg" width="300">

<h1><center>Constructing SAMARA Data (2.1. SAMARA)</center></h1>

### Imports

In [1]:
%load_ext autoreload
%autoreload 2
import os
import datetime
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

from pyspark.sql import functions as F

from distribution_cost.configuration import spark_config
from distribution_cost.configuration.app import AppConfig
from distribution_cost.configuration.data import DataConfig
from distribution_cost.infra import oracle
from distribution_cost.domain import kpis

/gpfs/user/e587246/dco00/conf/application.yml
/gpfs/user/e587246/dco00


In [2]:
# Database uri
app_config = AppConfig()

db_uri = app_config.db_uri_jdbc
db_uri_cx_oracle = app_config.db_uri_cx_oracle

In [3]:
# Data Config
data_config = DataConfig()

data_config.vhls_perimeter

{'sites': ['PY', 'MU'],
 'start_date': '15/01/20',
 'end_date': '17/01/20',
 'genr_door': 'EMON'}

In [4]:
sites = data_config.vhls_perimeter["sites"]
start_date = data_config.vhls_perimeter["start_date"]
end_date = data_config.vhls_perimeter["end_date"]
genr_door = data_config.vhls_perimeter["genr_door"]

In [5]:
# Create spark session
spark_context, spark_session = spark_config.get_spark(app_name="app-distribution-cost",
                                                      executors=4, executor_cores=4, executor_mem='16g',
                                                      dynamic_allocation=True, max_executors=8)

spark_session.conf.set("spark.sql.crossJoin.enabled", "true")

In [6]:
df_sinqtvin = spark_session.read.load("/user/brc10/data/standardized/sinc0/sinqtvin/")
df_sinqtcli = spark_session.read.load("/user/brc10/data/standardized/sinc0/sinqtcli/")
df_sinqtver = spark_session.read.load("/user/brc10/data/standardized/sinc0/sinqtver/")
df_sinqtfv4 = spark_session.read.load("/user/brc10/data/standardized/sinc0/sinqtfv4/")
df_sinqtcmp = spark_session.read.load("/user/brc10/data/standardized/sinc0/sinqtcmp/")
df_sinqtcnd = spark_session.read.load("/user/brc10/data/standardized/sinc0/sinqtcnd/")
df_sinqtseg = spark_session.read.load("/user/brc10/data/standardized/sinc0/sinqtseg/")
df_sinqtzds = spark_session.read.load("/user/brc10/data/standardized/sinc0/sinqtzds/")
df_sinqtsfa = spark_session.read.load("/user/brc10/data/standardized/sinc0/sinqtsfa/")
df_sinqtfam = spark_session.read.load("/user/brc10/data/standardized/sinc0/sinqtfam/")
df_sinqtrub = spark_session.read.load("/user/brc10/data/standardized/sinc0/sinqtrub/")
df_sinqtopc = spark_session.read.load("/user/brc10/data/standardized/sinc0/sinqtopc/")
df_sinqtcma = spark_session.read.load("/user/brc10/data/standardized/sinc0/sinqtcma/")
df_sinqtcmi = spark_session.read.load("/user/brc10/data/standardized/sinc0/sinqtcmi/")
df_sinqtcyr = spark_session.read.load("/user/brc10/data/standardized/sinc0/sinqtcyr/")
df_sinqtmrq = spark_session.read.load("/user/brc10/data/standardized/sinc0/sinqtmrq/")
df_sinqtbas = spark_session.read.load("/user/brc10/data/standardized/sinc0/sinqtbas/")

In [7]:
df_sinqtvin.createOrReplaceTempView("df_sinqtvin")
df_sinqtver.createOrReplaceTempView("df_sinqtver")
df_sinqtfv4.createOrReplaceTempView("df_sinqtfv4")
df_sinqtcmp.createOrReplaceTempView("df_sinqtcmp")
df_sinqtcnd.createOrReplaceTempView("df_sinqtcnd")
df_sinqtcli.createOrReplaceTempView("df_sinqtcli")
df_sinqtseg.createOrReplaceTempView("df_sinqtseg")
df_sinqtzds.createOrReplaceTempView("df_sinqtzds")
df_sinqtsfa.createOrReplaceTempView("df_sinqtsfa")
df_sinqtfam.createOrReplaceTempView("df_sinqtfam")
df_sinqtrub.createOrReplaceTempView("df_sinqtrub")
df_sinqtopc.createOrReplaceTempView("df_sinqtopc")
df_sinqtcma.createOrReplaceTempView("df_sinqtcma")
df_sinqtcmi.createOrReplaceTempView("df_sinqtcmi")
df_sinqtcyr.createOrReplaceTempView("df_sinqtcyr")
df_sinqtmrq.createOrReplaceTempView("df_sinqtmrq")
df_sinqtbas.createOrReplaceTempView("df_sinqtbas")

In [8]:
querySAMARA = """
SELECT
    SINQTVIN.CODE SINQTVIN__CODE,
    SINQTCLI_2.CODE SINQTCLI_2__CODE,
    SINQTCLI_2.CODE_PAYS_IMPLANT,
    SINQTVER.CODE SINQTVER__CODE,
    case 'fr_FR'
when 'en_GB' then nvl(SINQTVER.LIB_EN,SINQTVER.LIB_FR) 
when 'fr_FR' then SINQTVER.LIB_FR
when 'es_SP' then nvl(SINQTVER.LIB_ES,SINQTVER.LIB_FR)
else SINQTVER.LIB_FR

end SINQTVER__LIB,
    Table__54.DT_FACT,
    Table__54.DT_VD,
    Table__54.DT_COMM_CLI_FIN_VD,
    Table__54.DATIMM,
    SINQTCMP.CODE SINQTCMP__CODE,
    case 'fr_FR'
when 'en_GB' then nvl(SINQTCMP.LIB_EN,SINQTCMP.LIB_FR)
when 'fr_FR' then SINQTCMP.LIB_FR
when 'es_SP' then nvl(SINQTCMP.LIB_ES,SINQTCMP.LIB_FR)
else SINQTCMP.LIB_FR

end SINQTCMP__LIB,
    SINQTCND.CODE SINQTCND__CODE,
    case 'fr_FR'
when 'en_GB' then nvl(SINQTCND.LIB_EN,SINQTCND.LIB_FR)
when 'fr_FR' then SINQTCND.LIB_FR
when 'es_SP' then nvl(SINQTCND.LIB_ES,SINQTCND.LIB_FR)
else SINQTCND.LIB_FR

end SINQTCND__LIB,
    SINQTCLI.CODE SINQTCLI__CODE,
    case 'fr_FR'
when 'en_GB' then nvl(SINQTCLI.LIB_EN,SINQTCLI.LIB_FR)
when 'fr_FR' then SINQTCLI.LIB_FR
when 'es_SP' then nvl(SINQTCLI.LIB_ES,SINQTCLI.LIB_FR)
else SINQTCLI.LIB_FR

end SINQTCLI__LIB,
    SINQTSEG.CODE SINQTSEG__CODE,
    case 'fr_FR'
when 'en_GB' then nvl(SINQTSEG.LIB_EN,SINQTSEG.LIB_FR)
when 'fr_FR' then SINQTSEG.LIB_FR
when 'es_SP' then nvl(SINQTSEG.LIB_ES,SINQTSEG.LIB_FR)
else SINQTSEG.LIB_FR

end SINQTSEG__LIB,
    case 'fr_FR'
when 'en_GB' then nvl(SINQTZDS.LIB_EN,SINQTZDS.LIB_FR)
when 'fr_FR' then SINQTZDS.LIB_FR
when 'es_SP' then nvl(SINQTZDS.LIB_ES,SINQTZDS.LIB_FR)
else SINQTZDS.LIB_FR

end SINQTZDS__LIB,
    SINQTSFA.CODE SINQTSFA__CODE,
    case 'fr_FR'
when 'en_GB' then nvl(SINQTSFA.LIB_EN,SINQTSFA.LIB_FR)
when 'fr_FR' then SINQTSFA.LIB_FR
when 'es_SP' then nvl(SINQTSFA.LIB_ES,SINQTSFA.LIB_FR)
else SINQTSFA.LIB_FR

end SINQTSFA__LIB,
    SINQTFAM.CODE SINQTFAM__CODE,
    case 'fr_FR'
when 'en_GB' then nvl(SINQTFAM.LIB_EN,SINQTFAM.LIB_FR)
when 'fr_FR' then SINQTFAM.LIB_FR
when 'es_SP' then nvl(SINQTFAM.LIB_ES,SINQTFAM.LIB_FR)
else SINQTFAM.LIB_FR

end SINQTFAM__LIB,
    SINQTRUB.CODE SINQTRUB__CODE,
    case 'fr_FR'
when 'en_GB' then SINQTRUB.LIB_EN
when 'fr_FR' then SINQTRUB.LIB_FR
when 'es_SP' then SINQTRUB.LIB_ES
else SINQTRUB.LIB_FR

end SINQTRUB__LIB,
    SINQTOPC.CODE SINQTOPC__CODE,
    case 'fr_FR'
when 'en_GB' then SINQTOPC.LIB_EN
when 'fr_FR' then SINQTOPC.LIB_FR
when 'es_SP' then SINQTOPC.LIB_ES
else SINQTOPC.LIB_FR

end SINQTOPC_LIB,
    SINQTCMA.CODE SINQTCMA__CODE,
    case 'fr_FR'
when 'en_GB' then nvl(SINQTCMA.LIB_EN,SINQTCMA.LIB_FR)
when 'fr_FR' then SINQTCMA.LIB_FR
when 'es_SP' then nvl(SINQTCMA.LIB_ES,SINQTCMA.LIB_FR)
else SINQTCMA.LIB_FR

end SINQTCMA__LIB,
    SINQTCMI.CODE SINQTCMI__CODE,
    case 'fr_FR'
when 'en_GB' then nvl(SINQTCMI.LIB_EN,SINQTCMI.LIB_FR)
when 'fr_FR' then SINQTCMI.LIB_FR
when 'es_SP' then nvl(SINQTCMI.LIB_ES,SINQTCMI.LIB_FR)
else SINQTCMI.LIB_FR

end SINQTCMI__LIB,
    Table__54.TYPE_FLOTTE_VD,
    Table__54.TYPE_OPE_ESSOR,
    Table__54.TYP_UTIL_VD,
    Table__54.CODE_PROFESSION_VD,
    Table__54.CODE_PROMO,
    Table__54.CODE_PROMO2,
    SINQTCYR.ANNEE_MOIS,
    sum(Table__54.VOLUME_AJ) VOLUME_AJ,
    sum(Table__54.PRIX_VENTE) PRIX_VENTE,
    sum(Table__54.PRIX_VENTE_AJ) PRIX_VENTE_AJ,
    sum(Table__54.PV_OPTIONS) PV_OPTIONS,
    sum(Table__54.PV_VERSION) PV_VERSION,
    sum(Table__54.MACOM_CONSO) MACOM_CONSO,
    sum(Table__54.MACOM_CONSO_AJ) MACOM_CONSO_AJ,
    sum(Table__54.MACOM_CONSO_VERSION) MACOM_CONSO_VERSION,
    sum(Table__54.MACOM_CONSO_OPTION) MACOM_CONSO_OPTION,
    sum(Table__54.MACOM_ENTITE) MACOM_ENTITE,
    sum(Table__54.MACOM_ENTITE_AJ) MACOM_ENTITE_AJ,
    sum(Table__54.MACOM_ENTITE_VERSION) MACOM_ENTITE_VERSION,
    sum(Table__54.MACOM_ENTITE_OPTION) MACOM_ENTITE_OPTION,
    sum(Table__54.RBCV_AJ) RBCV_AJ,
    -- sum(Table__54.MCX_VARIABLES) MCX_VARIABLES,
    SINQTMRQ_2.CODE SINQTMRQ_2__CODE
FROM
    -- df_sinqtvin SINQTVIN,--
    -- df_sinqtcli SINQTCLI_2,--
    -- df_sinqtver SINQTVER,--
    df_sinqtfv4 TABLE__54,
    -- df_sinqtcmp SINQTCMP,--
    -- df_sinqtcnd SINQTCND,--
    -- df_sinqtcli SINQTCLI,--
    -- df_sinqtseg SINQTSEG,--
    -- df_sinqtzds SINQTZDS,--
    -- df_sinqtsfa SINQTSFA,--
    -- df_sinqtfam SINQTFAM,--
    -- df_sinqtrub SINQTRUB,
    -- df_sinqtopc SINQTOPC,--
    -- df_sinqtcma SINQTCMA,--
    -- df_sinqtcmi SINQTCMI,--
    -- df_sinqtcyr SINQTCYR,--
    -- df_sinqtmrq SINQTMRQ_2,--
    df_sinqtbas SINQTBAS,--
    df_sinqtfam SINQTFAM_2--
    JOIN df_sinqtcmp SINQTCMP ON Table__54.ID_ZDS=SINQTCMP.ID_ZDS and Table__54.ID_CMP=SINQTCMP.ID
    JOIN df_sinqtcli SINQTCLI ON Table__54.ID_ZDS=SINQTCLI.ID_ZDS and Table__54.ID_SCD=SINQTCLI.ID_SCD and Table__54.ID_CLI=SINQTCLI.ID
    JOIN df_sinqtcmi SINQTCMI ON Table__54.ID_ZDS=SINQTCMI.ID_ZDS and Table__54.ID_CMI=SINQTCMI.ID
    --JOIN df_sinqtbas SINQTBAS ON Table__54.ID_BAS=SINQTBAS.ID
    JOIN df_sinqtzds SINQTZDS ON Table__54.ID_ZDS=SINQTZDS.ID
    LEFT OUTER JOIN df_sinqtrub SINQTRUB ON Table__54.ID_RUB=SINQTRUB.ID
    JOIN df_sinqtfam SINQTFAM ON Table__54.ID_ZDS=SINQTFAM.ID_ZDS and Table__54.ID_FAM=SINQTFAM.ID
    JOIN df_sinqtsfa SINQTSFA ON Table__54.ID_ZDS=SINQTSFA.ID_ZDS and Table__54.ID_SFA=SINQTSFA.ID
    JOIN df_sinqtver SINQTVER ON Table__54.ID_ZDS=SINQTVER.ID_ZDS and Table__54.ID_VER=SINQTVER.ID
    LEFT OUTER JOIN df_sinqtseg SINQTSEG ON Table__54.ID_ZDS=SINQTSEG.ID_ZDS and Table__54.ID_SEG=SINQTSEG.ID
    JOIN df_sinqtcnd SINQTCND ON Table__54.ID_CND=SINQTCND.ID
    JOIN df_sinqtcma SINQTCMA ON Table__54.ID_CMA=SINQTCMA.ID
    LEFT OUTER JOIN df_sinqtopc SINQTOPC ON Table__54.ID_ZDS=SINQTOPC.ID_ZDS and Table__54.ID_OPC=SINQTOPC.ID
    JOIN df_sinqtvin SINQTVIN ON Table__54.ID_ZDS=SINQTVIN.ID_ZDS and Table__54.ID_VIN=SINQTVIN.ID
    JOIN df_sinqtcyr SINQTCYR ON Table__54.ID_CYC=SINQTCYR.ID
    JOIN df_sinqtcli SINQTCLI_2 ON Table__54.ID_ZDS=SINQTCLI_2.ID_ZDS and Table__54.ID_SCD=SINQTCLI_2.ID_SCD and Table__54.ID_CLI_LIV=SINQTCLI_2.ID
    JOIN df_sinqtmrq SINQTMRQ_2 ON SINQTFAM_2.ID_MRQ_COM=SINQTMRQ_2.ID and Table__54.ID_ZDS=SINQTFAM_2.ID_ZDS and Table__54.ID_FAM=SINQTFAM_2.ID    

WHERE   ( Table__54.DT_VD BETWEEN TO_DATE('01/01/2019', 'dd/MM/yyyy') AND TO_DATE('30/06/2019', 'dd/MM/yyyy'))
    AND ( SINQTBAS.CODE  =  'LA' AND SINQTCLI_2.CODE_PAYS_IMPLANT  IN  ( 'FR','DE','PT','BE','IT','ES','GB','NL','PL','AT'  ) AND ( SINQTBAS.CODE != 'EA'  ))
GROUP BY
  SINQTVIN.CODE, 
  SINQTCLI_2.CODE, 
  SINQTCLI_2.CODE_PAYS_IMPLANT, 
  SINQTVER.CODE, 
  case 'fr_FR'
when 'en_GB' then nvl(SINQTVER.LIB_EN,SINQTVER.LIB_FR)
when 'fr_FR' then SINQTVER.LIB_FR
when 'es_SP' then nvl(SINQTVER.LIB_ES,SINQTVER.LIB_FR)
else SINQTVER.LIB_FR

end, 
  Table__54.DT_FACT, 
  Table__54.DT_VD, 
  Table__54.DT_COMM_CLI_FIN_VD, 
  Table__54.DATIMM, 
  SINQTCMP.CODE, 
  case 'fr_FR'
when 'en_GB' then nvl(SINQTCMP.LIB_EN,SINQTCMP.LIB_FR)
when 'fr_FR' then SINQTCMP.LIB_FR
when 'es_SP' then nvl(SINQTCMP.LIB_ES,SINQTCMP.LIB_FR)
else SINQTCMP.LIB_FR

end, 
  SINQTCND.CODE, 
  case 'fr_FR'
when 'en_GB' then nvl(SINQTCND.LIB_EN,SINQTCND.LIB_FR)
when 'fr_FR' then SINQTCND.LIB_FR
when 'es_SP' then nvl(SINQTCND.LIB_ES,SINQTCND.LIB_FR)
else SINQTCND.LIB_FR

end, 
  SINQTCLI.CODE, 
  case 'fr_FR'
when 'en_GB' then nvl(SINQTCLI.LIB_EN,SINQTCLI.LIB_FR)
when 'fr_FR' then SINQTCLI.LIB_FR
when 'es_SP' then nvl(SINQTCLI.LIB_ES,SINQTCLI.LIB_FR)
else SINQTCLI.LIB_FR

end, 
  SINQTSEG.CODE, 
  case 'fr_FR'
when 'en_GB' then nvl(SINQTSEG.LIB_EN,SINQTSEG.LIB_FR)
when 'fr_FR' then SINQTSEG.LIB_FR
when 'es_SP' then nvl(SINQTSEG.LIB_ES,SINQTSEG.LIB_FR)
else SINQTSEG.LIB_FR

end, 
  case 'fr_FR'
when 'en_GB' then nvl(SINQTZDS.LIB_EN,SINQTZDS.LIB_FR)
when 'fr_FR' then SINQTZDS.LIB_FR
when 'es_SP' then nvl(SINQTZDS.LIB_ES,SINQTZDS.LIB_FR)
else SINQTZDS.LIB_FR

end, 
  SINQTSFA.CODE, 
  case 'fr_FR'
when 'en_GB' then nvl(SINQTSFA.LIB_EN,SINQTSFA.LIB_FR)
when 'fr_FR' then SINQTSFA.LIB_FR
when 'es_SP' then nvl(SINQTSFA.LIB_ES,SINQTSFA.LIB_FR)
else SINQTSFA.LIB_FR

end, 
  SINQTFAM.CODE, 
  case 'fr_FR'
when 'en_GB' then nvl(SINQTFAM.LIB_EN,SINQTFAM.LIB_FR)
when 'fr_FR' then SINQTFAM.LIB_FR
when 'es_SP' then nvl(SINQTFAM.LIB_ES,SINQTFAM.LIB_FR)
else SINQTFAM.LIB_FR

end, 
  SINQTRUB.CODE, 
  case 'fr_FR'
when 'en_GB' then SINQTRUB.LIB_EN
when 'fr_FR' then SINQTRUB.LIB_FR
when 'es_SP' then SINQTRUB.LIB_ES
else SINQTRUB.LIB_FR

end, 
  SINQTOPC.CODE, 
  case 'fr_FR'
when 'en_GB' then SINQTOPC.LIB_EN
when 'fr_FR' then SINQTOPC.LIB_FR
when 'es_SP' then SINQTOPC.LIB_ES
else SINQTOPC.LIB_FR

end, 
  SINQTCMA.CODE, 
  case 'fr_FR'
when 'en_GB' then nvl(SINQTCMA.LIB_EN,SINQTCMA.LIB_FR)
when 'fr_FR' then SINQTCMA.LIB_FR
when 'es_SP' then nvl(SINQTCMA.LIB_ES,SINQTCMA.LIB_FR)
else SINQTCMA.LIB_FR

end, 
  SINQTCMI.CODE, 
  case 'fr_FR'
when 'en_GB' then nvl(SINQTCMI.LIB_EN,SINQTCMI.LIB_FR)
when 'fr_FR' then SINQTCMI.LIB_FR
when 'es_SP' then nvl(SINQTCMI.LIB_ES,SINQTCMI.LIB_FR)
else SINQTCMI.LIB_FR

end, 
  Table__54.TYPE_FLOTTE_VD, 
  Table__54.TYPE_OPE_ESSOR, 
  Table__54.TYP_UTIL_VD, 
  Table__54.CODE_PROFESSION_VD, 
  Table__54.CODE_PROMO, 
  Table__54.CODE_PROMO2, 
  SINQTCYR.ANNEE_MOIS, 
  SINQTMRQ_2.CODE
"""

In [9]:
dfSAMARA = spark_session.sql(querySAMARA)

In [10]:
dfSAMARA.count()

10072216

In [11]:
dfSAMARA.toPandas().Head(10)
#dfSAMARA.columns

Py4JJavaError: An error occurred while calling o156.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Total size of serialized results of 62 tasks (1031.2 MB) is bigger than spark.driver.maxResultSize (1024.0 MB)
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:297)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply$mcI$sp(Dataset.scala:3195)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3192)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3192)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
	at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:3225)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3192)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)


In [10]:
dfSAMARA.count()

KeyboardInterrupt: 

In [117]:
dfSAMARA.select("").distinct().toPandas().head(10)

Unnamed: 0,LIBELLE
0,MANGUALDE
1,
2,RENNES
3,SEVEL NORD
4,MIZUSHIMA
5,VILLAVERDE
6,SEVEL-VAL DI SANGRO
7,MULHOUSE
8,VIGO
9,KOLIN


In [None]:
dfSAMARA = dfSAMARA.drop("", "")