* Master DAC - BDLE
* Author: Mohamed-Amine Baazizi
* Affiliation: LIP6 - Faculté des Sciences - Sorbonne Université
* Email: mohamed-amine.baazizi@lip6.fr
* October 2023

# Spark Setup (with Deequ enabled)

In [1]:
!pip install --upgrade -q pyspark==3.3

In [2]:
import os
os.environ["SPARK_VERSION"] = "3.3"

In [3]:

!pip install  --upgrade -q pydeequ

In [4]:
!pip list|grep pydeequ

pydeequ                          1.1.1


In [5]:
from pyspark.sql import SparkSession, Row
import pydeequ

spark = SparkSession.builder\
    .master("local")\
    .appName("pyDeequ")\
    .config("spark.jars.packages", pydeequ.deequ_maven_coord)\
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)\
    .getOrCreate()

In [6]:
spark

# ML

## vectors

In [7]:
from pyspark.ml.linalg import Vectors


In [8]:
vec1 = Vectors.dense(1.0, 1.0, 18.0)
vec2 = Vectors.dense(0.0, 2.0, 20.0)
vec3 = Vectors.sparse(3,[0.0, 2.0],[1.0,18.0])
vec4 = Vectors.sparse(3,[0.0,1.2,2.0],[2.0,3.0,11.0])
vectors = spark.sparkContext.parallelize([vec1,vec2,vec3,vec4])
vectors.collect()
#vectors.printSchema()
#vectors.show()

[DenseVector([1.0, 1.0, 18.0]),
 DenseVector([0.0, 2.0, 20.0]),
 SparseVector(3, {0: 1.0, 2: 18.0}),
 SparseVector(3, {0: 2.0, 1: 3.0, 2: 11.0})]

## DT data loading

In [9]:
tuples = [("young","high","no","fair","no"),
               ("young","high","no","excellent","no"),
               ("middle","high","no","fair","yes"),
               ("senior","medium","no","fair","yes"),
               ("senior","low","yes","fair","yes"),
               ("senior","low","yes","excellent","no"),
               ("middle","low","yes","excellent","yes"),
               ("young","medium","no","fair","no"),
               ("young","low","yes","fair","yes"),
               ("senior","medium","yes","fair","yes"),
               ("young","medium","yes","excellent","yes"),
               ("middle","medium","no","excellent","yes"),
               ("middle","high","yes","fair","yes"),
               ("senior","medium","no","excellent","no")]
print(len(tuples))

14


In [10]:
schema = 'age string, income string, student string, credit_rating string, label string'
data = spark.sparkContext.parallelize(tuples).toDF(schema)
data.printSchema()
data.show()

root
 |-- age: string (nullable = true)
 |-- income: string (nullable = true)
 |-- student: string (nullable = true)
 |-- credit_rating: string (nullable = true)
 |-- label: string (nullable = true)

+------+------+-------+-------------+-----+
|   age|income|student|credit_rating|label|
+------+------+-------+-------------+-----+
| young|  high|     no|         fair|   no|
| young|  high|     no|    excellent|   no|
|middle|  high|     no|         fair|  yes|
|senior|medium|     no|         fair|  yes|
|senior|   low|    yes|         fair|  yes|
|senior|   low|    yes|    excellent|   no|
|middle|   low|    yes|    excellent|  yes|
| young|medium|     no|         fair|   no|
| young|   low|    yes|         fair|  yes|
|senior|medium|    yes|         fair|  yes|
| young|medium|    yes|    excellent|  yes|
|middle|medium|     no|    excellent|  yes|
|middle|  high|    yes|         fair|  yes|
|senior|medium|     no|    excellent|   no|
+------+------+-------+-------------+-----+



## Transformations

### String indexer

In [11]:
from  pyspark.ml.feature import StringIndexer

In [12]:
field = 'age'
age_indexer = StringIndexer(inputCol=field,outputCol='indexed_'+field)
df_age_idx = age_indexer.fit(data).transform(data)
df_age_idx.show()


+------+------+-------+-------------+-----+-----------+
|   age|income|student|credit_rating|label|indexed_age|
+------+------+-------+-------------+-----+-----------+
| young|  high|     no|         fair|   no|        1.0|
| young|  high|     no|    excellent|   no|        1.0|
|middle|  high|     no|         fair|  yes|        2.0|
|senior|medium|     no|         fair|  yes|        0.0|
|senior|   low|    yes|         fair|  yes|        0.0|
|senior|   low|    yes|    excellent|   no|        0.0|
|middle|   low|    yes|    excellent|  yes|        2.0|
| young|medium|     no|         fair|   no|        1.0|
| young|   low|    yes|         fair|  yes|        1.0|
|senior|medium|    yes|         fair|  yes|        0.0|
| young|medium|    yes|    excellent|  yes|        1.0|
|middle|medium|     no|    excellent|  yes|        2.0|
|middle|  high|    yes|         fair|  yes|        2.0|
|senior|medium|     no|    excellent|   no|        0.0|
+------+------+-------+-------------+-----+-----

In [13]:
def string_index_cols(cols,prefix):
  outCols = map(lambda c:prefix+c, cols)
  # return list(outCols)
  return StringIndexer(inputCols=cols,outputCols=list(outCols))


# si = index_cols(['age','income'])
# si.getOutputCols()

In [14]:
prefix = 'indexed_'
fields = ['age','income']
age_income_indexer = string_index_cols(fields,prefix)
df_age_income_idx = age_income_indexer.fit(data).transform(data)
df_age_income_idx.show()

+------+------+-------+-------------+-----+-----------+--------------+
|   age|income|student|credit_rating|label|indexed_age|indexed_income|
+------+------+-------+-------------+-----+-----------+--------------+
| young|  high|     no|         fair|   no|        1.0|           1.0|
| young|  high|     no|    excellent|   no|        1.0|           1.0|
|middle|  high|     no|         fair|  yes|        2.0|           1.0|
|senior|medium|     no|         fair|  yes|        0.0|           0.0|
|senior|   low|    yes|         fair|  yes|        0.0|           2.0|
|senior|   low|    yes|    excellent|   no|        0.0|           2.0|
|middle|   low|    yes|    excellent|  yes|        2.0|           2.0|
| young|medium|     no|         fair|   no|        1.0|           0.0|
| young|   low|    yes|         fair|  yes|        1.0|           2.0|
|senior|medium|    yes|         fair|  yes|        0.0|           0.0|
| young|medium|    yes|    excellent|  yes|        1.0|           0.0|
|middl

### IndexToString

In [15]:
from pyspark.ml.feature import IndexToString


In [16]:
age_rev_indexer = IndexToString(inputCol=age_indexer.getOutputCol(),outputCol='original_age')

df_orig_age =age_rev_indexer.transform(df_age_idx)
df_orig_age.show()


+------+------+-------+-------------+-----+-----------+------------+
|   age|income|student|credit_rating|label|indexed_age|original_age|
+------+------+-------+-------------+-----+-----------+------------+
| young|  high|     no|         fair|   no|        1.0|       young|
| young|  high|     no|    excellent|   no|        1.0|       young|
|middle|  high|     no|         fair|  yes|        2.0|      middle|
|senior|medium|     no|         fair|  yes|        0.0|      senior|
|senior|   low|    yes|         fair|  yes|        0.0|      senior|
|senior|   low|    yes|    excellent|   no|        0.0|      senior|
|middle|   low|    yes|    excellent|  yes|        2.0|      middle|
| young|medium|     no|         fair|   no|        1.0|       young|
| young|   low|    yes|         fair|  yes|        1.0|       young|
|senior|medium|    yes|         fair|  yes|        0.0|      senior|
| young|medium|    yes|    excellent|  yes|        1.0|       young|
|middle|medium|     no|    excelle

### one-hot encoder

In [17]:
from pyspark.ml.feature import OneHotEncoder


In [18]:
age_onehotenc = OneHotEncoder(inputCol=age_indexer.getOutputCol(),outputCol='cat_age')
age_onehotenc.setDropLast(False)
df_age_onehot = age_onehotenc.fit(df_age_idx).transform(df_age_idx)
df_age_onehot.show()\
#   .setInputCols(Array("indexed_age", "indexed_income"))\
#   .setOutputCols(Array("category_age", "category_income"))\
#   .setDropLast(False)

#val_encoded = OneHotEncoder.fit(data).transform(data)

+------+------+-------+-------------+-----+-----------+-------------+
|   age|income|student|credit_rating|label|indexed_age|      cat_age|
+------+------+-------+-------------+-----+-----------+-------------+
| young|  high|     no|         fair|   no|        1.0|(3,[1],[1.0])|
| young|  high|     no|    excellent|   no|        1.0|(3,[1],[1.0])|
|middle|  high|     no|         fair|  yes|        2.0|(3,[2],[1.0])|
|senior|medium|     no|         fair|  yes|        0.0|(3,[0],[1.0])|
|senior|   low|    yes|         fair|  yes|        0.0|(3,[0],[1.0])|
|senior|   low|    yes|    excellent|   no|        0.0|(3,[0],[1.0])|
|middle|   low|    yes|    excellent|  yes|        2.0|(3,[2],[1.0])|
| young|medium|     no|         fair|   no|        1.0|(3,[1],[1.0])|
| young|   low|    yes|         fair|  yes|        1.0|(3,[1],[1.0])|
|senior|medium|    yes|         fair|  yes|        0.0|(3,[0],[1.0])|
| young|medium|    yes|    excellent|  yes|        1.0|(3,[1],[1.0])|
|middle|medium|     

=> (3,[0],[1.0]) signifie un vecteur de taille 3 avec la première valeur égale à 1 et les autres à 0. Cela signifie que la première catégorie a été encodée (dans ce cas, l'âge "senior").

### vector assembler

In [19]:
from pyspark.ml.feature import VectorAssembler

In [20]:
cols = ['indexed_age','indexed_income']
vec_assembler = VectorAssembler(inputCols= cols, outputCol= 'ageIncomeVec')

df_age_income_vec = vec_assembler.transform(df_age_income_idx)
df_age_income_vec.show()

+------+------+-------+-------------+-----+-----------+--------------+------------+
|   age|income|student|credit_rating|label|indexed_age|indexed_income|ageIncomeVec|
+------+------+-------+-------------+-----+-----------+--------------+------------+
| young|  high|     no|         fair|   no|        1.0|           1.0|   [1.0,1.0]|
| young|  high|     no|    excellent|   no|        1.0|           1.0|   [1.0,1.0]|
|middle|  high|     no|         fair|  yes|        2.0|           1.0|   [2.0,1.0]|
|senior|medium|     no|         fair|  yes|        0.0|           0.0|   (2,[],[])|
|senior|   low|    yes|         fair|  yes|        0.0|           2.0|   [0.0,2.0]|
|senior|   low|    yes|    excellent|   no|        0.0|           2.0|   [0.0,2.0]|
|middle|   low|    yes|    excellent|  yes|        2.0|           2.0|   [2.0,2.0]|
| young|medium|     no|         fair|   no|        1.0|           0.0|   [1.0,0.0]|
| young|   low|    yes|         fair|  yes|        1.0|           2.0|   [1.

### Vector Indexer

In [21]:
from pyspark.ml.feature import VectorIndexer


In [22]:
vecIndexer = VectorIndexer(inputCol='ageIncomeVec',\
                           outputCol='indexed_ageIncomeVec',\
                           maxCategories=3)
df_age_income_vec_idx = vecIndexer.fit(df_age_income_vec).\
    transform(df_age_income_vec)

df_age_income_vec_idx.show()


+------+------+-------+-------------+-----+-----------+--------------+------------+--------------------+
|   age|income|student|credit_rating|label|indexed_age|indexed_income|ageIncomeVec|indexed_ageIncomeVec|
+------+------+-------+-------------+-----+-----------+--------------+------------+--------------------+
| young|  high|     no|         fair|   no|        1.0|           1.0|   [1.0,1.0]|           [1.0,1.0]|
| young|  high|     no|    excellent|   no|        1.0|           1.0|   [1.0,1.0]|           [1.0,1.0]|
|middle|  high|     no|         fair|  yes|        2.0|           1.0|   [2.0,1.0]|           [2.0,1.0]|
|senior|medium|     no|         fair|  yes|        0.0|           0.0|   (2,[],[])|           (2,[],[])|
|senior|   low|    yes|         fair|  yes|        0.0|           2.0|   [0.0,2.0]|           [0.0,2.0]|
|senior|   low|    yes|    excellent|   no|        0.0|           2.0|   [0.0,2.0]|           [0.0,2.0]|
|middle|   low|    yes|    excellent|  yes|        2.0|

=>  maxCategories=3 :  signifie que VectorIndexer ne traitera que les variables catégorielles qui ont au plus 3 catégories distinctes. Si une variable a plus de 3 catégories, elle ne sera pas indexée.

## Pipelines

#### string indexer

In [23]:
data.show()

+------+------+-------+-------------+-----+
|   age|income|student|credit_rating|label|
+------+------+-------+-------------+-----+
| young|  high|     no|         fair|   no|
| young|  high|     no|    excellent|   no|
|middle|  high|     no|         fair|  yes|
|senior|medium|     no|         fair|  yes|
|senior|   low|    yes|         fair|  yes|
|senior|   low|    yes|    excellent|   no|
|middle|   low|    yes|    excellent|  yes|
| young|medium|     no|         fair|   no|
| young|   low|    yes|         fair|  yes|
|senior|medium|    yes|         fair|  yes|
| young|medium|    yes|    excellent|  yes|
|middle|medium|     no|    excellent|  yes|
|middle|  high|    yes|         fair|  yes|
|senior|medium|     no|    excellent|   no|
+------+------+-------+-------------+-----+



In [24]:
label = 'label'
features_col = data.columns
features_col.remove(label)

In [25]:
data.columns

['age', 'income', 'student', 'credit_rating', 'label']

In [26]:
features_col

['age', 'income', 'student', 'credit_rating']

In [27]:
prefix = 'indexed_'

In [28]:
label_string_indexer = StringIndexer(inputCol=label, outputCol=prefix+label)

In [29]:
features_str_col = list(map(lambda c:prefix+c, features_col))
features_string_indexer = StringIndexer(inputCols=features_col,outputCols=features_str_col)


#### vector assembler and indexer

In [30]:
vec_assembler = VectorAssembler(inputCols= features_string_indexer.getOutputCols(), outputCol= 'vector')


In [31]:

vec_indexer = VectorIndexer(inputCol='vector',\
                            outputCol='features',\
                           maxCategories=3)

#### pipeline building

In [32]:
stages = [label_string_indexer,features_string_indexer,vec_assembler,vec_indexer]

In [33]:
stages

[StringIndexer_48af7fe58ca7,
 StringIndexer_614e9b0b69c9,
 VectorAssembler_143cef25393f,
 VectorIndexer_af9df6382ba1]

In [34]:
from pyspark.ml import Pipeline


In [35]:
pipeline = Pipeline(stages = stages)
train_data = pipeline.fit(data).transform(data).select("features","indexed_label")
train_data.show()


+-----------------+-------------+
|         features|indexed_label|
+-----------------+-------------+
|[1.0,1.0,0.0,0.0]|          1.0|
|[1.0,1.0,0.0,1.0]|          1.0|
|[2.0,1.0,0.0,0.0]|          0.0|
|        (4,[],[])|          0.0|
|[0.0,2.0,1.0,0.0]|          0.0|
|[0.0,2.0,1.0,1.0]|          1.0|
|[2.0,2.0,1.0,1.0]|          0.0|
|    (4,[0],[1.0])|          1.0|
|[1.0,2.0,1.0,0.0]|          0.0|
|    (4,[2],[1.0])|          0.0|
|[1.0,0.0,1.0,1.0]|          0.0|
|[2.0,0.0,0.0,1.0]|          0.0|
|[2.0,1.0,1.0,0.0]|          0.0|
|    (4,[3],[1.0])|          1.0|
+-----------------+-------------+




*   **label_string_indexer :** convertir la colonne de libellé 'label' en une forme numérique.
*   **features_string_indexer :** convertir toutes les autres colonnes du jeu de données (à l'exception de la colonne de 'label') en formes numériques.
*   **vec_assembler :** assembler toutes les colonnes indexées en un seul vecteur de caractéristiques (créer un vecteur unique contenant toutes les caractéristiques qui seront utilisées pour l'apprentissage automatique).
*   **vec_indexer :** examine le vecteur de caractéristiques et, si nécessaire, indexe automatiquement les variables catégorielles présentes dans le vecteur. La colonne de sortie indexée est appelée 'features', qui est généralement la colonne d'entrée attendue pour de nombreux modèles d'apprentissage automatique.

## DT inference

In [36]:
from pyspark.ml.classification import DecisionTreeClassificationModel, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [37]:
dt = DecisionTreeClassifier(featuresCol="features", labelCol= "indexed_label")
dtModel = dt.fit(train_data)
dtModel

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_eb084f5ef648, depth=4, numNodes=13, numClasses=2, numFeatures=4



*   **featuresCol :** caractéristiques
*   **labelCol :** label (étiquette)



In [38]:
print(dtModel.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_eb084f5ef648, depth=4, numNodes=13, numClasses=2, numFeatures=4
  If (feature 0 in {2.0})
   Predict: 0.0
  Else (feature 0 not in {2.0})
   If (feature 2 in {1.0})
    If (feature 3 in {0.0})
     Predict: 0.0
    Else (feature 3 not in {0.0})
     If (feature 0 in {1.0})
      Predict: 0.0
     Else (feature 0 not in {1.0})
      Predict: 1.0
   Else (feature 2 not in {1.0})
    If (feature 0 in {0.0})
     If (feature 3 in {0.0})
      Predict: 0.0
     Else (feature 3 not in {0.0})
      Predict: 1.0
    Else (feature 0 not in {0.0})
     Predict: 1.0



## Model Selection and Tuning

In [39]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder



In [40]:

dt_paramGrid = ParamGridBuilder()\
        .addGrid(dt.maxBins, [40,42])\
        .addGrid(dt.minInstancesPerNode, [10,100]) \
        .build()
dt_paramGrid

[{Param(parent='DecisionTreeClassifier_eb084f5ef648', name='maxBins', doc='Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature.'): 40,
  Param(parent='DecisionTreeClassifier_eb084f5ef648', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 10},
 {Param(parent='DecisionTreeClassifier_eb084f5ef648', name='maxBins', doc='Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature.'): 40,
  Param(parent='DecisionTreeClassifier_eb084f5ef648', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'

=> Ici fait un gridsearch

In [41]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Use BinaryClassificationEvaluator to evaluate our model
evaluatorPR = BinaryClassificationEvaluator(labelCol = "indexed_label", rawPredictionCol = "prediction", metricName = "areaUnderPR")
evaluatorAUC = BinaryClassificationEvaluator(labelCol = "indexed_label", rawPredictionCol = "prediction", metricName = "areaUnderROC")


In [42]:
from pyspark.ml.evaluation import RegressionEvaluator

In [43]:
# Build out the cross validation

#create k folds with k=5.
cv = CrossValidator(estimator=dt, \
                    estimatorParamMaps=dt_paramGrid, \
                    evaluator=evaluatorPR, \
                    numFolds=5, \
                    parallelism=2)


In [44]:
cvModel = cv.fit(train_data)

In [45]:
bestModel = cvModel.bestModel
print(bestModel.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_eb084f5ef648, depth=0, numNodes=1, numClasses=2, numFeatures=4
  Predict: 0.0



In [46]:
train_pred = cvModel.transform(train_data)
train_pred.show()

+-----------------+-------------+-------------+--------------------+----------+
|         features|indexed_label|rawPrediction|         probability|prediction|
+-----------------+-------------+-------------+--------------------+----------+
|[1.0,1.0,0.0,0.0]|          1.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|[1.0,1.0,0.0,1.0]|          1.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|[2.0,1.0,0.0,0.0]|          0.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|        (4,[],[])|          0.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|[0.0,2.0,1.0,0.0]|          0.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|[0.0,2.0,1.0,1.0]|          1.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|[2.0,2.0,1.0,1.0]|          0.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|    (4,[0],[1.0])|          1.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|[1.0,2.0,1.0,0.0]|          0.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|    (4,[2],[1.0])|          0.0|    [9.

In [47]:
train_pred = bestModel.transform(train_data)
train_pred.show()

+-----------------+-------------+-------------+--------------------+----------+
|         features|indexed_label|rawPrediction|         probability|prediction|
+-----------------+-------------+-------------+--------------------+----------+
|[1.0,1.0,0.0,0.0]|          1.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|[1.0,1.0,0.0,1.0]|          1.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|[2.0,1.0,0.0,0.0]|          0.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|        (4,[],[])|          0.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|[0.0,2.0,1.0,0.0]|          0.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|[0.0,2.0,1.0,1.0]|          1.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|[2.0,2.0,1.0,1.0]|          0.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|    (4,[0],[1.0])|          1.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|[1.0,2.0,1.0,0.0]|          0.0|    [9.0,5.0]|[0.64285714285714...|       0.0|
|    (4,[2],[1.0])|          0.0|    [9.

#auto

In [48]:
!wget --no-verbose https://nuage.lip6.fr/s/89BG8HD9r3iE693/download/MLData.tgz -O /tmp/MLData.tgz

2023-11-10 18:24:01 URL:https://nuage.lip6.fr/s/89BG8HD9r3iE693/download/MLData.tgz [19397838/19397838] -> "/tmp/MLData.tgz" [1]


In [49]:
!tar -xzvf /tmp/MLData.tgz  --directory /tmp/

MLData/
MLData/._loan.csv
MLData/loan.csv
MLData/autos.csv


In [50]:
!rm  /tmp/MLData.tgz
!rm /tmp/MLData/\._loan.csv
!ls -hal /tmp/MLData

total 73M
drwxr-xr-x 2  501 staff 4.0K Nov 10 18:24 .
drwxrwxrwt 1 root root  4.0K Nov 10 18:24 ..
-rw-r--r-- 1  501 staff  66M Jan  6  2022 autos.csv
-rw-r--r-- 1  501 staff 6.8M Jan  6  2022 loan.csv


In [51]:
dir = "/tmp/MLData/"
data =  spark.read\
            .format("csv").option("header", "true")\
            .option("inferSchema", "true")\
            .load(dir +"autos.csv")

In [52]:
data.count()

371824

In [53]:
sample =data.sample(0.01)
sample.count()

3696

In [54]:
car_data = sample.select("name", "price", "vehicleType", "kilometer", "fuelType")


In [55]:
label = 'price'
features_col = car_data.columns
features_col.remove(label)

In [56]:
categoricalColumns = [col for col, dtype in car_data.dtypes if dtype == 'string'and col!=label]
numericalColumns = [col for col, dtype in car_data.dtypes if dtype != 'string' and col!=label]


In [57]:
def string_index_cols(cols,prefix):
  outCols = map(lambda c:prefix+c, cols)
  return StringIndexer(inputCols=cols,outputCols=list(outCols), handleInvalid="skip")


In [58]:
prefix = 'indexed_'
features_string_indexer = string_index_cols(features_col,prefix)
#df = features_string_indexer.fit(car_data).transform(car_data)
#df.show()

In [59]:
vec_assembler = VectorAssembler(inputCols= features_string_indexer.getOutputCols(), outputCol= 'vector')

vec_indexer = VectorIndexer(inputCol='vector',\
                            outputCol='features',\
                            maxCategories=20)

In [60]:
stages = [features_string_indexer,vec_assembler,vec_indexer]

In [61]:
pipeline = Pipeline(stages = stages)
train_data = pipeline.fit(data).transform(data).select("features",label)
train_data.show()


+--------------------+-----+
|            features|price|
+--------------------+-----+
|[30791.0,5.0,1.0,...|18300|
|[108000.0,6.0,1.0...| 9800|
|[95437.0,1.0,0.0,...| 1500|
|[23483.0,1.0,3.0,...| 3600|
|   (4,[0],[51437.0])|  650|
| [180.0,4.0,0.0,0.0]| 2200|
|[193425.0,0.0,9.0...|    0|
|[84584.0,3.0,10.0...|14500|
|    (4,[0],[2344.0])| 2000|
| [352.0,2.0,0.0,1.0]| 2799|
|[199588.0,2.0,0.0...|  999|
|[134411.0,6.0,5.0...|17999|
|[18341.0,1.0,8.0,...|  450|
|[3413.0,1.0,0.0,0.0]| 1750|
|[4936.0,3.0,0.0,1.0]| 7550|
|[118961.0,3.0,0.0...| 1850|
|[26146.0,5.0,2.0,...|10400|
|   (4,[0],[59386.0])| 3699|
|[23876.0,1.0,0.0,...|  450|
|  (4,[0],[112286.0])|  500|
+--------------------+-----+
only showing top 20 rows



In [62]:

#train_data = train_data.dropna()

In [63]:
from pyspark.ml.regression import DecisionTreeRegressor

dt = DecisionTreeRegressor(featuresCol="features", labelCol= label, maxDepth=4,maxBins = 233698)
dtModel = dt.fit(train_data)
dtModel

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: ignored

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, Imputer
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Initialize a Spark session
spark = SparkSession.builder.appName("CarPricePrediction").getOrCreate()

# Load and preprocess the data
# Assuming 'car_data' is a DataFrame loaded from a relevant table with car prices
# and it includes both categorical and numerical features, and some of them might have null values.

# Handling categorical features with StringIndexer
label = 'price'
features_col = car_data.columns
features_col.remove(label)

categoricalColumns = [col for col, dtype in car_data.dtypes if dtype == 'string']
indexers = [StringIndexer(inputCol=c, outputCol=c+"_indexed").fit(car_data) for c in features_col]

# Imputing null values in numerical columns
numericalColumns = [col for col, dtype in car_data.dtypes if dtype != 'string' ]
imputer = Imputer(inputCols=numericalColumns, outputCols=numericalColumns)
imputer2 = Imputer(inputCols=categoricalColumns, outputCols=categoricalColumns)

# Assemble vectors
assembler = VectorAssembler(inputCols=[indexer.getOutputCol() for indexer in indexers] + numericalColumns, outputCol="features")

# Define the regression model
dt = DecisionTreeRegressor(labelCol="price", featuresCol="features")

# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=indexers + [imputer, imputer2, assembler, dt])

# Define the parameter grid for model tuning
paramGrid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [5, 10, 15]) \
    .addGrid(dt.maxBins, [32, 64, 128]) \
    .build()

# Define the evaluator for selecting the best model
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")

# Configure the cross-validation process
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(car_data)

# Retrieve the best model
bestModel = cvModel.bestModel

# Feature importance from the best DecisionTree model
featureImportance = bestModel.stages[-1].featureImportances

# Stop the Spark session
spark.stop()

# Output feature importance
print("Feature Importance for the Best Model:")
print(featureImportance)