In [172]:
from pyspark.ml.feature import Binarizer
from pyspark.ml import Transformer

from pyspark.ml.feature import Tokenizer, RegexTokenizer

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params, TypeConverters
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable

from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.linalg import Vectors

from pyspark.ml.functions import vector_to_array
import pyspark.sql.functions as f
from pyspark.ml.feature import VectorAssembler

from pyspark.ml import Pipeline

In [173]:

import findspark
from pyspark.sql import SparkSession

In [174]:
findspark.init()
spark = SparkSession\
        .builder\
        .master("local[2]")\
        .appName("Spark MLlib")\
        .getOrCreate()
spark.sparkContext

In [175]:
df = spark.read.csv("Otodom_Flat_Listings.csv", header=True, inferSchema=True)
df.show()

+--------------------+---------+--------------------+-------+---------------+--------+-------------------+--------+--------------------+----------------------+--------------------+------------------+-------------------+
|               Title|    Price|            Location|Surface|Number_of_Rooms|   Floor|Finishing_Condition| Heating|       Parking_Space|Balcony_Garden_Terrace|                Link|       Voivodeship|               City|
+--------------------+---------+--------------------+-------+---------------+--------+-------------------+--------+--------------------+----------------------+--------------------+------------------+-------------------+
|2 pokoje 47m2 po ...| 415000.0|ul. Marysińska, S...|   47.0|             2 |    NULL|    do zamieszkania|miejskie|garaż/miejsce par...|                balkon|https://www.otodo...|           Łódzkie|               Łódź|
|Właściciel- Ludwi...|2499000.0|ul. Ludwiki, Czys...|  105.0|             4 |     2/8|     do wykończenia|miejskie|garaż

In [176]:
cols = ['Floor', 'Finishing_Condition', 'Heating', 'Parking_Space', 'Balcony_Garden_Terrace', 'Voivodeship', 'City']

colsn = [col + "_index" for col in cols]
hotcols = [col + "_hot" for col in cols]

In [177]:
indexer = StringIndexer(inputCols=cols, outputCols=colsn,  handleInvalid="keep")
indexer_fitted = indexer.fit(df)
df_indexed = indexer_fitted.transform(df)



In [178]:
df_indexed.printSchema()

root
 |-- Title: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- Surface: string (nullable = true)
 |-- Number_of_Rooms: string (nullable = true)
 |-- Floor: string (nullable = true)
 |-- Finishing_Condition: string (nullable = true)
 |-- Heating: string (nullable = true)
 |-- Parking_Space: string (nullable = true)
 |-- Balcony_Garden_Terrace: string (nullable = true)
 |-- Link: string (nullable = true)
 |-- Voivodeship: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Floor_index: double (nullable = false)
 |-- Finishing_Condition_index: double (nullable = false)
 |-- Heating_index: double (nullable = false)
 |-- Parking_Space_index: double (nullable = false)
 |-- Balcony_Garden_Terrace_index: double (nullable = false)
 |-- Voivodeship_index: double (nullable = false)
 |-- City_index: double (nullable = false)



In [179]:
encoder = OneHotEncoder(inputCols=colsn, outputCols=hotcols)
df_onehot = encoder.fit(df_indexed).transform(df_indexed)
df_onehot.printSchema()

root
 |-- Title: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- Surface: string (nullable = true)
 |-- Number_of_Rooms: string (nullable = true)
 |-- Floor: string (nullable = true)
 |-- Finishing_Condition: string (nullable = true)
 |-- Heating: string (nullable = true)
 |-- Parking_Space: string (nullable = true)
 |-- Balcony_Garden_Terrace: string (nullable = true)
 |-- Link: string (nullable = true)
 |-- Voivodeship: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Floor_index: double (nullable = false)
 |-- Finishing_Condition_index: double (nullable = false)
 |-- Heating_index: double (nullable = false)
 |-- Parking_Space_index: double (nullable = false)
 |-- Balcony_Garden_Terrace_index: double (nullable = false)
 |-- Voivodeship_index: double (nullable = false)
 |-- City_index: double (nullable = false)
 |-- Floor_hot: vector (nullable = true)
 |-- Finishing_Condition_hot: vector (nullable = true)
 |--

In [180]:
newcols=['col' + col for col in cols]

In [181]:
newcols

['colFloor',
 'colFinishing_Condition',
 'colHeating',
 'colParking_Space',
 'colBalcony_Garden_Terrace',
 'colVoivodeship',
 'colCity']

In [182]:
for x in range(len(cols)):
 df_col_onehot = df_col_onehot.withColumn(newcols[x], vector_to_array(hotcols[x]))

In [183]:
df_col_onehot.show()

+--------------------+---------+--------------------+-------+---------------+--------+-------------------+--------+--------------------+----------------------+--------------------+------------------+-------------------+-----------+-------------------------+-------------+-------------------+----------------------------+-----------------+----------+----------------+-----------------------+-------------+-----------------+--------------------------+---------------+-----------------+--------------------+--------------------+----------------------+--------------------+----------------+-------------------------+--------------------+
|               Title|    Price|            Location|Surface|Number_of_Rooms|   Floor|Finishing_Condition| Heating|       Parking_Space|Balcony_Garden_Terrace|                Link|       Voivodeship|               City|Floor_index|Finishing_Condition_index|Heating_index|Parking_Space_index|Balcony_Garden_Terrace_index|Voivodeship_index|City_index|       Floor_hot|

In [184]:
for x in range(len(cols)):
  col_index = cols.index(cols[x])
  labels = indexer_fitted.labelsArray[col_index]
  num_categories_current_col = len(labels)
  cols_expanded = [(f.col('col'+cols[x])[i].alias(f'{labels[i]}')) for i in range(num_categories_current_col)]
  df_cols_onehot = df_col_onehot.select('*', *cols_expanded)

In [185]:
df_cols_onehot.show()

+--------------------+---------+--------------------+-------+---------------+--------+-------------------+--------+--------------------+----------------------+--------------------+------------------+-------------------+-----------+-------------------------+-------------+-------------------+----------------------------+-----------------+----------+----------------+-----------------------+-------------+-----------------+--------------------------+---------------+-----------------+--------------------+--------------------+----------------------+--------------------+----------------+-------------------------+--------------------+--------+------+-------+------+------+----+---------+--------+--------+------+------+-----+-----------+------+---------+----------+---------+-----------+-------+--------+-------+---------+------+-----+-----------+----------+------------+------------+-----------+-----+-------+-------+-----+--------+-----------+-------------+---------+------+--------+-----+-------+--

In [186]:
df_final = df_cols_onehot.select("Price", "Surface", "Marketing Spend", "California", "New York", "profit")

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `R&D Spend` cannot be resolved. Did you mean one of the following? [`Chełm`, `Gliwice`, `Heating`, `Jaworzno`, `Kielce`].;
'Project ['R&D Spend, 'Administration, 'Marketing Spend, 'California, 'New York, 'profit]
+- Project [Title#9479, Price#9480, Location#9481, Surface#9482, Number_of_Rooms#9483, Floor#9484, Finishing_Condition#9485, Heating#9486, Parking_Space#9487, Balcony_Garden_Terrace#9488, Link#9489, Voivodeship#9490, City#9491, Floor_index#9644, Finishing_Condition_index#9645, Heating_index#9646, Parking_Space_index#9647, Balcony_Garden_Terrace_index#9648, Voivodeship_index#9649, City_index#9650, Floor_hot#9720, Finishing_Condition_hot#9721, Heating_hot#9722, Parking_Space_hot#9723, ... 246 more fields]
   +- Project [Title#9479, Price#9480, Location#9481, Surface#9482, Number_of_Rooms#9483, Floor#9484, Finishing_Condition#9485, Heating#9486, Parking_Space#9487, Balcony_Garden_Terrace#9488, Link#9489, Voivodeship#9490, City#9491, Floor_index#9644, Finishing_Condition_index#9645, Heating_index#9646, Parking_Space_index#9647, Balcony_Garden_Terrace_index#9648, Voivodeship_index#9649, City_index#9650, Floor_hot#9720, Finishing_Condition_hot#9721, Heating_hot#9722, Parking_Space_hot#9723, ... 10 more fields]
      +- Project [Title#9479, Price#9480, Location#9481, Surface#9482, Number_of_Rooms#9483, Floor#9484, Finishing_Condition#9485, Heating#9486, Parking_Space#9487, Balcony_Garden_Terrace#9488, Link#9489, Voivodeship#9490, City#9491, Floor_index#9644, Finishing_Condition_index#9645, Heating_index#9646, Parking_Space_index#9647, Balcony_Garden_Terrace_index#9648, Voivodeship_index#9649, City_index#9650, Floor_hot#9720, Finishing_Condition_hot#9721, Heating_hot#9722, Parking_Space_hot#9723, ... 10 more fields]
         +- Project [Title#9479, Price#9480, Location#9481, Surface#9482, Number_of_Rooms#9483, Floor#9484, Finishing_Condition#9485, Heating#9486, Parking_Space#9487, Balcony_Garden_Terrace#9488, Link#9489, Voivodeship#9490, City#9491, Floor_index#9644, Finishing_Condition_index#9645, Heating_index#9646, Parking_Space_index#9647, Balcony_Garden_Terrace_index#9648, Voivodeship_index#9649, City_index#9650, Floor_hot#9720, Finishing_Condition_hot#9721, Heating_hot#9722, Parking_Space_hot#9723, ... 10 more fields]
            +- Project [Title#9479, Price#9480, Location#9481, Surface#9482, Number_of_Rooms#9483, Floor#9484, Finishing_Condition#9485, Heating#9486, Parking_Space#9487, Balcony_Garden_Terrace#9488, Link#9489, Voivodeship#9490, City#9491, Floor_index#9644, Finishing_Condition_index#9645, Heating_index#9646, Parking_Space_index#9647, Balcony_Garden_Terrace_index#9648, Voivodeship_index#9649, City_index#9650, Floor_hot#9720, Finishing_Condition_hot#9721, Heating_hot#9722, Parking_Space_hot#9723, ... 10 more fields]
               +- Project [Title#9479, Price#9480, Location#9481, Surface#9482, Number_of_Rooms#9483, Floor#9484, Finishing_Condition#9485, Heating#9486, Parking_Space#9487, Balcony_Garden_Terrace#9488, Link#9489, Voivodeship#9490, City#9491, Floor_index#9644, Finishing_Condition_index#9645, Heating_index#9646, Parking_Space_index#9647, Balcony_Garden_Terrace_index#9648, Voivodeship_index#9649, City_index#9650, Floor_hot#9720, Finishing_Condition_hot#9721, Heating_hot#9722, Parking_Space_hot#9723, ... 10 more fields]
                  +- Project [Title#9479, Price#9480, Location#9481, Surface#9482, Number_of_Rooms#9483, Floor#9484, Finishing_Condition#9485, Heating#9486, Parking_Space#9487, Balcony_Garden_Terrace#9488, Link#9489, Voivodeship#9490, City#9491, Floor_index#9644, Finishing_Condition_index#9645, Heating_index#9646, Parking_Space_index#9647, Balcony_Garden_Terrace_index#9648, Voivodeship_index#9649, City_index#9650, Floor_hot#9720, Finishing_Condition_hot#9721, Heating_hot#9722, Parking_Space_hot#9723, ... 10 more fields]
                     +- Project [Title#9479, Price#9480, Location#9481, Surface#9482, Number_of_Rooms#9483, Floor#9484, Finishing_Condition#9485, Heating#9486, Parking_Space#9487, Balcony_Garden_Terrace#9488, Link#9489, Voivodeship#9490, City#9491, Floor_index#9644, Finishing_Condition_index#9645, Heating_index#9646, Parking_Space_index#9647, Balcony_Garden_Terrace_index#9648, Voivodeship_index#9649, City_index#9650, Floor_hot#9720, Finishing_Condition_hot#9721, Heating_hot#9722, Parking_Space_hot#9723, ... 10 more fields]
                        +- Project [Title#9479, Price#9480, Location#9481, Surface#9482, Number_of_Rooms#9483, Floor#9484, Finishing_Condition#9485, Heating#9486, Parking_Space#9487, Balcony_Garden_Terrace#9488, Link#9489, Voivodeship#9490, City#9491, Floor_index#9644, Finishing_Condition_index#9645, Heating_index#9646, Parking_Space_index#9647, Balcony_Garden_Terrace_index#9648, Voivodeship_index#9649, City_index#9650, Floor_hot#9720, Finishing_Condition_hot#9721, Heating_hot#9722, Parking_Space_hot#9723, ... 10 more fields]
                           +- Project [Title#9479, Price#9480, Location#9481, Surface#9482, Number_of_Rooms#9483, Floor#9484, Finishing_Condition#9485, Heating#9486, Parking_Space#9487, Balcony_Garden_Terrace#9488, Link#9489, Voivodeship#9490, City#9491, Floor_index#9644, Finishing_Condition_index#9645, Heating_index#9646, Parking_Space_index#9647, Balcony_Garden_Terrace_index#9648, Voivodeship_index#9649, City_index#9650, Floor_hot#9720, Finishing_Condition_hot#9721, Heating_hot#9722, Parking_Space_hot#9723, ... 10 more fields]
                              +- Project [Title#9479, Price#9480, Location#9481, Surface#9482, Number_of_Rooms#9483, Floor#9484, Finishing_Condition#9485, Heating#9486, Parking_Space#9487, Balcony_Garden_Terrace#9488, Link#9489, Voivodeship#9490, City#9491, Floor_index#9644, Finishing_Condition_index#9645, Heating_index#9646, Parking_Space_index#9647, Balcony_Garden_Terrace_index#9648, Voivodeship_index#9649, City_index#9650, Floor_hot#9720, Finishing_Condition_hot#9721, Heating_hot#9722, Parking_Space_hot#9723, ... 9 more fields]
                                 +- Project [Title#9479, Price#9480, Location#9481, Surface#9482, Number_of_Rooms#9483, Floor#9484, Finishing_Condition#9485, Heating#9486, Parking_Space#9487, Balcony_Garden_Terrace#9488, Link#9489, Voivodeship#9490, City#9491, Floor_index#9644, Finishing_Condition_index#9645, Heating_index#9646, Parking_Space_index#9647, Balcony_Garden_Terrace_index#9648, Voivodeship_index#9649, City_index#9650, Floor_hot#9720, Finishing_Condition_hot#9721, Heating_hot#9722, Parking_Space_hot#9723, ... 8 more fields]
                                    +- Project [Title#9479, Price#9480, Location#9481, Surface#9482, Number_of_Rooms#9483, Floor#9484, Finishing_Condition#9485, Heating#9486, Parking_Space#9487, Balcony_Garden_Terrace#9488, Link#9489, Voivodeship#9490, City#9491, Floor_index#9644, Finishing_Condition_index#9645, Heating_index#9646, Parking_Space_index#9647, Balcony_Garden_Terrace_index#9648, Voivodeship_index#9649, City_index#9650, Floor_hot#9720, Finishing_Condition_hot#9721, Heating_hot#9722, Parking_Space_hot#9723, ... 7 more fields]
                                       +- Project [Title#9479, Price#9480, Location#9481, Surface#9482, Number_of_Rooms#9483, Floor#9484, Finishing_Condition#9485, Heating#9486, Parking_Space#9487, Balcony_Garden_Terrace#9488, Link#9489, Voivodeship#9490, City#9491, Floor_index#9644, Finishing_Condition_index#9645, Heating_index#9646, Parking_Space_index#9647, Balcony_Garden_Terrace_index#9648, Voivodeship_index#9649, City_index#9650, Floor_hot#9720, Finishing_Condition_hot#9721, Heating_hot#9722, Parking_Space_hot#9723, ... 6 more fields]
                                          +- Project [Title#9479, Price#9480, Location#9481, Surface#9482, Number_of_Rooms#9483, Floor#9484, Finishing_Condition#9485, Heating#9486, Parking_Space#9487, Balcony_Garden_Terrace#9488, Link#9489, Voivodeship#9490, City#9491, Floor_index#9644, Finishing_Condition_index#9645, Heating_index#9646, Parking_Space_index#9647, Balcony_Garden_Terrace_index#9648, Voivodeship_index#9649, City_index#9650, Floor_hot#9720, Finishing_Condition_hot#9721, Heating_hot#9722, Parking_Space_hot#9723, ... 5 more fields]
                                             +- Project [Title#9479, Price#9480, Location#9481, Surface#9482, Number_of_Rooms#9483, Floor#9484, Finishing_Condition#9485, Heating#9486, Parking_Space#9487, Balcony_Garden_Terrace#9488, Link#9489, Voivodeship#9490, City#9491, Floor_index#9644, Finishing_Condition_index#9645, Heating_index#9646, Parking_Space_index#9647, Balcony_Garden_Terrace_index#9648, Voivodeship_index#9649, City_index#9650, Floor_hot#9720, Finishing_Condition_hot#9721, Heating_hot#9722, Parking_Space_hot#9723, ... 4 more fields]
                                                +- Project [Title#9479, Price#9480, Location#9481, Surface#9482, Number_of_Rooms#9483, Floor#9484, Finishing_Condition#9485, Heating#9486, Parking_Space#9487, Balcony_Garden_Terrace#9488, Link#9489, Voivodeship#9490, City#9491, Floor_index#9644, Finishing_Condition_index#9645, Heating_index#9646, Parking_Space_index#9647, Balcony_Garden_Terrace_index#9648, Voivodeship_index#9649, City_index#9650, UDF(cast(Floor_index#9644 as double), 0) AS Floor_hot#9720, UDF(cast(Finishing_Condition_index#9645 as double), 1) AS Finishing_Condition_hot#9721, UDF(cast(Heating_index#9646 as double), 2) AS Heating_hot#9722, UDF(cast(Parking_Space_index#9647 as double), 3) AS Parking_Space_hot#9723, ... 3 more fields]
                                                   +- Project [Title#9479, Price#9480, Location#9481, Surface#9482, Number_of_Rooms#9483, Floor#9484, Finishing_Condition#9485, Heating#9486, Parking_Space#9487, Balcony_Garden_Terrace#9488, Link#9489, Voivodeship#9490, City#9491, UDF(cast(Floor#9484 as string)) AS Floor_index#9644, UDF(cast(Finishing_Condition#9485 as string)) AS Finishing_Condition_index#9645, UDF(cast(Heating#9486 as string)) AS Heating_index#9646, UDF(cast(Parking_Space#9487 as string)) AS Parking_Space_index#9647, UDF(cast(Balcony_Garden_Terrace#9488 as string)) AS Balcony_Garden_Terrace_index#9648, UDF(cast(Voivodeship#9490 as string)) AS Voivodeship_index#9649, UDF(cast(City#9491 as string)) AS City_index#9650]
                                                      +- Relation [Title#9479,Price#9480,Location#9481,Surface#9482,Number_of_Rooms#9483,Floor#9484,Finishing_Condition#9485,Heating#9486,Parking_Space#9487,Balcony_Garden_Terrace#9488,Link#9489,Voivodeship#9490,City#9491] csv
