In [1]:
from pyspark.sql import SparkSession

In [3]:
# Start (or reuse) the Spark session that every later cell relies on.
# NOTE(review): the app name mentions ordinal encoding, but the cells visible
# here only assemble and scale numeric columns - confirm the name is intended.
spark = (
    SparkSession.builder
    .appName("Ordinal Encoding PySpark")
    .getOrCreate()
)

In [4]:
# Bare expression: renders the session object as this cell's output.
spark

In [9]:
# Read the CSV, treating the first row as column names and letting Spark
# infer each column's type from the data.
sparkDataframe = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/content/AdultIncome.csv')
)

In [10]:
# Preview the loaded rows (first 20, long cell values truncated).
sparkDataframe.show(n=20, truncate=True)

+---+-------+------------+-------------+-------------+------+-----------+------+
|Age|IsAdult|         Job|Qualification|MaritalStatus|Gender|BankBalance|Salary|
+---+-------+------------+-------------+-------------+------+-----------+------+
| 39|      1|     GovtJob|     Bachelor|    UnMarried|  Male|      77516| <=50K|
| 50|      1|SelfEmployed|     Bachelor|      Married|  Male|      83311| <=50K|
| 38|      1|  PrivateJob|     Bachelor|     Divorced|  Male|     215646| <=50K|
| 53|      1|  PrivateJob|       School|      Married|  Male|     234721| <=50K|
| 28|      1|  PrivateJob|     Bachelor|      Married|Female|     338409| <=50K|
| 37|      1|  PrivateJob|      Masters|      Married|Female|     284582| <=50K|
| 49|      1|  PrivateJob|       School|      Married|Female|     160187| <=50K|
| 52|      1|SelfEmployed|     Bachelor|      Married|  Male|     209642|  >50K|
| 31|      1|  PrivateJob|      Masters|    UnMarried|Female|      45781|  >50K|
| 42|      1|  PrivateJob|  

In [11]:
# Print the column names, types and nullability of the loaded DataFrame.
sparkDataframe.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- IsAdult: integer (nullable = true)
 |-- Job: string (nullable = true)
 |-- Qualification: string (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- BankBalance: integer (nullable = true)
 |-- Salary: string (nullable = true)



In [12]:
# List the column names, in schema order.
sparkDataframe.schema.names

['Age',
 'IsAdult',
 'Job',
 'Qualification',
 'MaritalStatus',
 'Gender',
 'BankBalance',
 'Salary']

In [13]:
from pyspark.ml.feature import VectorAssembler

In [14]:
# Configure an assembler that packs the two numeric columns into a single
# vector column named 'numFeature'.
featureAssember = (
    VectorAssembler()
    .setInputCols(['Age', 'BankBalance'])
    .setOutputCol('numFeature')
)

In [15]:
# Append the assembled vector column to the DataFrame.
# The assembler refuses to run when its output column already exists, and this
# cell rebinds `sparkDataframe`, so a second execution would otherwise fail.
# Dropping the output column first (a no-op when it is absent) keeps the cell
# safe to re-run while leaving the first-run result unchanged.
sparkDataframe = featureAssember.transform(
    sparkDataframe.drop(featureAssember.getOutputCol())
)

In [18]:
# Check the first five rows, now including the assembled vector column.
sparkDataframe.show(n=5)

+---+-------+------------+-------------+-------------+------+-----------+------+---------------+
|Age|IsAdult|         Job|Qualification|MaritalStatus|Gender|BankBalance|Salary|     numFeature|
+---+-------+------------+-------------+-------------+------+-----------+------+---------------+
| 39|      1|     GovtJob|     Bachelor|    UnMarried|  Male|      77516| <=50K| [39.0,77516.0]|
| 50|      1|SelfEmployed|     Bachelor|      Married|  Male|      83311| <=50K| [50.0,83311.0]|
| 38|      1|  PrivateJob|     Bachelor|     Divorced|  Male|     215646| <=50K|[38.0,215646.0]|
| 53|      1|  PrivateJob|       School|      Married|  Male|     234721| <=50K|[53.0,234721.0]|
| 28|      1|  PrivateJob|     Bachelor|      Married|Female|     338409| <=50K|[28.0,338409.0]|
+---+-------+------------+-------------+-------------+------+-----------+------+---------------+
only showing top 5 rows



In [19]:
from pyspark.ml.feature import StandardScaler

In [20]:
# Scaler over the assembled vector column. The defaults are spelled out on
# purpose: each feature is divided by its standard deviation but is NOT
# centred on its mean, which is why the scaled values stay positive.
standardScaler = StandardScaler(
    inputCol='numFeature',
    outputCol='scaledFeature',
    withMean=False,
    withStd=True,
)

In [21]:
# Fit the scaler on the data and append the scaled vector column.
# The scaler rejects input that already contains its output column, and this
# cell rebinds `sparkDataframe`, so re-running it would otherwise fail.
# Dropping the output column first (a no-op when it is absent) keeps the cell
# safe to re-run while leaving the first-run result unchanged.
scalerInput = sparkDataframe.drop(standardScaler.getOutputCol())
sparkDataframe = standardScaler.fit(scalerInput).transform(scalerInput)

In [22]:
# Check the first five rows, now including the scaled vector column.
sparkDataframe.show(n=5)

+---+-------+------------+-------------+-------------+------+-----------+------+---------------+--------------------+
|Age|IsAdult|         Job|Qualification|MaritalStatus|Gender|BankBalance|Salary|     numFeature|       scaledFeature|
+---+-------+------------+-------------+-------------+------+-----------+------+---------------+--------------------+
| 39|      1|     GovtJob|     Bachelor|    UnMarried|  Male|      77516| <=50K| [39.0,77516.0]|[2.85914686699289...|
| 50|      1|SelfEmployed|     Bachelor|      Married|  Male|      83311| <=50K| [50.0,83311.0]|[3.66557290640114...|
| 38|      1|  PrivateJob|     Bachelor|     Divorced|  Male|     215646| <=50K|[38.0,215646.0]|[2.78583540886487...|
| 53|      1|  PrivateJob|       School|      Married|  Male|     234721| <=50K|[53.0,234721.0]|[3.88550728078521...|
| 28|      1|  PrivateJob|     Bachelor|      Married|Female|     338409| <=50K|[28.0,338409.0]|[2.05272082758464...|
+---+-------+------------+-------------+-------------+--

In [24]:
# Pull the first scaled vector back to the driver as a one-element list of Row.
sparkDataframe.select('scaledFeature').limit(1).collect()

[Row(scaledFeature=DenseVector([2.8591, 0.7344]))]

In [25]:
# Show the scaled vectors at full width (no truncation) for the first 20 rows.
sparkDataframe.select('scaledFeature').show(n=20, truncate=False)

+----------------------------------------+
|scaledFeature                           |
+----------------------------------------+
|[2.8591468669928934,0.7344009131153691] |
|[3.6655729064011453,0.7893038143422586] |
|[2.7858354088648705,2.043070066949751]  |
|[3.8855072807852142,2.223790143033084]  |
|[2.0527208275846416,3.2061494221381257] |
|[2.7125239507368475,2.696182474020821]  |
|[3.5922614482731223,1.5176412491512932] |
|[3.8121958226571913,1.9861870610884491] |
|[2.27265520196871,0.43373765678485365]  |
|[3.079081241376962,1.510649300729301]   |
|[2.7125239507368475,2.6571677807934995] |
|[2.199343743840687,1.3386738972658847]  |
|[1.686163536944527,1.1584275304252336]  |
|[2.345966660096733,1.942387904510035]   |
|[2.9324583251209164,1.1536904379984099] |
|[2.492589576352779,2.3257892171674572]  |
|[1.8327864532005727,1.674619017991385]  |
|[2.345966660096733,1.7700051110979118]  |
|[2.7858354088648705,0.27368077786732636]|
|[3.152392699504985,2.7681199596145696]  |
+----------

In [26]:
# Shut down the Spark session; cells above must be re-run from the session
# creation cell before `spark` can be used again.
spark.stop()