In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the Spark session for this notebook; getOrCreate() hands back an
# already-running session when there is one instead of starting another.
spark = (
    SparkSession.builder
    .appName('One Hot Encoding')
    .getOrCreate()
)

In [3]:
# Echo the session object so the notebook renders its summary.
spark

In [4]:
# Load the Adult Income CSV: the first row supplies the column names and
# column types are inferred from the data rather than read as strings.
sparkDataFrame = spark.read.csv(
    '/content/AdultIncome.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Preview the loaded rows (show()'s defaults, spelled out explicitly).
sparkDataFrame.show(n=20, truncate=True)

+---+-------+------------+-------------+-------------+------+-----------+------+
|Age|IsAdult|         Job|Qualification|MaritalStatus|Gender|BankBalance|Salary|
+---+-------+------------+-------------+-------------+------+-----------+------+
| 39|      1|     GovtJob|     Bachelor|    UnMarried|  Male|      77516| <=50K|
| 50|      1|SelfEmployed|     Bachelor|      Married|  Male|      83311| <=50K|
| 38|      1|  PrivateJob|     Bachelor|     Divorced|  Male|     215646| <=50K|
| 53|      1|  PrivateJob|       School|      Married|  Male|     234721| <=50K|
| 28|      1|  PrivateJob|     Bachelor|      Married|Female|     338409| <=50K|
| 37|      1|  PrivateJob|      Masters|      Married|Female|     284582| <=50K|
| 49|      1|  PrivateJob|       School|      Married|Female|     160187| <=50K|
| 52|      1|SelfEmployed|     Bachelor|      Married|  Male|     209642|  >50K|
| 31|      1|  PrivateJob|      Masters|    UnMarried|Female|      45781|  >50K|
| 42|      1|  PrivateJob|  

In [6]:
# List the column names of the loaded frame.
sparkDataFrame.columns

['Age',
 'IsAdult',
 'Job',
 'Qualification',
 'MaritalStatus',
 'Gender',
 'BankBalance',
 'Salary']

In [7]:
# Print each column's inferred type and nullability.
sparkDataFrame.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- IsAdult: integer (nullable = true)
 |-- Job: string (nullable = true)
 |-- Qualification: string (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- BankBalance: integer (nullable = true)
 |-- Salary: string (nullable = true)



In [8]:
# Report the frame's dimensions as a (row count, column count) tuple.
frameShape = (sparkDataFrame.count(), len(sparkDataFrame.columns))
print(frameShape)

(32561, 8)


In [9]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

In [10]:
# Configure a single indexer for the four string-typed categorical columns;
# each input column is paired positionally with its indexed output column.
stringIndexer = StringIndexer(
    inputCols=['Job', 'Qualification', 'MaritalStatus', 'Gender'],
    outputCols=['IJob', 'IQualification', 'IMaritalStatus', 'IGender'],
)

In [11]:
# Fit the indexer on the raw frame. NOTE: despite its name, the result is the
# fitted indexer model (it is later used via .transform), not a DataFrame.
sparkDataFrameFitted = stringIndexer.fit(sparkDataFrame)

In [14]:
# Append the indexed columns produced by the fitted StringIndexer model.
# The model's own output columns are dropped first so this cell can be re-run
# safely: transform() raises if an output column already exists, while
# DataFrame.drop() ignores names that are not present, so the first run is
# unaffected.
sparkDataFrame = sparkDataFrameFitted.transform(
    sparkDataFrame.drop(*sparkDataFrameFitted.getOutputCols())
)

In [15]:
# Peek at the first five rows to check the newly added index columns.
sparkDataFrame.show(n=5)

+---+-------+------------+-------------+-------------+------+-----------+------+----+--------------+--------------+-------+
|Age|IsAdult|         Job|Qualification|MaritalStatus|Gender|BankBalance|Salary|IJob|IQualification|IMaritalStatus|IGender|
+---+-------+------------+-------------+-------------+------+-----------+------+----+--------------+--------------+-------+
| 39|      1|     GovtJob|     Bachelor|    UnMarried|  Male|      77516| <=50K| 2.0|           0.0|           1.0|    0.0|
| 50|      1|SelfEmployed|     Bachelor|      Married|  Male|      83311| <=50K| 1.0|           0.0|           0.0|    0.0|
| 38|      1|  PrivateJob|     Bachelor|     Divorced|  Male|     215646| <=50K| 0.0|           0.0|           2.0|    0.0|
| 53|      1|  PrivateJob|       School|      Married|  Male|     234721| <=50K| 0.0|           1.0|           0.0|    0.0|
| 28|      1|  PrivateJob|     Bachelor|      Married|Female|     338409| <=50K| 0.0|           0.0|           0.0|    1.0|
+---+---

In [16]:
# Configure one-hot encoding for the integer IsAdult flag and the four index
# columns; each input column is paired positionally with its output column.
oneHotEncoder = OneHotEncoder(
    inputCols=['IsAdult', 'IJob', 'IQualification', 'IMaritalStatus', 'IGender'],
    outputCols=['DIsAdult', 'DIJob', 'DIQualification', 'DIMaritalStatus', 'DIGender'],
)

In [17]:
# Fit the encoder on the indexed frame; the fitted model is applied to the
# frame in a later cell via .transform.
oneHotEncoderFitted = oneHotEncoder.fit(sparkDataFrame)

In [18]:
# Append the one-hot vector columns produced by the fitted encoder model.
# The model's own output columns are dropped first so this cell can be re-run
# safely: transform() raises if an output column already exists, while
# DataFrame.drop() ignores names that are not present, so the first run is
# unaffected.
sparkDataFrame = oneHotEncoderFitted.transform(
    sparkDataFrame.drop(*oneHotEncoderFitted.getOutputCols())
)

In [19]:
# Inspect the first ten rows, now including the sparse one-hot vector columns.
sparkDataFrame.show(n=10)

+---+-------+------------+-------------+-------------+------+-----------+------+----+--------------+--------------+-------+---------+-------------+---------------+---------------+-------------+
|Age|IsAdult|         Job|Qualification|MaritalStatus|Gender|BankBalance|Salary|IJob|IQualification|IMaritalStatus|IGender| DIsAdult|        DIJob|DIQualification|DIMaritalStatus|     DIGender|
+---+-------+------------+-------------+-------------+------+-----------+------+----+--------------+--------------+-------+---------+-------------+---------------+---------------+-------------+
| 39|      1|     GovtJob|     Bachelor|    UnMarried|  Male|      77516| <=50K| 2.0|           0.0|           1.0|    0.0|(1,[],[])|    (2,[],[])|  (3,[0],[1.0])|  (3,[1],[1.0])|(1,[0],[1.0])|
| 50|      1|SelfEmployed|     Bachelor|      Married|  Male|      83311| <=50K| 1.0|           0.0|           0.0|    0.0|(1,[],[])|(2,[1],[1.0])|  (3,[0],[1.0])|  (3,[0],[1.0])|(1,[0],[1.0])|
| 38|      1|  PrivateJob|    

In [20]:
# Shut down the Spark session now that the walkthrough is finished.
spark.stop()