In [22]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
# Start (or reuse) a local Spark session that uses all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("sparkdev-stringindexer-demo")
    .getOrCreate()
)

23/04/01 06:02:18 WARN Utils: Your hostname, Pavans-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.29.143 instead (on interface en0)
23/04/01 06:02:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/01 06:02:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [10]:
# Define the structure of the DataFrame: an integer id and a technology
# name, both declared non-nullable.
schema = StructType(
    [
        StructField("id", IntegerType(), False),
        StructField("Technology", StringType(), False),
    ]
)

# Six sample rows; "Java" and "SpringBoot" each occur twice, the others once.
df = spark.createDataFrame(
    [
        (0, "Java"),
        (1, "NodeJS"),
        (2, "SpringBoot"),
        (3, "Java"),
        (4, "MongoDB"),
        (5, "SpringBoot"),
    ],
    schema=schema,
    verifySchema=True,
)

In [11]:
# Sanity-check the DataFrame: print the column types/nullability first,
# then the rows themselves without truncating long string values.
df.printSchema()
df.show(truncate=False)

root
 |-- id: integer (nullable = false)
 |-- Technology: string (nullable = false)

+---+----------+
|id |Technology|
+---+----------+
|0  |Java      |
|1  |NodeJS    |
|2  |SpringBoot|
|3  |Java      |
|4  |MongoDB   |
|5  |SpringBoot|
+---+----------+



In [12]:
from pyspark.ml.feature import StringIndexer

In [15]:
# Encode each distinct Technology string as a numeric label index.
# With "frequencyAsc" the least frequent label receives index 0.0, so the
# labels that occur twice ("Java", "SpringBoot") get the highest indices.
indexer = StringIndexer(
    inputCol="Technology",
    outputCol="TechnologyIndexed",
    stringOrderType="frequencyAsc",
)

# Learn the label -> index mapping from df and apply it to the same data.
indexed = indexer.fit(df).transform(df)
indexed.show()

+---+----------+-----------------+
| id|Technology|TechnologyIndexed|
+---+----------+-----------------+
|  0|      Java|              2.0|
|  1|    NodeJS|              1.0|
|  2|SpringBoot|              3.0|
|  3|      Java|              2.0|
|  4|   MongoDB|              0.0|
|  5|SpringBoot|              3.0|
+---+----------+-----------------+



In [17]:
from pyspark.ml.feature import OneHotEncoder

In [19]:
# One-hot encode the indexed column. dropLast=False keeps one vector slot
# per category, so four categories produce sparse vectors of size 4.
encoder = OneHotEncoder(
    inputCols=["TechnologyIndexed"],
    outputCols=["TechnologyVec"],
    dropLast=False,
)

encoded = encoder.fit(indexed).transform(indexed)
encoded.show(truncate=False)

+---+----------+-----------------+-------------+
|id |Technology|TechnologyIndexed|TechnologyVec|
+---+----------+-----------------+-------------+
|0  |Java      |2.0              |(4,[2],[1.0])|
|1  |NodeJS    |1.0              |(4,[1],[1.0])|
|2  |SpringBoot|3.0              |(4,[3],[1.0])|
|3  |Java      |2.0              |(4,[2],[1.0])|
|4  |MongoDB   |0.0              |(4,[0],[1.0])|
|5  |SpringBoot|3.0              |(4,[3],[1.0])|
+---+----------+-----------------+-------------+



In [20]:
# Same encoding, but with the default dropLast setting: the last category
# (index 3.0) is dropped and shows up as an all-zeros vector of size 3.
encoder1 = OneHotEncoder(
    inputCols=["TechnologyIndexed"],
    outputCols=["TechnologyVec"],
)

# NOTE: this rebinds `encoded`, replacing the dropLast=False result above.
encoded = encoder1.fit(indexed).transform(indexed)
encoded.show(truncate=False)

+---+----------+-----------------+-------------+
|id |Technology|TechnologyIndexed|TechnologyVec|
+---+----------+-----------------+-------------+
|0  |Java      |2.0              |(3,[2],[1.0])|
|1  |NodeJS    |1.0              |(3,[1],[1.0])|
|2  |SpringBoot|3.0              |(3,[],[])    |
|3  |Java      |2.0              |(3,[2],[1.0])|
|4  |MongoDB   |0.0              |(3,[0],[1.0])|
|5  |SpringBoot|3.0              |(3,[],[])    |
+---+----------+-----------------+-------------+



In [21]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [24]:
# Sample rows: an id followed by three numeric feature values.
data = [
    (0, 1.2, 4.3, 5.4),
    (1, 2.2, 3.1, 6.5),
    (2, 0.9, 2.3, 5.6),
    (3, 1.5, 4.1, 7.2),
]

# Build a DataFrame from the sample rows and display it.
# NOTE: this rebinds `df`, replacing the Technology DataFrame defined earlier.
df = spark.createDataFrame(data, ["id", "feat1", "feat2", "feat3"])
df.show()

# VectorAssembler merges the listed numeric columns into a single vector
# column named "features".
assembler = VectorAssembler(
    inputCols=["feat1", "feat2", "feat3"],
    outputCol="features",
)

# Append the assembled vector column to the input rows.
output = assembler.transform(df)

# Display only the id alongside its assembled feature vector.
output.select("id", "features").show()

+---+-----+-----+-----+
| id|feat1|feat2|feat3|
+---+-----+-----+-----+
|  0|  1.2|  4.3|  5.4|
|  1|  2.2|  3.1|  6.5|
|  2|  0.9|  2.3|  5.6|
|  3|  1.5|  4.1|  7.2|
+---+-----+-----+-----+

+---+-------------+
| id|     features|
+---+-------------+
|  0|[1.2,4.3,5.4]|
|  1|[2.2,3.1,6.5]|
|  2|[0.9,2.3,5.6]|
|  3|[1.5,4.1,7.2]|
+---+-------------+

