Update III. Getting started with machine learning pipelines.py

ortizfram · web-flow · commit fe44b50edcb8 · 2022-11-14T11:52:24.000-03:00
diff --git a/Introduction to PySpark/III. Getting started with machine learning pipelines.py b/Introduction to PySpark/III. Getting started with machine learning pipelines.py
@@ -79,6 +79,10 @@
     2 > encode w/ 'OneHotEncoder'.
             carr_encoder = OneHotEncoder(inputCol='carrier_index',outputCol='carrier_fact')
     - 'Pipeline' will take care of the rest.
+    -----------------------
+      > 'VectorAssembler'  -> combine all of the columns containing our features into a single column
+                          inputCols= ['column_name1','c2','c3']
+                          outputCol= 'features'
 """
 #|
 #|
@@ -104,3 +108,8 @@
 #|
 #|
 ### Assemble a vector
+# Make a VectorAssembler that combines the listed feature columns into a single 'features' column
+vec_assembler = VectorAssembler(inputCols=["month", "air_time", "carrier_fact", "dest_fact", "plane_age"], outputCol='features')
+#|
+#|
+###