Update III. Getting started with machine learning pipelines.py

ortizfram · web-flow · commit 0130f769a7e1 · 2022-11-14T11:30:48.000-03:00
diff --git a/Introduction to PySpark/III. Getting started with machine learning pipelines.py b/Introduction to PySpark/III. Getting started with machine learning pipelines.py
@@ -74,8 +74,10 @@
    > pyspark.ml.features submodule
     'one-hot vectors'      -> all elements are zero except for at most one element, which has a value of one (1).
     
-    - create a 'StringIndexer'
-    - encode w/ 'OneHotEncoder'
+    1 > create a 'StringIndexer'.
+            carr_indexer = StringIndexer(inputCol='carrier',outputCol='carrier_index')
+    2 > encode w/ 'OneHotEncoder'.
+            carr_encoder = OneHotEncoder(inputCol='carrier_index',outputCol='carrier_fact')
     - 'Pipeline' will take care of the rest.
 """
 #|
@@ -86,3 +88,11 @@
 #|
 #|
 ### Carrier
+# Create a StringIndexer: reads the 'carrier' column, writes 'carrier_index'
+carr_indexer = StringIndexer(inputCol='carrier', outputCol='carrier_index')
+
+# Create a OneHotEncoder: reads 'carrier_index', writes the one-hot 'carrier_fact'
+carr_encoder = OneHotEncoder(inputCol='carrier_index', outputCol='carrier_fact')
+#|
+#|
+### Destination