<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#StringIndexer-for-multiple-columns" data-toc-modified-id="StringIndexer-for-multiple-columns-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>StringIndexer for multiple columns</a></span></li></ul></div>

In [1]:
import numpy as np
import pandas as pd
import pyspark
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf # @udf("integer") def myfunc(x,y): return x - y
from pyspark.sql import functions as F # stddev format_number date_format, dayofyear, when
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

print([(x.__name__,x.__version__) for x in [np, pd, pyspark]])

spark = pyspark.sql.SparkSession.builder.appName('example').getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)
sc.setLogLevel("INFO")

[('numpy', '1.17.5'), ('pandas', '1.0.5'), ('pyspark', '3.0.0')]


In [2]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

from pyspark.ml.regression import LinearRegression

# StringIndexer for multiple columns
https://stackoverflow.com/questions/36942233/apply-stringindexer-to-several-columns-in-a-pyspark-dataframe

In [3]:
df = pd.DataFrame({'address': [1111111, 1111111, 1111111, 1111112, 1111113,
                               1111111, 1111114, 1111115, 2111115],
          'date': [20151122045510, 20151122045501, 20151122045500,
                   20151122065832, 20160101003221, 20160703045231,
                   20150419134543, 20151123174302, 20123192],
          'name': ['Yin', 'Yin', 'Yln', 'Yun', 'Yan', 'Yin', 'Yin', 'Yen', 'Yen'],
          'food': ['gre', 'gre', 'gra', 'ddd', 'fdf', 'gre', 'fdf', 'ddd', 'gre']})
df

Unnamed: 0,address,date,name,food
0,1111111,20151122045510,Yin,gre
1,1111111,20151122045501,Yin,gre
2,1111111,20151122045500,Yln,gra
3,1111112,20151122065832,Yun,ddd
4,1111113,20160101003221,Yan,fdf
5,1111111,20160703045231,Yin,gre
6,1111114,20150419134543,Yin,fdf
7,1111115,20151123174302,Yen,ddd
8,2111115,20123192,Yen,gre


In [4]:
df = sqlContext.createDataFrame(df)
df.show()

+-------+--------------+----+----+
|address|          date|name|food|
+-------+--------------+----+----+
|1111111|20151122045510| Yin| gre|
|1111111|20151122045501| Yin| gre|
|1111111|20151122045500| Yln| gra|
|1111112|20151122065832| Yun| ddd|
|1111113|20160101003221| Yan| fdf|
|1111111|20160703045231| Yin| gre|
|1111114|20150419134543| Yin| fdf|
|1111115|20151123174302| Yen| ddd|
|2111115|      20123192| Yen| gre|
+-------+--------------+----+----+



In [5]:
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(df)
            for column in list(set(df.columns)-set(['date'])) ]


pipeline = Pipeline(stages=indexers)
df_r = pipeline.fit(df).transform(df)

df_r.show()

+-------+--------------+----+----+----------+-------------+----------+
|address|          date|name|food|food_index|address_index|name_index|
+-------+--------------+----+----+----------+-------------+----------+
|1111111|20151122045510| Yin| gre|       0.0|          0.0|       0.0|
|1111111|20151122045501| Yin| gre|       0.0|          0.0|       0.0|
|1111111|20151122045500| Yln| gra|       3.0|          0.0|       3.0|
|1111112|20151122065832| Yun| ddd|       1.0|          1.0|       4.0|
|1111113|20160101003221| Yan| fdf|       2.0|          2.0|       2.0|
|1111111|20160703045231| Yin| gre|       0.0|          0.0|       0.0|
|1111114|20150419134543| Yin| fdf|       2.0|          3.0|       0.0|
|1111115|20151123174302| Yen| ddd|       1.0|          4.0|       1.0|
|2111115|      20123192| Yen| gre|       0.0|          5.0|       1.0|
+-------+--------------+----+----+----------+-------------+----------+

