In [1]:
# findspark locates a local Spark installation and makes pyspark importable;
# init() must therefore run before the `pyspark` import in a later cell.
import findspark
findspark.init()

In [2]:
# List the CSV files available in the moviedata-lite dataset directory.
! ls ~/Sync/datasets/moviedata/moviedata-lite

comments.csv  movies.csv  person.csv  ratings.csv  users.csv


In [3]:
from pyspark.sql import SparkSession

In [4]:
# Configure the application name first, then obtain the session
# (getOrCreate() reuses an already-running session if one exists).
session_builder = SparkSession.builder.appName("StringIndexer Example")
spark = session_builder.getOrCreate()

In [5]:
# Resolve the ratings file relative to the current user's home directory
# instead of hard-coding one machine's absolute path. The "~" is expanded in
# Python because the path string is handed to Spark as-is.
import os

ratings_csv = os.path.expanduser(
    "~/Sync/datasets/moviedata/moviedata-lite/ratings.csv"
)

# header=True takes column names from the first row. No schema is supplied,
# so every column is loaded as a string.
df = spark.read.csv(ratings_csv, header=True)

In [6]:
# Inspect column names and types. The CSV was read without a schema, so all
# five columns (including RATING) are reported as strings.
df.printSchema()

root
 |-- RATING_ID: string (nullable = true)
 |-- USER_MD5: string (nullable = true)
 |-- MOVIE_ID: string (nullable = true)
 |-- RATING: string (nullable = true)
 |-- RATING_TIME: string (nullable = true)



In [7]:
# Preview the first five rows of the raw ratings data.
df.show(5)

+----------+--------------------+--------+------+-------------------+
| RATING_ID|            USER_MD5|MOVIE_ID|RATING|        RATING_TIME|
+----------+--------------------+--------+------+-------------------+
|1359352573|0ab7e3efacd56983f...| 5113101|     2|2018-09-05 19:42:07|
|1598245094|84dfd3f91dd85ea10...| 5113101|     1|2019-07-09 14:52:07|
| 311937819|c9a47fd59b55967ce...| 3718526|     3|2010-11-05 22:15:44|
| 457663846|18cbf971bdf173360...| 3718526|     4|2011-11-14 22:31:02|
| 313277849|47e69de0d68e6a4db...| 3718526|     4|2010-11-09 12:41:11|
+----------+--------------------+--------+------+-------------------+
only showing top 5 rows



In [8]:
from pyspark.ml.feature import StringIndexer

In [9]:
# Encode each distinct USER_MD5 hash as a numeric index in a new USER_ID column.
stringIndexer = (
    StringIndexer()
    .setInputCol("USER_MD5")
    .setOutputCol("USER_ID")
)

# Fitting learns the hash -> index mapping; the fitted model is kept so the
# mapping can be reused later.
model = stringIndexer.fit(df)

# Append the USER_ID column and preview the result.
indexed = model.transform(df)
indexed.show(5)

+----------+--------------------+--------+------+-------------------+-------+
| RATING_ID|            USER_MD5|MOVIE_ID|RATING|        RATING_TIME|USER_ID|
+----------+--------------------+--------+------+-------------------+-------+
|1359352573|0ab7e3efacd56983f...| 5113101|     2|2018-09-05 19:42:07|   74.0|
|1598245094|84dfd3f91dd85ea10...| 5113101|     1|2019-07-09 14:52:07|  949.0|
| 311937819|c9a47fd59b55967ce...| 3718526|     3|2010-11-05 22:15:44|  498.0|
| 457663846|18cbf971bdf173360...| 3718526|     4|2011-11-14 22:31:02|  866.0|
| 313277849|47e69de0d68e6a4db...| 3718526|     4|2010-11-09 12:41:11|  426.0|
+----------+--------------------+--------+------+-------------------+-------+
only showing top 5 rows



In [10]:
# Keep only the (user index, movie, rating) triple; the raw hash, rating id
# and timestamp columns are dropped from this view.
kept_columns = ["USER_ID", "MOVIE_ID", "RATING"]
indexed_1 = indexed.select(*kept_columns)

In [11]:
from pyspark.ml.feature import IndexToString

In [12]:
# Map the numeric USER_ID indices back to the original USER_MD5 strings.
#
# Without `labels`, IndexToString reads the label list from the ML metadata
# attached to the input column, which only works while that metadata survives
# every intermediate transformation. Passing the fitted StringIndexer model's
# labels explicitly makes the inverse mapping independent of column metadata.
converter = IndexToString(
    inputCol="USER_ID",
    outputCol="USER_MD5_ORIGINAL",
    labels=model.labels,
)
converted = converter.transform(indexed_1)
converted.show(5)

+-------+--------+------+--------------------+
|USER_ID|MOVIE_ID|RATING|   USER_MD5_ORIGINAL|
+-------+--------+------+--------------------+
|   74.0| 5113101|     2|0ab7e3efacd56983f...|
|  949.0| 5113101|     1|84dfd3f91dd85ea10...|
|  498.0| 3718526|     3|c9a47fd59b55967ce...|
|  866.0| 3718526|     4|18cbf971bdf173360...|
|  426.0| 3718526|     4|47e69de0d68e6a4db...|
+-------+--------+------+--------------------+
only showing top 5 rows



In [13]:
# Present the recovered hashes under the original USER_MD5 column name,
# alongside the movie id and rating.
restored = converted.select(
    converted["USER_MD5_ORIGINAL"].alias("USER_MD5"),
    "MOVIE_ID",
    "RATING",
)
restored.show(5)

+--------------------+--------+------+
|            USER_MD5|MOVIE_ID|RATING|
+--------------------+--------+------+
|0ab7e3efacd56983f...| 5113101|     2|
|84dfd3f91dd85ea10...| 5113101|     1|
|c9a47fd59b55967ce...| 3718526|     3|
|18cbf971bdf173360...| 3718526|     4|
|47e69de0d68e6a4db...| 3718526|     4|
+--------------------+--------+------+
only showing top 5 rows



In [14]:
# Shut down the Spark session now that the example is complete.
spark.stop()