In [31]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType
# NOTE(review): `change` is not referenced by any other cell visible in this
# notebook — confirm it is still needed.
change = 2

In [32]:
# Create (or reuse) a Spark session running locally with two worker threads.
spark = (
    SparkSession.builder
    .master('local[2]')
    .appName('pyspark-training-spark-ml')
    .getOrCreate()
)

In [33]:
# Column names applied to the movie file, in order: five descriptive fields
# followed by nineteen per-category columns.
columns = (
    ["ID", "Title", "ReleaseDate", "VideoReleaseDate", "IMDB"]
    + [
        "Unknown", "Action", "Adventure", "Animation", "Children",
        "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
        "FilmNoir", "Horror", "Musical", "Mystery", "Romance",
        "SciFi", "Thriller", "War", "Western",
    ]
)

In [34]:
# Explicit schema: every column is read as a nullable string.
schema = StructType(
    [
        StructField(name=field_name, dataType=StringType(), nullable=True)
        for field_name in columns
    ]
)

In [55]:
# Location of the pipe-delimited movie data. Defaults to the path this notebook
# was written against; set the MOVIES_DATA_PATH environment variable to run it
# on a machine where the data lives elsewhere.
movies_path = os.environ.get('MOVIES_DATA_PATH', 'D:\\data\\movies')

# Read with the all-string schema, drop the columns that the later cells do not
# use, then remove rows that are identical across the remaining columns.
df = (
    spark.read
    .schema(schema)
    .option('delimiter', '|')
    .csv(movies_path)
    .drop('IMDB', 'VideoReleaseDate', 'ReleaseDate', 'ID')
    .dropDuplicates()
)

In [53]:
# Number of rows in the loaded frame.
# NOTE(review): the recorded output comes from execution In [53], which is
# earlier than the cell that currently defines `df` (In [55]) — re-run to
# confirm the figure still holds.
df.count()

1682

In [38]:
# Print summary statistics for every remaining column.
# NOTE(review): recorded at In [38], before the current definition of `df`
# (In [55]) — the output below may be stale.
df.summary().show()

+-------+--------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+-------------------+--------------------+-------------------+--------------------+--------------------+--------------------+-------------------+--------------------+------------------+-------------------+-------------------+-------------------+-------------------+
|summary|               Title|             Unknown|             Action|          Adventure|           Animation|           Children|             Comedy|              Crime|         Documentary|              Drama|             Fantasy|            FilmNoir|              Horror|            Musical|             Mystery|           Romance|              SciFi|           Thriller|                War|            Western|
+-------+--------------------+--------------------+-------------------+-------------------+--------------------+-------------------+-------------------+--------------

### Analyze data

In [56]:
# Print each column name next to its number of distinct values
# (one Spark job per column).
for column_name in df.columns:
    distinct_values = df.select(column_name).distinct()
    print(column_name, distinct_values.count())

Title 1664
Unknown 2
Action 2
Adventure 2
Animation 2
Children 2
Comedy 2
Crime 2
Documentary 2
Drama 2
Fantasy 2
FilmNoir 2
Horror 2
Musical 2
Mystery 2
Romance 2
SciFi 2
Thriller 2
War 2
Western 2


### Feature engineering

In [41]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#### Indexing the categorical columns and assembling all the features using a vector assembler

In [57]:
# Every column except the title is treated as a categorical feature.
categorical_columns = [name for name in df.columns if name != 'Title']

In [58]:
# One StringIndexer per categorical column, writing to '<column>_indexed'.
#
# stringOrderType='alphabetAsc' makes the label -> index mapping depend only on
# the label text. The default ('frequencyDesc') assigns index 0 to whichever
# label is most frequent, so the numeric encoding of a two-valued column would
# flip whenever its class balance flips.
# NOTE(review): assumes the two values per column are the strings "0" and "1",
# which then map to 0.0 and 1.0 — confirm against the raw data.
indexers = [
    StringIndexer(
        inputCol=c,
        outputCol='{0}_indexed'.format(c),
        stringOrderType='alphabetAsc',
    )
    for c in categorical_columns
]

# Stage list for the pipeline, seeded with a copy of the indexers.
stages = list(indexers)

In [62]:
# Combine the indexed columns, in indexer order, into one "features" vector.
assembler_input_cols = [indexer.getOutputCol() for indexer in indexers]
assembler = VectorAssembler(inputCols=assembler_input_cols, outputCol="features")

# Rebuild the stage list rather than appending in place: with
# `stages += [assembler]`, every re-run of this cell added another assembler,
# and a second stage writing to "features" makes the pipeline fail because the
# output column already exists.
stages = indexers + [assembler]

#### creating a pipeline to perform all the stages

In [63]:
# Chain the indexers and the assembler, fit them on the full frame, and
# apply the fitted pipeline to that same frame.
pipeline = Pipeline(stages=stages)
assembled = pipeline.fit(dataset=df)
df_assembled = assembled.transform(dataset=df)

In [64]:
# Keep only the title and the assembled feature vector.
df_vectorized = df_assembled.select(['Title', 'features'])

In [65]:
# Seeded random split with 0.7 / 0.3 weights for train / test.
df_train, df_test = df_vectorized.randomSplit(
    weights=[0.7, 0.3],
    seed=42,
)

#### K-means clustering

In [66]:
from pyspark.ml.clustering import KMeans

In [74]:
# K-means estimator: 10 clusters over the "features" column, fixed seed.
kmeans = (
    KMeans()
    .setFeaturesCol("features")
    .setK(10)
    .setSeed(1)
)

In [75]:
# Fit the K-means estimator on the training split only.
model = kmeans.fit(df_train)

In [81]:
# Assign each training row to a cluster (adds the model's prediction column).
df_train_pred = model.transform(df_train)

### Evaluate model

In [79]:
from pyspark.ml.evaluation import ClusteringEvaluator

In [82]:
# Silhouette evaluator; the arguments below spell out the library defaults.
evaluator = ClusteringEvaluator(
    predictionCol="prediction",
    featuresCol="features",
    metricName="silhouette",
)

In [83]:
# Score the cluster assignments on the training split.
silhouette = evaluator.evaluate(df_train_pred)

In [84]:
# Display the training-split score.
silhouette

0.40649236618016893

In [85]:
# Assign clusters to the held-out test split using the model fitted on train.
kmeans_pred = model.transform(df_test)

In [86]:
# Preview the test-split predictions (title, feature vector, cluster id).
kmeans_pred.show()

+--------------------+--------------------+----------+
|               Title|            features|prediction|
+--------------------+--------------------+----------+
|101 Dalmatians (1...|(19,[4,5],[1.0,1.0])|         1|
|20,000 Leagues Un...|(19,[2,4,9,15],[1...|         5|
|3 Ninjas: High No...|(19,[1,4],[1.0,1.0])|         9|
|39 Steps, The (1935)|     (19,[16],[1.0])|         3|
|A Chef in Love (1...|      (19,[5],[1.0])|         1|
|Above the Rim (1994)|      (19,[8],[1.0])|         0|
|Absolute Power (1...|(19,[13,16],[1.0,...|         3|
|Across the Sea of...|      (19,[7],[1.0])|         0|
|Addicted to Love ...|(19,[5,14],[1.0,1...|         4|
|Adventures of Pin...|(19,[2,4],[1.0,1.0])|         5|
|Adventures of Pri...|(19,[5,8],[1.0,1.0])|         0|
|    Afterglow (1997)|(19,[8,14],[1.0,1...|         6|
|Age of Innocence,...|      (19,[8],[1.0])|         0|
|Aiqing wansui (1994)|      (19,[8],[1.0])|         0|
|Air Force One (1997)|(19,[1,16],[1.0,1...|         9|
|     Airh

In [87]:
# Score the cluster assignments on the test split, for comparison with the
# training-split score.
evaluator.evaluate(kmeans_pred)

0.3862688319253551