# Kmapper Test Notebook
Выполнил: Игошин Андрей
Группа: 18 МАГ ИАД

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import MinMaxScaler

In [2]:
# One-time environment setup for the sparkmonitor Jupyter extension.
import sparkmonitor
# Install the notebook front-end extension for the current user (as a symlink), then enable it.
!jupyter nbextension install sparkmonitor --py --user --symlink 
!jupyter nbextension enable sparkmonitor --py --user            
# Enable the matching Jupyter server extension.
!jupyter serverextension enable --py --user sparkmonitor
# Create an IPython profile and register the sparkmonitor kernel extension in the default profile.
# NOTE(review): `>>` appends, so every re-run of this cell adds another copy of the same line to
# ipython_kernel_config.py — consider running this setup once, outside the notebook.
!ipython profile create && echo "c.InteractiveShellApp.extensions.append('sparkmonitor.kernelextension')" >>  $(ipython profile locate default)/ipython_kernel_config.py

Installing /anaconda3/lib/python3.6/site-packages/sparkmonitor/static -> sparkmonitor
- Validating: [32mOK[0m

    To initialize this nbextension in the browser every time the notebook (or other app) loads:
    
          jupyter nbextension enable sparkmonitor --user --py
    
Enabling notebook extension sparkmonitor/module...
      - Validating: [32mOK[0m
Enabling: sparkmonitor.serverextension
- Writing config: /Users/a.y.igoshin/.jupyter
    - Validating...
      sparkmonitor.serverextension  [32mOK[0m


In [3]:
from pyspark import SparkConf, SparkContext

# `conf` is never assigned in this notebook. It is presumably injected into the kernel
# namespace by the sparkmonitor kernel extension registered in the setup cell — TODO confirm.
# Fall back to a default SparkConf so the cell also runs on a kernel where the
# extension is not loaded, instead of failing with NameError.
try:
    conf
except NameError:
    conf = SparkConf()

sc = SparkContext.getOrCreate(conf=conf)  # Start the Spark context

In [4]:
# Create (or reuse) the SparkSession used by every DataFrame read below.
spark = SparkSession.builder.appName('MovieRatingsProject').getOrCreate()

In [5]:
# u.info is the dataset summary (user / item / rating counts); it loads as a single column `_c0`.
info = spark.read.csv('ml-100k/u.info',inferSchema=True,sep='\t')
info.show()

+--------------+
|           _c0|
+--------------+
|     943 users|
|    1682 items|
|100000 ratings|
+--------------+



In [6]:
from pyspark.sql.functions import from_unixtime, year

Датасет содержит 100 000 оценок, выставленных 943 пользователями 1682 фильмам. Каждый пользователь оценил не менее 20 фильмов.

In [7]:
# Load the tab-separated ratings file and replace Spark's positional column
# names (_c0.._c3) with meaningful ones, all in a single expression.
ratings = (
    spark.read.csv('ml-100k/u.data', inferSchema=True, sep='\t')
    .withColumnRenamed('_c0', 'user_id')
    .withColumnRenamed('_c1', 'movie_id')
    .withColumnRenamed('_c2', 'rating')
    .withColumnRenamed('_c3', 'timestamp')
)

In [8]:
# Derive the calendar year in which each rating was given from its Unix timestamp.
# NOTE(review): from_unixtime presumably renders in the Spark session time zone, so ratings
# made close to New Year could land in a neighbouring year on another machine — confirm.
ratings = ratings.withColumn('year_rated', year(from_unixtime('timestamp')))

In [9]:
# Preview the ratings with the derived year_rated column.
ratings.show()

+-------+--------+------+---------+----------+
|user_id|movie_id|rating|timestamp|year_rated|
+-------+--------+------+---------+----------+
|    196|     242|     3|881250949|      1997|
|    186|     302|     3|891717742|      1998|
|     22|     377|     1|878887116|      1997|
|    244|      51|     2|880606923|      1997|
|    166|     346|     1|886397596|      1998|
|    298|     474|     4|884182806|      1998|
|    115|     265|     2|881171488|      1997|
|    253|     465|     5|891628467|      1998|
|    305|     451|     3|886324817|      1998|
|      6|      86|     3|883603013|      1998|
|     62|     257|     2|879372434|      1997|
|    286|    1014|     5|879781125|      1997|
|    200|     222|     5|876042340|      1997|
|    210|      40|     3|891035994|      1998|
|    224|      29|     3|888104457|      1998|
|    303|     785|     3|879485318|      1997|
|    122|     387|     5|879270459|      1997|
|    194|     274|     2|879539794|      1997|
|    291|    

Информация о каждом фильме включает в себя название, даты выхода в прокат и появления в магазинах, ссылку на IMDB и жанр. Причем один фильм может быть отнесен к нескольким жанрам. 

In [10]:
# Load the pipe-separated movie catalogue.
# The MovieLens 100k u.item file is distributed in ISO-8859-1 (Latin-1), not UTF-8, so the
# encoding is passed explicitly; with Spark's default (UTF-8) accented characters in movie
# titles are decoded incorrectly. NOTE(review): confirm against your copy of the dataset.
movies = spark.read.csv('ml-100k/u.item', inferSchema=True, sep='|', encoding='ISO-8859-1')

# Column names in file order: 5 descriptive fields followed by 19 genre flags (0/1).
names = [
    'movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
    'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
# Replace Spark's positional names (_c0, _c1, ...) with the names above.
for i, column_name in enumerate(names):
    movies = movies.withColumnRenamed('_c' + str(i), column_name)


In [11]:
from pyspark.sql.functions import to_date

In [12]:
# Parse release_date (formatted like '01-Jan-1995') and keep only the release year.
# NOTE(review): rows with a missing or unparseable release_date presumably get a null
# year_released — confirm how these should be treated downstream.
movies = movies.withColumn('year_released', year(to_date('release_date', 'dd-MMM-yyyy')))

In [13]:
# Preview the movie catalogue with the derived year_released column.
movies.show()

+--------+--------------------+------------+------------------+--------------------+-------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+-------------+
|movie_id|         movie_title|release_date|video_release_date|            IMDb_URL|unknown|Action|Adventure|Animation|Children's|Comedy|Crime|Documentary|Drama|Fantasy|Film-Noir|Horror|Musical|Mystery|Romance|Sci-Fi|Thriller|War|Western|year_released|
+--------+--------------------+------------+------------------+--------------------+-------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+-------------+
|       1|    Toy Story (1995)| 01-Jan-1995|              null|http://us.imdb.co...|      0|     0|        0|        1|         1|     1|    0|          0|    0|      0|        0|     0|      0|      0|      0|     0|       0|  0|      0|   

Информация о каждом пользователе включает в себя возраст, пол, профессию и почтовый индекс.

In [14]:
# Load the pipe-separated user table and give its positional columns real names.
users = spark.read.csv('ml-100k/u.user', inferSchema=True, sep='|')
names = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
for i, column_name in enumerate(names):
    users = users.withColumnRenamed('_c' + str(i), column_name)
users.show()

+-------+---+------+-------------+--------+
|user_id|age|gender|   occupation|zip_code|
+-------+---+------+-------------+--------+
|      1| 24|     M|   technician|   85711|
|      2| 53|     F|        other|   94043|
|      3| 23|     M|       writer|   32067|
|      4| 24|     M|   technician|   43537|
|      5| 33|     F|        other|   15213|
|      6| 42|     M|    executive|   98101|
|      7| 57|     M|administrator|   91344|
|      8| 36|     M|administrator|   05201|
|      9| 29|     M|      student|   01002|
|     10| 53|     M|       lawyer|   90703|
|     11| 39|     F|        other|   30329|
|     12| 28|     F|        other|   06405|
|     13| 47|     M|     educator|   29206|
|     14| 45|     M|    scientist|   55106|
|     15| 49|     F|     educator|   97301|
|     16| 21|     M|entertainment|   10309|
|     17| 30|     M|   programmer|   06355|
|     18| 35|     F|        other|   37212|
|     19| 40|     M|    librarian|   02138|
|     20| 42|     F|    homemake

In [15]:
# Encode each categorical user column in two steps: string -> numeric index -> one-hot vector.
# The same index/encode sequence is applied to both columns, gender first, then occupation,
# so the resulting column order is unchanged.
for source_col, indexed_col, onehot_col in [
    ("gender", "indexed_gender", 'gender_feature'),
    ("occupation", "indexed_occupation", 'occupation_feature'),
]:
    stringIndexer = StringIndexer(inputCol=source_col, outputCol=indexed_col, handleInvalid='error')
    model = stringIndexer.fit(users)
    users = model.transform(users)
    encoder = OneHotEncoder(inputCol=indexed_col, outputCol=onehot_col)
    users = encoder.transform(users)

In [16]:
# Preview the users with the indexed and one-hot encoded gender / occupation columns.
users.show()

+-------+---+------+-------------+--------+--------------+--------------+------------------+------------------+
|user_id|age|gender|   occupation|zip_code|indexed_gender|gender_feature|indexed_occupation|occupation_feature|
+-------+---+------+-------------+--------+--------------+--------------+------------------+------------------+
|      1| 24|     M|   technician|   85711|           0.0| (1,[0],[1.0])|              11.0|   (20,[11],[1.0])|
|      2| 53|     F|        other|   94043|           1.0|     (1,[],[])|               1.0|    (20,[1],[1.0])|
|      3| 23|     M|       writer|   32067|           0.0| (1,[0],[1.0])|               7.0|    (20,[7],[1.0])|
|      4| 24|     M|   technician|   43537|           0.0| (1,[0],[1.0])|              11.0|   (20,[11],[1.0])|
|      5| 33|     F|        other|   15213|           1.0|     (1,[],[])|               1.0|    (20,[1],[1.0])|
|      6| 42|     M|    executive|   98101|           0.0| (1,[0],[1.0])|               8.0|    (20,[8],

In [17]:
# Names of the 19 genre flag columns of the movie table, in file order.
genres = [
    'unknown',
    'Action',
    'Adventure',
    'Animation',
    "Children's",
    'Comedy',
    'Crime',
    'Documentary',
    'Drama',
    'Fantasy',
    'Film-Noir',
    'Horror',
    'Musical',
    'Mystery',
    'Romance',
    'Sci-Fi',
    'Thriller',
    'War',
    'Western',
]

Для получения целостного датасета было принято решение собрать датафреймы с оценками и информацией о фильмах и пользователях в один.

In [18]:
# Combine ratings with user attributes and then with movie attributes into one wide frame,
# dropping the columns that are not used as features.
# NOTE(review): both joins are 'outer', so a user or movie without a match on the other side
# is kept as a row with nulls rather than dropped — confirm this is intended.
joined_df = ratings.join(users, ["user_id"], 'outer').drop('zip_code', 'timestamp')
joined_df = joined_df.join(movies, ["movie_id"], 'outer').drop('movie_title', 'release_date', 'video_release_date', 'IMDb_URL')

In [19]:
from pyspark.sql.functions import when

In [20]:
# Years elapsed between a movie's release and the rating; any value that is not
# strictly positive is replaced by 0 via otherwise(0).
joined_df = joined_df.withColumn('rated_released_year_diff', when(joined_df.year_rated - joined_df.year_released > 0, joined_df.year_rated - joined_df.year_released ).otherwise(0))

In [21]:
# Preview the joined frame including rated_released_year_diff.
joined_df.show()

+--------+-------+------+----------+---+------+----------+--------------+--------------+------------------+------------------+-------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+-------------+------------------------+
|movie_id|user_id|rating|year_rated|age|gender|occupation|indexed_gender|gender_feature|indexed_occupation|occupation_feature|unknown|Action|Adventure|Animation|Children's|Comedy|Crime|Documentary|Drama|Fantasy|Film-Noir|Horror|Musical|Mystery|Romance|Sci-Fi|Thriller|War|Western|year_released|rated_released_year_diff|
+--------+-------+------+----------+---+------+----------+--------------+--------------+------------------+------------------+-------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+-------------+------------------------+
|     148|    251|     2|      1998| 28|

Категориальные признаки (профессия, пол) были закодированы при помощи one-hot encoding.

In [22]:


# Disabled experiment: min-max scaling of `age` and dropping the raw identifier / categorical
# columns. Kept commented out; the cell currently only previews joined_df.
# mmScaler = MinMaxScaler(inputCol="age", outputCol="scaled_age")
# model = mmScaler.fit(joined_df)
# model.transform(joined_df)

# joined_df = joined_df.drop("user_id", "movie_id", "gender", "indexed_gender", "occupation", "indexed_occupation", "age")
# joined_df.show()
joined_df.show()

+--------+-------+------+----------+---+------+----------+--------------+--------------+------------------+------------------+-------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+-------------+------------------------+
|movie_id|user_id|rating|year_rated|age|gender|occupation|indexed_gender|gender_feature|indexed_occupation|occupation_feature|unknown|Action|Adventure|Animation|Children's|Comedy|Crime|Documentary|Drama|Fantasy|Film-Noir|Horror|Musical|Mystery|Romance|Sci-Fi|Thriller|War|Western|year_released|rated_released_year_diff|
+--------+-------+------+----------+---+------+----------+--------------+--------------+------------------+------------------+-------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+-------------+------------------------+
|     148|    251|     2|      1998| 28|

In [23]:
# Per-user average rating for every genre.
# Approach: multiply each 0/1 genre flag by the rating, sum flags and weighted ratings per
# user, then divide the two sums.
avg_ratings_df = joined_df.select(['user_id', 'rating']  + genres)

# One helper column per genre: the rating if the movie belongs to the genre, otherwise 0.
genre_ratings = ['rating {}'.format(genre) for genre in genres]
for genre, weighted_col in zip(genres, genre_ratings):
    avg_ratings_df = avg_ratings_df.withColumn(weighted_col, avg_ratings_df[genre] * avg_ratings_df.rating)

# df1: number of rated movies per genre; df2: sum of ratings per genre (both per user).
df1 = avg_ratings_df.groupBy('user_id').sum(*genres)
df2 = avg_ratings_df.groupBy('user_id').sum(*genre_ratings)

avg_ratings_df = df1.join(df2, ['user_id'])
for genre in genres:
    sum_rating_col = 'sum(rating {})'.format(genre)
    count_col = 'sum({})'.format(genre)
    genre_mean = avg_ratings_df[sum_rating_col] / avg_ratings_df[count_col]
    avg_ratings_df = avg_ratings_df.withColumn('avg({})'.format(genre), genre_mean)
    avg_ratings_df = avg_ratings_df.drop(sum_rating_col, count_col)

# Missing averages are replaced by 0 (presumably users with no rated movie in a genre).
avg_ratings_df = avg_ratings_df.fillna(value=0)
avg_ratings_df.show()
# Export a copy to the user's home directory.
avg_ratings_df.toPandas().to_csv('~/avg_ratings.csv')

+-------+------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------+
|user_id|avg(unknown)|       avg(Action)|    avg(Adventure)|    avg(Animation)|   avg(Children's)|       avg(Comedy)|        avg(Crime)|  avg(Documentary)|        avg(Drama)|avg(Fantasy)|   avg(Film-Noir)|       avg(Horror)|      avg(Musical)|      avg(Mystery)|      avg(Romance)|       avg(Sci-Fi)|     avg(Thriller)|          avg(War)|avg(Western)|
+-------+------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------+-----------------+------------------+------------------+------------------+------------------+---------------

In [24]:
# Mean rating per movie, computed from the raw ratings table.
average_movie_rating = (
    ratings
    .select('movie_id', 'rating')
    .groupBy('movie_id')
    .avg('rating')
)

# Export a copy to the user's home directory, then preview.
average_movie_rating.toPandas().to_csv('~/average_movie_rating.csv')
average_movie_rating.show()

+--------+------------------+
|movie_id|       avg(rating)|
+--------+------------------+
|     496| 4.121212121212121|
|     471|3.6108597285067874|
|     463| 3.859154929577465|
|     148|          3.203125|
|    1342|               2.5|
|     833| 3.204081632653061|
|    1088| 2.230769230769231|
|    1591|3.1666666666666665|
|    1238|             3.125|
|    1580|               1.0|
|    1645|               4.0|
|     392|3.5441176470588234|
|     623| 2.923076923076923|
|     540| 2.511627906976744|
|     858|               1.0|
|     737| 2.983050847457627|
|     243|2.4393939393939394|
|    1025|2.9318181818181817|
|    1084| 3.857142857142857|
|    1127| 2.909090909090909|
+--------+------------------+
only showing top 20 rows



In [25]:
# Popularity of a movie = number of rows it has in the joined frame.
# NOTE(review): joined_df comes from outer joins, so a movie without any rating would
# still contribute one row here — confirm whether counting on `ratings` was intended.
popularity = (
    joined_df
    .select('movie_id')
    .groupBy('movie_id')
    .count()
    .withColumnRenamed('count', 'popularity')
    .fillna(0)
)

# Export a copy to the user's home directory, then preview.
popularity.toPandas().to_csv('~/movie_popularity.csv')
popularity.show()

+--------+----------+
|movie_id|popularity|
+--------+----------+
|     148|       128|
|     463|        71|
|     471|       221|
|     496|       231|
|     833|        49|
|    1088|        13|
|    1238|         8|
|    1342|         2|
|    1580|         1|
|    1591|         6|
|    1645|         1|
|     243|       132|
|     392|        68|
|     540|        43|
|     623|        39|
|     737|        59|
|     858|         3|
|     897|         2|
|    1025|        44|
|    1084|        21|
+--------+----------+
only showing top 20 rows



In [26]:
# Attach each movie's mean rating (column 'avg(rating)') to every row.
# NOTE(review): this reassigns joined_df in place, so re-running the cell on the same
# kernel joins the column a second time — run the notebook top to bottom.
joined_df = joined_df.join(average_movie_rating, 'movie_id')

In [27]:
# Attach each movie's popularity (number of rows counted per movie_id) to every row.
joined_df = joined_df.join(popularity, ['movie_id'])

In [28]:
# Attach the per-user genre averages, then drop the raw categorical columns (their encoded
# versions gender_feature / occupation_feature stay) and `age` to obtain the modelling frame.
joined_df = joined_df.join(avg_ratings_df, ['user_id'])
final_df = joined_df.drop("gender", "indexed_gender", "occupation", "indexed_occupation", "age")

In [29]:
# joined_df.popularity

In [30]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import NaiveBayes

In [31]:
# Target column, name of the assembled feature vector, and the list of feature columns:
# everything in final_df except the target and the two identifier columns.
answer_col = 'rating'
features_col = 'features'
excluded_cols = (answer_col, "user_id", "movie_id")
input_cols = [name for name in final_df.columns if name not in excluded_cols]
assembler = VectorAssembler(inputCols=input_cols, outputCol=features_col)

print(input_cols)

['year_rated', 'gender_feature', 'occupation_feature', 'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western', 'year_released', 'rated_released_year_diff', 'avg(rating)', 'popularity', 'avg(unknown)', 'avg(Action)', 'avg(Adventure)', 'avg(Animation)', "avg(Children's)", 'avg(Comedy)', 'avg(Crime)', 'avg(Documentary)', 'avg(Drama)', 'avg(Fantasy)', 'avg(Film-Noir)', 'avg(Horror)', 'avg(Musical)', 'avg(Mystery)', 'avg(Romance)', 'avg(Sci-Fi)', 'avg(Thriller)', 'avg(War)', 'avg(Western)']


In [32]:
# Assemble all feature columns into one vector column and keep only target, features and ids.
# handleInvalid="skip" makes the assembler drop rows it considers invalid.
# NOTE(review): the later shape printout reports 99991 rows versus 100000 ratings —
# presumably the difference is rows skipped here because of null feature values; confirm.
data = assembler.setHandleInvalid("skip").transform(final_df)
data = data.select(answer_col,features_col, 'user_id', 'movie_id')
data.show(truncate=False)

+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+--------+
|rating|features                                                                                                                                                                                                                                                                                                                        |user_id|movie_id|
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [41]:
import numpy as np

In [126]:
# Pull the assembled feature vectors to the driver as a NumPy array for KeplerMapper.
# collect() materialises every row in local memory.
# kmapper_data = np.array(final_df.collect())
kmapper_data = np.array(data.select(features_col).collect())

In [127]:
print(kmapper_data[0])
# Flatten the collected rows to (n_rows, n_features). The feature count is read from the
# array's last axis instead of a hard-coded 64, so the cell keeps working when the feature
# list changes and is safe to re-run on an already flattened array.
kmapper_data = kmapper_data.reshape(-1, kmapper_data.shape[-1])
print(np.shape(kmapper_data))

[[1.99700000e+03 1.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 1.94600000e+03 5.10000000e+01 4.12121212e+00
  2.31000000e+02 0.00000000e+00 3.91666667e+00 3.93333333e+00
  4.42857143e+00 3.80000000e+00 4.45000000e+00 3.66666667e+00
  0.00000000e+00 3.68000000e+00 0.00000000e+00 5.00000000e+00
  2.00000000e+00 4.45454545e+00 3.00000000e+00 4.33333333e+00
  4.46153846e+00 3.60000000e+00 4.10000000e+00 2.00000000e+00]]
(99991

In [128]:
# Movie title for every feature row, used as node tooltips in the visualisation.
# NOTE(review): titles are collected through a separate join + collect(), while kmapper_data
# came from its own collect(); Spark does not guarantee both return rows in the same order,
# so tooltip i may not describe feature row i. Consider collecting features and titles together.
custom_tooltips = np.array(data.join(movies, ["movie_id"], 'inner').select('movie_title').collect()).reshape(-1)

In [129]:
# Sanity check: the number of tooltips should equal the number of rows in kmapper_data.
print(np.shape(custom_tooltips))

(99991,)


In [130]:
# Import the class
import umap
import kmapper as km
from kmapper import jupyter

# Some sample data
import sklearn
from sklearn import datasets

In [185]:
def visualize(data, custom_tooltips, items_count=100, filename="movies.html", title="MOVIES", show=True,
              eps=0.3, min_samples=15, n_cubes=35, perc_overlap=0.9):
    """Build a KeplerMapper graph for the first `items_count` rows and save it as HTML.

    The rows are projected to 2 dimensions with UMAP (fixed random_state, so the
    projection is repeatable), the projection is handed to `mapper.map` together with
    a DBSCAN clusterer and a cover, and the resulting graph is written to `filename`.

    Parameters
    ----------
    data : indexable by slice (e.g. a 2-D NumPy array); only `data[:items_count]` is used.
    custom_tooltips : sequence aligned with `data`; only the first `items_count` are used.
    items_count : number of leading rows to visualise.
    filename : path of the HTML file to write.
    title : title shown in the generated page.
    show : when true, display the generated HTML inline in the notebook.
    eps, min_samples : DBSCAN parameters (defaults keep the previous hard-coded values).
    n_cubes, perc_overlap : cover parameters (defaults keep the previous hard-coded values).

    Raises
    ------
    Exception
        Raised by KeplerMapper's visualize when the graph has no nodes; this was observed
        in this notebook for items_count=50 with the default `eps` / `min_samples`.
        Lower `min_samples` or raise `eps` for small samples.
    """
    # Imported here so the function does not depend on some other package having loaded
    # the `sklearn.cluster` submodule as a side effect (`import sklearn` alone does not).
    from sklearn.cluster import DBSCAN

    subset = data[:items_count]
    tooltips = custom_tooltips[:items_count]

    # Initialize
    mapper = km.KeplerMapper(verbose=1)

    # Fit to and transform the data
    projection = umap.UMAP(n_neighbors=8,
                           min_dist=0.65,
                           n_components=2,
                           metric='euclidean',
                           random_state=3571)
    projected_data = mapper.fit_transform(subset, projection=projection)

    # Create the graph (nodes, edges and meta-information)
    graph = mapper.map(projected_data,
                       clusterer=DBSCAN(eps=eps, min_samples=min_samples),
                       cover=km.Cover(n_cubes, perc_overlap),
                       remove_duplicate_nodes=True)

    # Write the interactive visualisation to disk
    mapper.visualize(graph, path_html=filename, title=title, custom_tooltips=tooltips)

    # Inline display
    if show:
        jupyter.display(path_html=filename)

In [148]:
# First 50 rows. NOTE(review): the recorded output shows this call failing with
# "Visualize requires a mapper with more than 0 nodes" — the sample is presumably too small
# for the DBSCAN settings used inside visualize(); drop the cell or relax those settings.
visualize(kmapper_data, custom_tooltips, items_count=50, filename="movies_50.html")

KeplerMapper()
..Composing projection pipeline of length 1:
	Projections: UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
   learning_rate=1.0, local_connectivity=1.0, metric='euclidean',
   metric_kwds=None, min_dist=0.65, n_components=2, n_epochs=None,
   n_neighbors=8, negative_sample_rate=5, random_state=3571,
   repulsion_strength=1.0, set_op_mix_ratio=1.0, spread=1.0,
   target_metric='categorical', target_metric_kwds=None,
   target_n_neighbors=-1, target_weight=0.5, transform_queue_size=4.0,
   transform_seed=42, verbose=False)
	Distance matrices: False
	Scalers: MinMaxScaler(copy=True, feature_range=(0, 1))
..Projecting on data shaped (50, 64)

..Projecting data using: 
	UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
   learning_rate=1.0, local_connectivity=1.0, metric='euclidean',
   metric_kwds=None, min_dist=0.65, n_components=2, n_epochs=None,
   n_neighbors=8, negative_sample_rate=5, random_state=3571,
   repulsion_strength=1.0, set_op_mix_r

Exception: Visualize requires a mapper with more than 0 nodes. 
It is possible that the constructed mapper could have been constructed with bad parameters. This can occasionally happens when using the default clustering algorithm. Try changing `eps` or `min_samples` in the DBSCAN clustering algorithm.

In [149]:
# Same visualisation for the first 70 rows, displayed inline.
visualize(kmapper_data, custom_tooltips, items_count=70, filename="movies_70.html")

KeplerMapper()
..Composing projection pipeline of length 1:
	Projections: UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
   learning_rate=1.0, local_connectivity=1.0, metric='euclidean',
   metric_kwds=None, min_dist=0.65, n_components=2, n_epochs=None,
   n_neighbors=8, negative_sample_rate=5, random_state=3571,
   repulsion_strength=1.0, set_op_mix_ratio=1.0, spread=1.0,
   target_metric='categorical', target_metric_kwds=None,
   target_n_neighbors=-1, target_weight=0.5, transform_queue_size=4.0,
   transform_seed=42, verbose=False)
	Distance matrices: False
	Scalers: MinMaxScaler(copy=True, feature_range=(0, 1))
..Projecting on data shaped (70, 64)

..Projecting data using: 
	UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
   learning_rate=1.0, local_connectivity=1.0, metric='euclidean',
   metric_kwds=None, min_dist=0.65, n_components=2, n_epochs=None,
   n_neighbors=8, negative_sample_rate=5, random_state=3571,
   repulsion_strength=1.0, set_op_mix_r

In [186]:
# First 100 rows; the HTML file is written but not displayed inline (show=False).
visualize(kmapper_data, custom_tooltips, items_count=100, filename="movies_100.html", show=False)

KeplerMapper()
..Composing projection pipeline of length 1:
	Projections: UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
   learning_rate=1.0, local_connectivity=1.0, metric='euclidean',
   metric_kwds=None, min_dist=0.65, n_components=2, n_epochs=None,
   n_neighbors=8, negative_sample_rate=5, random_state=3571,
   repulsion_strength=1.0, set_op_mix_ratio=1.0, spread=1.0,
   target_metric='categorical', target_metric_kwds=None,
   target_n_neighbors=-1, target_weight=0.5, transform_queue_size=4.0,
   transform_seed=42, verbose=False)
	Distance matrices: False
	Scalers: MinMaxScaler(copy=True, feature_range=(0, 1))
..Projecting on data shaped (100, 64)

..Projecting data using: 
	UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
   learning_rate=1.0, local_connectivity=1.0, metric='euclidean',
   metric_kwds=None, min_dist=0.65, n_components=2, n_epochs=None,
   n_neighbors=8, negative_sample_rate=5, random_state=3571,
   repulsion_strength=1.0, set_op_mix_

In [184]:
# First 300 rows; the HTML file is written but not displayed inline (show=False).
# NOTE(review): the recorded output shows UMAP with n_components=3 while visualize() as
# defined in this notebook uses n_components=2 — the output presumably comes from an earlier
# version of the function; re-run the notebook top to bottom to refresh it.
visualize(kmapper_data, custom_tooltips, items_count=300, filename="movies_300.html", show=False)

KeplerMapper()
..Composing projection pipeline of length 1:
	Projections: UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
   learning_rate=1.0, local_connectivity=1.0, metric='euclidean',
   metric_kwds=None, min_dist=0.65, n_components=3, n_epochs=None,
   n_neighbors=8, negative_sample_rate=5, random_state=3571,
   repulsion_strength=1.0, set_op_mix_ratio=1.0, spread=1.0,
   target_metric='categorical', target_metric_kwds=None,
   target_n_neighbors=-1, target_weight=0.5, transform_queue_size=4.0,
   transform_seed=42, verbose=False)
	Distance matrices: False
	Scalers: MinMaxScaler(copy=True, feature_range=(0, 1))
..Projecting on data shaped (300, 64)

..Projecting data using: 
	UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
   learning_rate=1.0, local_connectivity=1.0, metric='euclidean',
   metric_kwds=None, min_dist=0.65, n_components=3, n_epochs=None,
   n_neighbors=8, negative_sample_rate=5, random_state=3571,
   repulsion_strength=1.0, set_op_mix_

In [173]:
# Per-movie feature matrix (genre flags, release year, mean rating, popularity) and tooltips.
# Both arrays are built from ONE collect() so that row i of `kmpapper_movies` and
# `kmpapper_movies_tooltips[i]` always describe the same movie. Collecting the titles
# separately from `movies` is unsafe: the row order produced by the two outer joins is
# not guaranteed to match the row order of `movies`.
movie_features_df = (
    movies
    .join(average_movie_rating, ['movie_id'], 'outer')
    .join(popularity, ['movie_id'], 'outer')
    .drop('movie_id', 'release_date', 'video_release_date', 'IMDb_URL')
)
title_position = movie_features_df.columns.index('movie_title')
movie_rows = [tuple(row) for row in movie_features_df.collect()]

# Split every row into its title and the remaining feature values (original column order kept).
kmpapper_movies = np.array([row[:title_position] + row[title_position + 1:] for row in movie_rows])
kmpapper_movies_tooltips = np.array([row[title_position] for row in movie_rows])

In [183]:
# Visualise the first 100 rows of the per-movie feature matrix; HTML is written, not displayed.
visualize(kmpapper_movies, kmpapper_movies_tooltips, items_count=100, filename="movies_100_1.html", show=False)


KeplerMapper()
..Composing projection pipeline of length 1:
	Projections: UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
   learning_rate=1.0, local_connectivity=1.0, metric='euclidean',
   metric_kwds=None, min_dist=0.65, n_components=3, n_epochs=None,
   n_neighbors=8, negative_sample_rate=5, random_state=3571,
   repulsion_strength=1.0, set_op_mix_ratio=1.0, spread=1.0,
   target_metric='categorical', target_metric_kwds=None,
   target_n_neighbors=-1, target_weight=0.5, transform_queue_size=4.0,
   transform_seed=42, verbose=False)
	Distance matrices: False
	Scalers: MinMaxScaler(copy=True, feature_range=(0, 1))
..Projecting on data shaped (100, 22)

..Projecting data using: 
	UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
   learning_rate=1.0, local_connectivity=1.0, metric='euclidean',
   metric_kwds=None, min_dist=0.65, n_components=3, n_epochs=None,
   n_neighbors=8, negative_sample_rate=5, random_state=3571,
   repulsion_strength=1.0, set_op_mix_

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1994,
       3.859154929577465, 71], dtype=object)