# BDCC project - Lucia and Nirbhaya

**[Big Data and Cloud Computing](https://www.dcc.fc.up.pt/~edrdo/aulas/bdcc/projects/), Project 1**


## Spark setup

In [0]:
def setupSpark():
  """Install and configure a local Spark runtime for this notebook session.

  Shells out to install findspark, a headless OpenJDK 8 and pyspark, points
  JAVA_HOME at the Java 8 JVM, initialises findspark with an explicit
  spark_home and finally prints the installed pyspark version.
  """
  # Spark needs to run with Java 8 ... 
  !pip install -q findspark
  !apt-get install openjdk-8-jdk-headless > /dev/null
  # Feeds "2" to the interactive update-alternatives menu; presumably entry 2
  # is the Java 8 JVM on the target image -- TODO confirm.
  !echo 2 | update-alternatives --config java > /dev/null
  # !java -version
  import os, findspark
  os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-8-openjdk-amd64'
  # !echo JAVA_HOME=$JAVA_HOME
  !pip install -q pyspark
  # NOTE(review): spark_home is hard-coded to a Python 3.6 dist-packages path;
  # it has to match the interpreter that pip installed pyspark into.
  findspark.init(spark_home='/usr/local/lib/python3.6/dist-packages/pyspark')
  !pyspark --version

# Run the setup once, before pyspark is imported below.
setupSpark()

from pyspark import SparkContext
from pyspark.sql import SparkSession
    
# Create (or reuse) a Spark session with the 'local[*]' master and keep a
# handle on its SparkContext.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)
sc = spark.sparkContext

## Parameters

In [0]:

#@markdown ---
# Verbose switch: readParquet/writeParquet print extra information when set.
DEBUG = True #@param {type: "boolean"} 
# GCP project used for authentication and as the BigQuery project.
PROJECT_ID = 'propane-primacy-268509'  #@param {type: "string"}
# Bucket and dataset directory the input Parquet files are copied from.
INPUT_BUCKET = 'bdcc1920_project_datasets' #@param {type: "string"}
DATASET = 'medium1' #@param ["tiny1", "tiny2", "tiny3", "tiny4", "medium1", "medium2", "medium3", "medium4", "large1", "large2", "large3", "large4", "large5"] {allow-input: true}
# Bucket and archive name used for the generated results.
OUTPUT_BUCKET = 'up199502863_bdcc1920_project_outputs' #@param {type: "string"}
OUTPUT_ZIP_FILE = 'output.zip' #@param {type: "string"}
COPY_PARQUET_FILES_TO_OUTPUT_BUCKET = True #@param {type: "boolean"} 
# Rows whose tf_idf is below this threshold are dropped from the TF-IDF output.
MIN_TF_IDF = 0.1 #@param {type:"slider", min:0, max:1, step:0.05}
# Pub/Sub settings -- presumably for notifying the loader function; verify usage.
SEND_PUBSUB_MESSAGE = True #@param {type: "boolean"} 
PUBSUB_TOPIC = 'new_output' #@param {type: "string"}
# Function name passed to `gcloud functions logs read`.
cloudFunctionName = 'LCF' #@param {type: "string"}
#@markdown ---


## Authenticate to GCP

In [0]:
# The authentication method 
def google_colab_authenticate(projectId, keyFile=None, debug=True):  
    import os
    from google.colab import auth
    if keyFile == None:
      keyFile='/content/bdcc-colab.json'
    if os.access(keyFile,os.R_OK):
      if debug:
        print('Using key file "%s"' % keyFile)
      os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '%s' % keyFile
      os.environ['GCP_PROJECT'] = projectId 
      os.environ['GCP_ACCOUNT'] = 'bdcc-colab@' + projectId + '.iam.gserviceaccount.com'
      !gcloud auth activate-service-account "$GCP_ACCOUNT" --key-file="$GOOGLE_APPLICATION_CREDENTIALS" --project="$GCP_PROJECT"
    else:
      if debug:
        print('No key file given. You may be redirected to the verification code procedure.')
      auth.authenticate_user()
      !gcloud config set project $projectId
    !gcloud info | grep -e Account -e Project

# If the service-account key file exists under "/content/drive/My Drive",
# copy it to /content/bdcc-colab.json, the default key path that
# google_colab_authenticate() looks for. The destination contains no spaces,
# which avoids shell quoting problems later on.
!test -f "/content/drive/My Drive/bdcc-colab.json" && cp "/content/drive/My Drive/bdcc-colab.json" /content/bdcc-colab.json

# Authenticate against the project selected in the Parameters cell.
google_colab_authenticate(PROJECT_ID)

## Transfer dataset files (if necessary) from GCS

In [0]:
# Download the dataset directory from the input bucket unless a local
# directory with that name already exists, then report its disk usage.
!test -d $DATASET || gsutil -m cp -r gs://"$INPUT_BUCKET"/"$DATASET" .
!du --human $DATASET

## Parquet file read/write methods


In [0]:
def readParquet(file):
    """Load the Parquet data at `file` into a Spark DataFrame and return it.

    When the notebook-level DEBUG flag is set, the path is announced before
    reading and the schema plus the first 10 rows are printed afterwards.
    """
    verbose = DEBUG
    if verbose:
        print('==> Reading ' + file)
    loaded = spark.read.parquet(file)
    if verbose:
        loaded.printSchema()
        loaded.show(10)
    return loaded
 
def writeParquet(df,path):
    """Write `df` as Parquet to `path`, replacing whatever is already there.

    df   -- Spark DataFrame to persist.
    path -- destination path of the Parquet output.

    Replacement of existing output is delegated to Spark's 'overwrite' save
    mode. The former `!rm -fr $path` shell call was redundant with it and
    unsafe, because the path was interpolated unquoted into a shell command.
    """
    if DEBUG:
        print('==> Writing ' + path)
    df.write.parquet(path, mode='overwrite')


## Load input dataset from Parquet files

In [0]:
# Read the five input tables of the selected dataset, in a fixed order, and
# bind each resulting DataFrame to its own notebook-level name.
_INPUT_PARQUET_SUFFIXES = (
    '/movies.parquet',
    '/actors.parquet',
    '/genres.parquet',
    '/ratings.parquet',
    '/tags.parquet',
)
movies, actors, genres, ratings, tags = [
    readParquet(DATASET + suffix) for suffix in _INPUT_PARQUET_SUFFIXES
]

## Calculate aggregate movie data for TF-IDF and tags + ratings data for the Jaccard index


In [0]:
# Register the input tables as temporary views so they can be queried with
# SQL. Each view is registered exactly once (the original registered
# 'ratings' twice).
movies.createOrReplaceTempView('movies')
ratings.createOrReplaceTempView('ratings')
tags.createOrReplaceTempView('tags')

# One (movieId, userId) row per tag and per rating. Duplicates are kept here
# (UNION ALL); the Jaccard queries run INTERSECT/UNION on this view later.
tags_ratings = spark.sql(
    '''
    SELECT movieId,userId 
    FROM tags
    UNION ALL
    SELECT movieId,userId 
    FROM ratings
    '''
)

tags_ratings.createOrReplaceTempView('tags_ratings')

# Every unordered pair of distinct movies, emitted once with mov1 < mov2.
movie_pairs = spark.sql(
    '''
    SELECT m.movieId as mov1, m1.movieId as mov2
    FROM movies as m INNER JOIN movies as m1 ON m.movieId < m1.movieId
    '''
)

movie_pairs.createOrReplaceTempView('movie_pairs')

# Per-movie rating count and mean rating.
ratings_agg = spark.sql(
    '''
    SELECT movieId,
          COUNT(*) AS numRatings,
          AVG(rating) AS avgRating
    FROM ratings
    GROUP BY movieId
    '''
)
ratings_agg.createOrReplaceTempView('ratings_agg')

# Movie metadata joined with its rating aggregates; movies without ratings
# get numRatings = 0 and avgRating = 0.0 through the LEFT OUTER JOIN + ifnull.
movies_agg = spark.sql(
    '''
    SELECT movieId, title, year, imdbId, 
           ifnull(numRatings, 0) as numRatings,
           ifnull(avgRating, 0.0) as avgRating
    FROM movies LEFT OUTER JOIN ratings_agg USING(movieId)
    ORDER BY movieId
    '''
)

#spark.catalog.dropGlobalTempView('agg_ratings')
#spark.catalog.dropGlobalTempView('ratings')
#ratings.unpersist()

## Derive words for TF-IDF processing

In [0]:
from pyspark.sql import functions as F

# Utility method to explode strings contained in the given field into a 'word' column
def wordDf(df, field):
  """Explode the text in column `field` into one lower-cased word per row.

  df    -- Spark DataFrame containing the column `field`.
  field -- name of the string column to tokenise.

  Returns `df` with `field` replaced by a 'word' column holding one token
  per row. Empty tokens, which the split produces when the text starts or
  ends with a delimiter, are dropped so that '' is never treated as a word.
  """
  # Raw string: the regex escapes reach Spark unchanged and Python does not
  # warn about invalid escape sequences. The character class is the original
  # one with the duplicated '\[' removed.
  delimiters = r'[ \s\*\+\&\/\%\-\$\#\'\)\(\[\],.!?;:\t\n"]+'
  tokens = F.explode(F.split(F.lower(F.col(field)), delimiters))
  wdf = df.withColumn('word', tokens).drop(field)

  return wdf.filter(F.col('word') != '')


# Tokenise each textual attribute of a movie into word rows.
actor_words = wordDf(actors, 'name')
genre_words = wordDf(genres,'genre')
tag_words = wordDf(tags.drop('userId'), 'tag')
title_words = wordDf(movies.select('movieId','title'), 'title')

# Stack the four word sources into a single DataFrame.
all_words = (
    actor_words
    .union(genre_words)
    .union(tag_words)
    .union(title_words)
)
#if DEBUG:
  #print(all_words.select("word").distinct().count())
  #all_words.orderBy('movieId','word').show()


## Calculate TF-IDF metric





In [0]:
import pyspark.sql.functions as F
# Word lists per movie. This is lazy and no longer used for the document
# count; the name is retained so other cells referencing `df2` keep working.
df2 = all_words.groupBy('movieId').agg(F.collect_list('word').alias('words'))

# Occurrences of each word within each movie.
aggWords = all_words.groupBy('movieId', 'word')\
                    .agg(F.count('word').alias('numWords'))

# Term frequency: occurrences divided by the count of the movie's most
# frequent word.
max_numWords = aggWords.groupBy('movieId').agg(F.max('numWords').alias('max_numWords'))
TF = max_numWords.join(aggWords,'movieId')
TF2 = TF.withColumn("tf", F.col('numWords')/F.col("max_numWords"))

# Number of documents = number of distinct movies that have words. Counting
# distinct ids gives the same value as counting the groups of `df2`, without
# materialising every word list.
N_DOCS = all_words.select('movieId').distinct().count()

# Inverse document frequency: log2(N_DOCS / number of movies containing the word).
IDF = TF2.groupBy('word').agg(F.count('movieId').alias('n'))
IDF2 = IDF.withColumn("idf", F.log2(N_DOCS/F.col("n")))

# tf_idf = tf * idf, restricted to rows at or above the MIN_TF_IDF threshold.
IDF3 = IDF2.join(TF2,'word')
TF_IDF = IDF3.withColumn("tf_idf", F.col('tf')*F.col("idf"))
TF_IDF2 = TF_IDF.select("movieId","word","tf_idf")
TF_IDF2_MIN = TF_IDF2.filter(TF_IDF2.tf_idf>=MIN_TF_IDF)

In [0]:
# Preview the filtered TF-IDF rows (this triggers the Spark computation).
TF_IDF2_MIN.show()

## Movie similarity based on the Jaccard index






In [0]:
from pyspark.sql.types import *

def jaccard(df):
  """Compute the Jaccard index of the user sets of every movie pair in `df`.

  df -- Spark DataFrame with columns `mov1` and `mov2` (one row per pair).

  For each pair, the users found in the `tags_ratings` view for each movie
  are compared: index = |intersection| / |union|. A pair whose union is
  empty (neither movie has a tag or rating) gets 0.0 instead of raising
  ZeroDivisionError.

  Returns a Spark DataFrame with columns movie1, movie2 and j_Index.

  The pairs are collected to the driver once. The original re-collected the
  whole DataFrame on every iteration, which was quadratic and relied on the
  row order being identical between collects.
  """
  ji = []
  for mp in df.collect():
    intersection = spark.sql(
      '''
      SELECT userId from tags_ratings where tags_ratings.movieId = %s
      INTERSECT
      SELECT userId from tags_ratings where tags_ratings.movieId = %s
      ''' %(mp.mov1,mp.mov2)
    )  
    union = spark.sql(
      '''
      SELECT userId from tags_ratings where tags_ratings.movieId = %s
      UNION
      SELECT userId from tags_ratings where tags_ratings.movieId = %s
      ''' %(mp.mov1,mp.mov2)
    ) 
    union_size = union.count()
    # An empty union implies an empty intersection: skip that Spark job.
    index = intersection.count()/union_size if union_size else 0.0
    ji.append([mp.mov1,mp.mov2,index])

  cSchema = StructType([StructField("movie1", IntegerType()),StructField("movie2", IntegerType()),StructField("j_Index", DoubleType())])
  return spark.createDataFrame(ji, schema=cSchema)
  
JaccardTable = jaccard(movie_pairs)

## Write output data to Parquet files and generate ZIP file

In [0]:
%%capture
# Clean up first
!rm -fr "$DATASET"/output 
!rm -f "$DATASET"/"$OUTPUT_ZIP_FILE"

if DEBUG:
  !ls -l $DATASET

if DEBUG:
  writeParquet(movies_agg, DATASET + '/output/' + 'movies_agg.parquet')
  writeParquet(TF_IDF2_MIN, DATASET + '/output/' + 'tf_idf.parquet')
  writeParquet(JaccardTable, DATASET + '/output/' + 'jaccard_index.parquet')

if DEBUG:
  print('Creating ZIP file ...')

!cd "$DATASET"/output  && zip -9qr ../"$OUTPUT_ZIP_FILE" .

if DEBUG:
  !ls -l $DATASET "$DATASET"/output

## Copy output ZIP file to output bucket

In [0]:
%%capture
! gsutil cp $DATASET/output.zip gs://"$OUTPUT_BUCKET"/"$DATASET"/output.zip 

## Copy Parquet files to output bucket (optional)

In [0]:
%%capture
# Optionally upload the three Parquet output directories to
# gs://<OUTPUT_BUCKET>/<DATASET>/ (controlled by the
# COPY_PARQUET_FILES_TO_OUTPUT_BUCKET parameter).
if COPY_PARQUET_FILES_TO_OUTPUT_BUCKET:
  ! gsutil -m cp -r $DATASET/output/movies_agg.parquet gs://"$OUTPUT_BUCKET"/"$DATASET"/
  ! gsutil -m cp -r $DATASET/output/tf_idf.parquet gs://"$OUTPUT_BUCKET"/"$DATASET"/
  ! gsutil -m cp -r $DATASET/output/jaccard_index.parquet gs://"$OUTPUT_BUCKET"/"$DATASET"/

# Loader Cloud Function (LCF)

## (4) Output data → (5) BigQuery

In [0]:
# Imports
import base64
import pandas as pd
import os
import tempfile
import time
from zipfile import ZipFile
import google.cloud.bigquery as bq
import google.cloud.storage as gcs



DEBUG = True
# Colab is detected through the presence of the COLAB_GPU environment variable.
RUNNING_IN_COLAB = os.environ.get('COLAB_GPU') is not None

# Scratch directory for the downloaded archive and the extracted Parquet files.
TMP_DIR = tempfile.mkdtemp(prefix='LCF_')

def debug(message):
  """Print `message` when the DEBUG flag is on; otherwise do nothing."""
  if not DEBUG:
    return
  print(message)

# Authenticate to GCP if running in Colab
if RUNNING_IN_COLAB:
  google_colab_authenticate(PROJECT_ID)

# Clients for BigQuery and Cloud Storage, plus a handle on the output bucket.
BQ_CLIENT = bq.Client(PROJECT_ID)
GCS_CLIENT = gcs.Client(PROJECT_ID)
BUCKET = gcs.Bucket(GCS_CLIENT, OUTPUT_BUCKET)

def get_data_from_cloud_storage(dataset_id):
  """Download '<dataset_id>/<OUTPUT_ZIP_FILE>' from the output bucket into TMP_DIR.

  dataset_id -- name of the dataset whose archive is fetched.
  """
  remote_name = '%s/%s' % (dataset_id, OUTPUT_ZIP_FILE)
  zip_target = '%s/%s' % (TMP_DIR, OUTPUT_ZIP_FILE)
  debug('Downloading gs://%s/%s to %s' % (OUTPUT_BUCKET,remote_name,zip_target))
  archive_blob = gcs.Blob(remote_name, BUCKET)
  with open(zip_target, 'wb') as sink:
    archive_blob.download_to_file(sink)
  

def unzip_data_file():
  """Extract the downloaded archive in TMP_DIR into TMP_DIR itself."""
  archive_path = '%s/%s' % (TMP_DIR, OUTPUT_ZIP_FILE)
  debug('Unzipping %s' % archive_path)
  archive = ZipFile(archive_path)
  try:
    archive.extractall(TMP_DIR)
  finally:
    # Same clean-up a `with` block would perform.
    archive.close()
  debug('Unzipping done')

def _load_parquet_into_table(dataset_id, tid, schema):
  """Create BigQuery table `tid` in `dataset_id` and fill it from Parquet.

  dataset_id -- BigQuery dataset that receives the table.
  tid        -- table id; also the base name of '<TMP_DIR>/<tid>.parquet'.
  schema     -- sequence of bq.SchemaField describing the table columns.

  Raises whatever the BigQuery client raises when the table cannot be
  created or the load job fails. The previous polling loop only waited for
  the job to stop running and never noticed a failed load.
  """
  table_name = '%s.%s.%s' % (PROJECT_ID, dataset_id, tid)

  # Read parquet file
  parquet_files_path = '%s/%s.parquet' % (TMP_DIR, tid)
  debug('Reading Parquet files from %s' % parquet_files_path)
  pdf = pd.read_parquet(parquet_files_path)
  debug(str(pdf.head(5)))

  # Create BigQuery table
  table = bq.Table(table_name)
  table.schema = schema
  debug('Creating %s' % table_name)
  BQ_CLIENT.create_table(table)

  debug('Populating %s with %d rows' % (table_name, len(pdf)))
  load_job = BQ_CLIENT.load_table_from_dataframe(pdf, table)
  debug('waiting for load job to complete')
  # Blocks until the job has finished and raises if it ended in error.
  load_job.result()

  debug('Done with table %s' % table_name)

def load_movie_agg_data(dataset_id):
  """Load movies_agg.parquet into the `movies_agg` table of `dataset_id`."""
  _load_parquet_into_table(dataset_id, 'movies_agg', (
        bq.SchemaField("movieId", "INTEGER", "REQUIRED"),
        bq.SchemaField("title",  "STRING", "REQUIRED"),
        bq.SchemaField("year", "INTEGER", "REQUIRED"),
        bq.SchemaField("imdbId", "INTEGER", "REQUIRED"),
        bq.SchemaField("numRatings", "INTEGER", "REQUIRED"),
        bq.SchemaField("avgRating", "FLOAT", "REQUIRED"),
  ))

def load_tfidf_data(dataset_id):
  """Load tf_idf.parquet into the `tf_idf` table of `dataset_id`."""
  _load_parquet_into_table(dataset_id, 'tf_idf', (
        bq.SchemaField("movieId", "INTEGER", "REQUIRED"),
        bq.SchemaField("word",  "STRING", "REQUIRED"),
        bq.SchemaField("tf_idf", "FLOAT", "REQUIRED"),
  ))

def load_jaccard_index(dataset_id):
  """Load jaccard_index.parquet into the `jaccard_index` table of `dataset_id`."""
  _load_parquet_into_table(dataset_id, 'jaccard_index', (
        bq.SchemaField("movie1", "INTEGER", "REQUIRED"),
        bq.SchemaField("movie2", "INTEGER", "REQUIRED"),
        bq.SchemaField("j_Index", "FLOAT", "REQUIRED"),
  ))


def handle_pubsub_message(event, context):
  """Entry point: reload the BigQuery dataset named in a Pub/Sub event.

  event   -- mapping whose 'data' entry carries the dataset name; taken
             as-is when running in Colab, base64-decoded (UTF-8) otherwise.
  context -- event context; only logged.

  Downloads and unzips the dataset archive, drops and recreates the
  BigQuery dataset, then loads the three output tables into it.
  """
  debug('Event: %s' % event)
  debug('Context: %s' % context)

  payload = event['data']
  dataset_id = payload if RUNNING_IN_COLAB else base64.b64decode(payload).decode('utf-8')

  debug('Dataset: %s' % dataset_id)

  get_data_from_cloud_storage(dataset_id)
  unzip_data_file()

  debug('Deleting previous BiqQuery dataset (if any)')

  BQ_CLIENT.delete_dataset(dataset_id, delete_contents = True, not_found_ok = True)
  BQ_CLIENT.create_dataset(dataset_id)

  debug('Created BiqQuery dataset')

  # Same loading order as before: movies, TF-IDF, Jaccard index.
  for loader in (load_movie_agg_data, load_tfidf_data, load_jaccard_index):
    loader(dataset_id)
  debug('Done for data set %s' % dataset_id)


In [0]:
#handle_pubsub_message({'data': DATASET}, None)

### (3) PubSub Trigger - Send PubSub cloud message 

This will trigger the LCF cloud function. 

In [0]:
!gcloud pubsub topics publish "$PUBSUB_TOPIC" --message "$DATASET"

In [0]:
# Show up to 1000 log entries of the cloud function named by `cloudFunctionName`.
!gcloud functions logs read $cloudFunctionName --limit 1000 

# Search Queries

In [0]:
# Imports
import os
import pandas as pd
import google.cloud.bigquery as bq

DEBUG = True
# Colab is detected through the presence of the COLAB_GPU environment variable.
RUNNING_IN_COLAB = os.environ.get('COLAB_GPU') is not None

def debug(message):
  """Print `message` when the DEBUG flag is on; otherwise do nothing."""
  if not DEBUG:
    return
  print(message)

# Authenticate to GCP if running in Colab
if RUNNING_IN_COLAB:
  google_colab_authenticate(PROJECT_ID)

# BigQuery client used by all search operations.
BQ_CLIENT = bq.Client(PROJECT_ID)

def _dataset_ref(request):
  """Return '<PROJECT_ID>.<dataset>' for the request's 'dataset' argument.

  Table names cannot be passed as query parameters, so the dataset id is
  validated (letters, digits and underscores only) before it is formatted
  into SQL. Raises ValueError for anything else.
  """
  import re  # local import: keeps this block independent of the import cell
  dataset = request.args.get('dataset')
  if not isinstance(dataset, str) or not re.fullmatch(r'[A-Za-z0-9_]+', dataset):
    raise ValueError('Invalid dataset id: %r' % (dataset,))
  return '%s.%s' % (PROJECT_ID, dataset)


def _int_arg(request, name):
  """Return request argument `name` as an int (raises if it is not numeric)."""
  return int(request.args.get(name))


def _search_words(request):
  """Return the 'words' argument as a list of lower-cased, non-empty words.

  The words stored in the tf_idf table are lower-cased when they are
  derived, so the query words are lower-cased too in order to match.
  """
  return (request.args.get('words') or '').lower().split()


def _query_to_html(sql, params=None):
  """Run `sql` (with optional BigQuery query parameters) and render it as HTML."""
  job_config = bq.QueryJobConfig()
  if params:
    job_config.query_parameters = params
  df = BQ_CLIENT.query(sql, job_config=job_config).to_dataframe()
  debug('Returning result with %d rows' % len(df))
  return df.to_html()


def list_movies(request):
  """List the first `max_results` rows of movies_agg, ordered by movieId."""
  sql = '''
      SELECT * FROM `%s.movies_agg` 
      ORDER BY movieId
      LIMIT %d
      ''' % (_dataset_ref(request), _int_arg(request, 'max_results'))
  return _query_to_html(sql)


def list_tfidf(request):
  """List the first `max_results` rows of tf_idf, ordered by movieId and word."""
  sql = '''
      SELECT * FROM `%s.tf_idf` 
      ORDER BY movieId,word
      LIMIT %d
      ''' % (_dataset_ref(request), _int_arg(request, 'max_results'))
  return _query_to_html(sql)


def tfidf_search(request):
  """Rank movies by the average TF-IDF of the requested words.

  The words travel as an array query parameter. Formatting a Python tuple
  into the SQL broke for a single word (rendered as ('word',)) and for any
  word containing a quote.
  """
  ds_id = _dataset_ref(request)
  sql = '''
      SELECT tf.movieId, ma.title, AVG(tf_idf) as Average_TFIDF FROM `%s.tf_idf` as tf,`%s.movies_agg` as ma
      WHERE word IN UNNEST(@words) AND tf.movieId = ma.movieId
      GROUP BY tf.movieId,ma.title
      ORDER BY Average_TFIDF DESC
      LIMIT %d
      ''' % (ds_id, ds_id, _int_arg(request, 'max_results'))
  params = [bq.ArrayQueryParameter('words', 'STRING', _search_words(request))]
  return _query_to_html(sql, params)


def jaccard_index_search(request):
  """List the movies most similar to `movieId` by Jaccard index.

  A pair is stored once, so the movie is looked up on both sides of the pair.
  """
  movieId = _int_arg(request, 'movieId')
  ds_id = _dataset_ref(request)
  sql = '''
      SELECT movie2 as Similar_Movie, j_Index as Jaccard_Index FROM `%s.jaccard_index`
      where movie1 = %d
      UNION ALL
      SELECT movie1 as Similar_Movie, j_Index as Jaccard_Index FROM `%s.jaccard_index`
      where movie2 = %d
      ORDER BY Jaccard_Index DESC
      LIMIT %d
      ''' % (ds_id, movieId, ds_id, movieId, _int_arg(request, 'max_results'))
  return _query_to_html(sql)


def weighted_search(request):
  """Rank movies by a 50/50 blend of TF-IDF relevance and rating popularity.

  The TF-IDF term is the average tf_idf of the matching words divided by
  log2(number of movies). The rating term is
  avgRating * log2(sum of numRatings + 0.01) / (5 * log2(max numRatings)).
  """
  ds_id = _dataset_ref(request)

  # Both normalisation constants come from one query instead of two.
  stats = BQ_CLIENT.query(
      '''
      SELECT count(avgRating) as count, max(numRatings) as maximum from `%s.movies_agg`
      ''' % (ds_id)).to_dataframe()
  c = int(stats.at[0,'count'])
  m = int(stats.at[0,'maximum'])

  sql = '''
      SELECT tf.movieId, movies.title,

      0.5*AVG(tf.tf_idf)/LOG(%d,2) +
      0.5*AVG(movies.avgRating)*LOG(SUM(movies.numRatings+0.01),2)/(5*LOG(%d,2)) 
      AS Average_Weights

      FROM `%s.movies_agg` movies JOIN `%s.tf_idf` tf ON (movies.movieId=tf.movieId)
      WHERE tf.word IN UNNEST(@words)
      GROUP BY tf.movieId, movies.title
      ORDER BY Average_Weights DESC LIMIT %d;
      ''' % (c, m, ds_id, ds_id, _int_arg(request, 'max_results'))
  params = [bq.ArrayQueryParameter('words', 'STRING', _search_words(request))]
  return _query_to_html(sql, params)

def handle_request(request):
  """Validate a search request and dispatch it to the requested operation.

  request -- object exposing `args`, a mapping with at least 'dataset',
             'op' and 'max_results'.

  Returns the HTML produced by the selected operation, or an error string
  when arguments are missing or the operation is unknown.
  """
  if not request.args:
    debug('No arguments given!')
    return 'ERROR: No arguments'

  # (argument, debug message, reply), checked in this order.
  required = (
     ('dataset', 'No dataset specified!', 'ERROR: No dataset has been specified'),
     ('op', 'No operation specified!', 'ERROR: No operation has been specified'),
     ('max_results', 'No result limit specified!', 'ERROR: No result limit has been specified'),
  )
  for key, log_line, reply in required:
    if key not in request.args:
      debug(log_line)
      return reply

  operations = {
     'list_movies': list_movies,
     'list_tfidf': list_tfidf,
     'tfidf_search': tfidf_search,
     'jaccard_index_search' : jaccard_index_search,
     'weighted_search': weighted_search
  }
  dataset = request.args.get('dataset')
  op = request.args.get('op')
  debug('dataset: %s, op: %s' % (dataset,op))

  if op in operations:
    return operations[op](request)
  return 'Invalid operation: %s' % op


In [12]:
dataset = 'medium1' #@param ["tiny1", "tiny2", "tiny3", "tiny4", "medium1", "medium2", "medium3", "medium4", "large1", "large2", "large3", "large4", "large5"] {allow-input: true}
max_results = 10 #@param {type:"slider", min:10, max:1000, step:10}

from IPython.core.display import HTML

# Minimal stand-in for an HTTP request: handle_request only reads `args`.
class ListMoviesReq:
   args = dict(op='list_movies', dataset=dataset, max_results=max_results)

HTML(handle_request(ListMoviesReq()))

dataset: medium1, op: list_movies
Returning result with 10 rows


Unnamed: 0,movieId,title,year,imdbId,numRatings,avgRating
0,24,Powder,1995,114168,9191,3.179306
1,888,The Land Before Time III: The Time of the Great Giving,1995,113596,799,2.319775
2,944,Lost Horizon,1937,29162,1147,3.819965
3,1102,American Strays,1996,115531,86,2.610465
4,1176,La double vie de Véronique,1991,101765,1972,3.889452
5,1483,Crash,1996,115964,3313,3.126019
6,1687,The Jackal,1997,119395,5644,3.202339
7,1920,Small Soldiers,1998,122718,4698,2.826415
8,1939,The Best Years of Our Lives,1946,36868,2045,4.082396
9,1993,Child's Play 3,1991,103956,1114,2.041293


In [13]:
dataset = 'medium1' #@param ["tiny1", "tiny2", "tiny3", "tiny4", "medium1", "medium2", "medium3", "medium4", "large1", "large2", "large3", "large4", "large5"] {allow-input: true}
max_results = 100 #@param {type:"slider", min:100, max:1000, step:100}

# Minimal stand-in for an HTTP request: handle_request only reads `args`.
class ListTFIDFReq:
   args = dict(op='list_tfidf', dataset=dataset, max_results=max_results)

HTML(handle_request(ListTFIDFReq()))

dataset: medium1, op: list_tfidf
Returning result with 100 rows


Unnamed: 0,movieId,word,tf_idf
0,24,a,0.538045
1,24,age,1.059592
2,24,albino,7.357552
3,24,all,0.925033
4,24,allegory,1.392925
5,24,animal,0.864605
6,24,apocalyptic,1.128765
7,24,at,1.059592
8,24,boy,1.059592
9,24,can,1.226259


In [0]:
dataset = 'medium1' #@param ["tiny1", "tiny2", "tiny3", "tiny4", "medium1", "medium2", "medium3", "medium4", "large1", "large2", "large3", "large4", "large5"] {allow-input: true}
words = 'comedy magic children'  #@param {type: "string"}
max_results = 25 #@param {type:"slider", min:5, max:100, step:5}

# Minimal stand-in for an HTTP request: handle_request only reads `args`.
class TFIDFSearch:
   # 'dataset' now comes from this cell's form field. It previously used the
   # notebook-wide DATASET, so the form selection above was silently ignored.
   args = {
            'op': 'tfidf_search',
            'dataset': dataset,
            'words': words,
            'max_results': max_results
          }

HTML(handle_request(TFIDFSearch()))

dataset: medium1, op: tfidf_search
Returning result with 25 rows


Unnamed: 0,movieId,title,Average_TFIDF
0,122982,Shônen Sarutobi Sasuke,4.035624
1,145964,Children of Eve,4.035624
2,177369,Tri tolstyaka,4.035624
3,183387,Sorochinskaya yarmarka,4.035624
4,153332,Snegurochka,4.035624
5,184963,Taking Flight,4.035624
6,138948,Aladdin and the Death Lamp,3.386295
7,105217,Zambezia,3.000429
8,126482,Strange Magic,2.029449
9,152137,Pirate's Passage,2.017812


In [0]:
dataset = 'medium1' #@param ["tiny1", "tiny2", "tiny3", "tiny4", "medium1", "medium2", "medium3", "medium4", "large1", "large2", "large3", "large4", "large5"] {allow-input: true}
words = 'comedy magic children'  #@param {type: "string"}
max_results = 25 #@param {type:"slider", min:5, max:100, step:5}

# Minimal stand-in for an HTTP request: handle_request only reads `args`.
class WeightedSearch:
   args = dict(
       op='weighted_search',
       dataset=dataset,
       words=words,
       max_results=max_results,
   )

HTML(handle_request(WeightedSearch()))

dataset: medium1, op: weighted_search
Returning result with 25 rows


Unnamed: 0,movieId,title,Average_Weights
0,68157,Inglourious Basterds,0.492039
1,3052,Dogma,0.416312
2,2890,Three Kings,0.402918
3,95441,Ted,0.329642
4,177369,Tri tolstyaka,0.309873
5,105217,Zambezia,0.306577
6,45431,Over the Hedge,0.30517
7,5942,The Hot Chick,0.304745
8,4090,The Brave Little Toaster,0.300998
9,7048,Nothing to Lose,0.294425


In [18]:
dataset = 'tiny4' #@param ["tiny1", "tiny2", "tiny3", "tiny4", "medium1", "medium2", "medium3", "medium4", "large1", "large2", "large3", "large4", "large5"] {allow-input: true}
movieId = 4  #@param {}
max_results = 100 #@param {type:"slider", min:100, max:1000, step:100}

# Minimal stand-in for an HTTP request: handle_request only reads `args`.
class JISearch:
   args = dict(
       op='jaccard_index_search',
       dataset=dataset,
       movieId=movieId,
       max_results=max_results,
   )

HTML(handle_request(JISearch()))


dataset: tiny4, op: jaccard_index_search
Returning result with 49 rows


Unnamed: 0,Similar_Movie,Jaccard_Index
0,46,0.166176
1,27,0.106842
2,45,0.104952
3,20,0.097149
4,31,0.084462
5,22,0.082092
6,15,0.081314
7,23,0.080335
8,42,0.079823
9,11,0.079697
