In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [None]:
# load data; current representation pandas dataframe
file = "IMDB-Movie-Data.csv"
data = pd.read_csv(file)

def data_preprocessing(data):
    # lower case everything; more preprocessing can be done 
    for column in ['Title', 'Genre', 'Description', 'Director', 'Actors']:
        data[column] = data[column].apply(lambda x: x.lower())
    data['Genre'] = data['Genre'].apply(lambda x: x.replace(',', ' '))
    return data

data = data_preprocessing(data)
data.sample(2)

In [52]:
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
#nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer

## get query
#query = input("Enter Movie query: ")
#query = list(query.split())


# Retrieves the query from the user and returns it
def get_user_query():
    userInput = input("Enter search query: ")
    print("Literal query " + userInput)
    return userInput

# This function processes the user query
def query_representation_function(query):
    # Convert to lowercase
    query = query.lower()
    # Remove Punctuation ( !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ )
    query_removed_punctuation = "".join([char for char in query if char not in string.punctuation])
    # Tokenization
    query_tokenized = word_tokenize(query_removed_punctuation)
    # Stopword Filtering
    english_stopwords = stopwords.words('english')
    query_removed_stopwords = [word for word in query_tokenized if word not in english_stopwords]
    # Stemming
    porter = PorterStemmer()
    query_stemmed = [porter.stem(word) for word in query_removed_stopwords]
    
    return query_stemmed

def test_print():
    test_query = "This isn't a very long test query... I hope that I will be receiving a good answer! Give me results from the 2016 elections"
    print("Test query:\n" + test_query)
    print("Processed query:")
    print(query_representation_function(test_query))
    print("Test query after processing:\n" + test_query)

test_print()

Test query:
This isn't a very long test query... I hope that I will be receiving a good answer! Give me results from the 2016 elections
Processed query:
['isnt', 'long', 'test', 'queri', 'hope', 'receiv', 'good', 'answer', 'give', 'result', '2016', 'elect']
Test query after processing:
This isn't a very long test query... I hope that I will be receiving a good answer! Give me results from the 2016 elections


In [None]:
# scoring function
def get_score(row, query):
    score = 0
    for word in query:
        for col in ['Title', 'Genre', 'Description', 'Director', 'Actors', 'Year']:
            if word in str(row[col]):
                # for now matching a keyword in any column counts the same (insufficient for now ofc)
                score += 1    
    return score


def rank_movies(data, query):
    data['Score'] = 0
    for i, row in tqdm(data.iterrows()):
        score = get_score(row, query)
        data.at[i,'Score'] = score

    # first sort on Score then Rating
    data.sort_values(['Score', 'Rating'], ascending = False, inplace = True)
    
    return data
    
ranked_data = rank_movies(data, query)

# display top 5 movies
#ranked_data.head(5)

In [None]:
# display results
