In [63]:
import pandas as pd
from tokenize import tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string

In [25]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/michalpurtak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [53]:
stop_words = set(stopwords.words('english'))

In [237]:
films_df[:5]

Unnamed: 0,name,abstract
0,12 Monkeys,12 Monkeys is a 1995 American neo-noir science...
1,Airplane!,"Airplane! (titled Flying High! in Australia, N..."
2,American Beauty,American Beauty is a 1999 American drama film ...
3,Amélie,Amélie (also known as Le Fabuleux Destin d'Amé...
4,Andrzej Wajda,Andrzej Witold Wajda (Polish: [ˈandʐɛj ˈvajda]...


In [245]:
def remove_stop_words(row):
    return [w for w in row['abstract'] if not w in stop_words and not w in string.punctuation]

In [239]:
def tokenize(row):
    return word_tokenize(row['abstract'])

In [241]:
def stemWords(row):
    porter = SnowballStemmer("english", ignore_stopwords=True)
    words = []
    for word in row['abstract']:
        words.append(porter.stem(word))
    return words

In [None]:
#TO DO
games_df = pd.read_csv("games.csv")

In [251]:
films_df = pd.read_csv("films.csv")
tokenized_abtracts_films = films_df.apply(tokenize, axis=1)
films_tokenized_df = pd.DataFrame({'name':films_df['name'], 'abstract':tokenized_abtracts_films})
stemmed_abstracts_films = films_tokenized_df.apply(stemWords, axis=1)
films_tokenized_stemmed_df = pd.DataFrame({'name':films_tokenized_df['name'], 'abstract':stemmed_abstracts_films})
stopwords_removed_abstracts_films = films_tokenized_stemmed_df.apply(remove_stop_words, axis=1)
films_tokenize_stemmed_df = pd.DataFrame({'name':films_tokenized_df['name'], 'abstract':stopwords_removed_abstracts_films})

In [264]:
books_df = pd.read_csv("books.csv")
tokenized_abstracts = books_df.apply(tokenize, axis=1)
books_tokenized_df = pd.DataFrame({'name':books_df['name'], 'abstract':tokenized_abstracts})
books_tokenized_df.to_csv("books_tokenized.csv")
stemmed_abstracts = books_tokenized_df.apply(stemWords, axis = 1)
books_tokenized_stemmed_df = pd.DataFrame({'name':books_tokenized_df['name'], 'abstract':stemmed_abstracts})
stopwords_removed_abstracts = books_tokenized_stemmed_df.apply(remove_stop_words, axis=1)
books_tokenized_stemmed_df = pd.DataFrame({'name':books_tokenized_df['name'], 'abstract':stopwords_removed_abstracts})

In [221]:
books_tokenized_stemmed_df.to_csv("books_tokenized_stemmed.csv")

In [268]:
def create_search_index(dataset_tokenized_stemmed_df):
    titles = dataset_tokenized_stemmed_df['name']
    abstracts = dataset_tokenized_stemmed_df['abstract']
    search_index = dict()
    number_of_doc = 0
    for abstract in abstracts:
        for word in abstract:
            if word not in search_index:
                search_index[word] = dict()
            if titles[number_of_doc] not in search_index[word]:
                search_index[word][titles[number_of_doc]] = 1
            else:
                search_index[word][titles[number_of_doc]] += 1
        number_of_doc += 1
    return search_index

In [258]:
def get_max_freq(document, search_index):
    max_freq = 0
    for key in search_index.keys():
        try:
            frequency = search_index[key][document]
        except KeyError:
            continue
        if frequency > max_freq:
            max_freq = frequency
    return max_freq

In [259]:
from math import log10
def get_idf(term, search_index):
    n_t = len(list(search_index[term]))
    n = len(book_names)
    return log10(n_t/n)
    

In [260]:
def calculate_tf_idf(term, document, search_index):
    freq = search_index[term][document]
    tf = freq/get_max_freq(document, search_index)
    idf = get_idf(term, search_index)
    return tf*idf
    

In [275]:
import copy
def create_tf_idf_index(search_index):    
    search_index_tf_idf = copy.deepcopy(search_index)
    for term in search_index_tf_idf.keys():
        for document in search_index_tf_idf[term]:
            freq = search_index_tf_idf[term][document]
            search_index_tf_idf[term][document] = (freq, calculate_tf_idf(term, document, search_index))
    return search_index_tf_idf


In [278]:
search_index_films = create_search_index(films_tokenized_stemmed_df)
search_index_books = create_search_index(books_tokenized_stemmed_df)
search_index_films_tf_idf = create_tf_idf_index(search_index_films)
search_index_books_tf_idf = create_tf_idf_index(search_index_books)

In [286]:
def execute_query(query, search_index_tf_idf):
    try:
        query_tokenized = word_tokenize(query)
        porter = SnowballStemmer("english", ignore_stopwords=True)
        query_tokenized_stemmed = []
            
        for word in query_tokenized:
            query_tokenized_stemmed.append(porter.stem(word))
        search_results_for_each_term_tf_idf = [search_index_tf_idf[term] for term in query_tokenized_stemmed]
        search_results_for_each_term_tf_idf_set = [set(search_index_tf_idf[term].keys()) for term in query_tokenized_stemmed]
        search_results_for_query = set.intersection(*search_results_for_each_term_tf_idf_set)
        
        results = []
        for document in search_results_for_query:
            tf_idf = 0
            for term_search_result in search_results_for_each_term_tf_idf:
                tf_idf += term_search_result[document][1]
            results.append((document, tf_idf))
        results = sorted(results, key = lambda x: x[1], reverse=True)
        return results
    except KeyError:
        return "no results"

In [288]:
execute_query("player", search_index_films_tf_idf)

[('Pulp Fiction', -0.08229069443023541),
 ('La Vallée', -0.2019862499651233),
 ('Boss Nigger', -0.7406162498721187)]

In [None]:
def search_for_entertainment(query, include_books, include_films):
    

In [233]:
query = input("enter query")
print("search results for query: " + query)
execute_query(query)

enter queryscience fiction novel
search results for query: science fiction novel


[('Death Star', -0.2366057390712093),
 ('Space Cadet', -0.33124803469969305),
 ("Ender's Game", -0.40866730719838584),
 ('Heart of Gold', -0.41406004337461627),
 ('The Invisible Man', -0.41406004337461627),
 ('Fiasko', -0.42587683727451814),
 ('Hunted', -0.42587683727451814),
 ('Fiasco', -0.42587683727451814),
 ('The Illuminatus! Trilogy', -0.4516908871648183),
 ('The Last Man', -0.473203044983882),
 ('Ragamuffin', -0.5205056398493433),
 ('2010: Odyssey Two', -0.5205056398493433),
 ('The Taint', -0.5520800578328217),
 ('The Shockwave Rider', -0.5520800578328217),
 ('Headlong', -0.5914890482024133),
 ('Cowl', -0.5914890482024133),
 ('Consider Phlebas', -0.5914890482024133),
 ('Vingt mille lieues sous les mers', -0.5914890482024133),
 ('Larger than Life', -0.6309373933118425),
 ('Kaleidoscope', -0.6309373933118425),
 ('Love and War', -0.6474080428711634),
 ('Stranger in a Strange Land', -0.7097947287908635),
 ('The Diamond Age', -0.7097947287908635),
 ('Now Wait for Last Year', -0.709794

In [219]:
search_index["blue"]

{'Between Planets': 1,
 'Blue Box': 2,
 'Black Box: The Complete Original Black Sabbath (1970–1978)': 1,
 'Bloody Jack': 1,
 'Blue Light': 1,
 'Blue Murder': 3,
 'Love and War in the Apennines': 1}