In [10]:
import whoosh as wh
from whoosh import fields
from whoosh import index
from whoosh import query
import pandas as pd
import os

In [1]:
#! pip install whoosh

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 1.5 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25ldone
[?25h  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=794579 sha256=40f8b2c8367b666c4943080531e7d93cefca26b1e2a30897c52cf16baeb6605c
  Stored in directory: /home/ignacio/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


## https://whoosh.readthedocs.io/en/latest/quickstart.html

## Ejemplo de Whoosh
### https://whoosh.readthedocs.io/en/latest/intro.html

In [20]:
# Read the movie/genre table; later cells use its "movieID" and "genre" columns.
df_movies_genres = pd.read_csv(
    './data/movie_genres.csv',
)

# Display the distinct genre labels as a quick sanity check of the data.
df_movies_genres.genre.unique()

array(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Mystery', 'Sci-Fi', 'IMAX', 'Documentary', 'War', 'Musical',
       'Film-Noir', 'Western', 'Short'], dtype=object)

In [15]:
# One-time setup, left disabled: creates the directory that holds the Whoosh
# index. Running it a second time would raise FileExistsError, so it is only
# meant to be uncommented when 'index_dir' does not exist yet.
#os.mkdir('index_dir')

In [16]:
# Schema of the search index, one document per movie:
#   id     -> movie identifier, stored so it is returned with every hit.
#   genres -> comma-separated keyword field (commas=True), lowercased for
#             indexing (lowercase=True) and stored so hits can display it.
schema = fields.Schema(
    id=fields.ID(stored=True),
    genres=fields.KEYWORD(
        stored=True,
        lowercase=True,
        commas=True,
    ),
)

In [17]:
# Build the on-disk index inside index_dir/.
# The directory is created here when missing, so this cell also works on a
# fresh checkout / fresh kernel without any manual setup step.
os.makedirs("index_dir", exist_ok=True)
ix = wh.index.create_in("index_dir", schema)

# Collapse the table to one comma-separated genre string per movie in a single
# pass. sort=False keeps movies in order of first appearance, the same order
# that iterating over movieID.unique() would give.
genres_by_movie = (
    df_movies_genres
    .groupby("movieID", sort=False)["genre"]
    .agg(",".join)
)

# The writer is used as a context manager: it commits when the block finishes
# and cancels if an exception is raised, so a failed run does not leave the
# index locked (which previously forced re-running everything).
with ix.writer() as writer:
    for mid, genres in genres_by_movie.items():
        writer.add_document(id=str(mid), genres=genres)

In [18]:
# Find movies tagged with BOTH "action" and "drama".
# `&` combines the two terms with AND; `|` would combine them with OR.
# Terms are written in lowercase because the schema lowercases the genres field.
query = wh.query.Term("genres", "action") & wh.query.Term("genres", "drama")

# A searcher holds open files on the index, so it is opened in a `with` block
# and closed as soon as the hits have been printed.
with ix.searcher() as searcher:
    # limit=None returns every matching document instead of only the top hits.
    results = searcher.search(query, limit=None)
    for r in results:
        print(r)

'}>
<Hit {'genres': 'Action,Drama,Thriller,Western', 'id': '3681'}>
<Hit {'genres': 'Action,Crime,Drama,Thriller', 'id': '3682'}>
<Hit {'genres': 'Action,Comedy,Crime,Drama', 'id': '3716'}>
<Hit {'genres': 'Action,Crime,Drama,Thriller', 'id': '3729'}>
<Hit {'genres': 'Action,Drama,War', 'id': '3753'}>
<Hit {'genres': 'Action,Crime,Drama,Thriller', 'id': '3761'}>
<Hit {'genres': 'Action,Adventure,Drama', 'id': '3805'}>
<Hit {'genres': 'Action,Adventure,Drama', 'id': '3907'}>
<Hit {'genres': 'Action,Drama,Thriller', 'id': '3946'}>
<Hit {'genres': 'Action,Crime,Drama,Thriller', 'id': '3947'}>
<Hit {'genres': 'Action,Adventure,Drama,Fantasy', 'id': '3996'}>
<Hit {'genres': 'Action,Drama,War,Western', 'id': '4042'}>
<Hit {'genres': 'Action,Crime,Drama', 'id': '4064'}>
<Hit {'genres': 'Action,Crime,Drama', 'id': '4065'}>
<Hit {'genres': 'Action,Comedy,Crime,Drama', 'id': '4085'}>
<Hit {'genres': 'Action,Adventure,Drama,Thriller', 'id': '4142'}>
<Hit {'genres': 'Action,Drama', 'id': '4163'}>


In [40]:
# User profile: weight given to each (lowercase) genre keyword.
PERFIL = {
    "action": 0.1,
    "comedy": 0.9,
}

# OR together one Term per genre in the profile. Deriving the query from
# PERFIL keeps the two in sync: every term that can match is guaranteed to
# have a weight, and adding a genre to the profile automatically searches it.
query = wh.query.Or([wh.query.Term("genres", genre) for genre in PERFIL])

# The searcher is closed automatically when the block ends.
with ix.searcher() as searcher:
    # terms=True makes each hit remember which query terms it matched.
    results = searcher.search(query, limit=None, terms=True)
    for r in results:
        # matched_terms() yields (field name, term) pairs; the term comes back
        # as bytes, so it is decoded before being looked up in the profile.
        # A movie's score is the sum of the weights of the genres it matched.
        score = sum(PERFIL[g.decode('utf-8')] for _, g in r.matched_terms())
        print(r, score)

'genres': 'Comedy,Drama', 'id': '50583'}> 0.9
<Hit {'genres': 'Comedy,Drama,Romance', 'id': '50685'}> 0.9
<Hit {'genres': 'Comedy,Drama,Romance', 'id': '50792'}> 0.9
<Hit {'genres': 'Action,Crime,Drama,Thriller', 'id': '50794'}> 0.1
<Hit {'genres': 'Adventure,Comedy', 'id': '50798'}> 0.9
<Hit {'genres': 'Comedy,Drama,Romance', 'id': '50802'}> 0.9
<Hit {'genres': 'Comedy,Romance', 'id': '50806'}> 0.9
<Hit {'genres': 'Comedy,Drama', 'id': '50842'}> 0.9
<Hit {'genres': 'Animation,Children,Comedy,Fantasy', 'id': '50872'}> 0.9
<Hit {'genres': 'Comedy', 'id': '50970'}> 0.9
<Hit {'genres': 'Action,Adventure,Drama,War', 'id': '51007'}> 0.1
<Hit {'genres': 'Action,Fantasy,Thriller', 'id': '51077'}> 0.1
<Hit {'genres': 'Comedy,Romance', 'id': '51082'}> 0.9
<Hit {'genres': 'Comedy,Romance', 'id': '51084'}> 0.9
<Hit {'genres': 'Comedy', 'id': '51088'}> 0.9
<Hit {'genres': 'Comedy,Drama,Romance', 'id': '51094'}> 0.9
<Hit {'genres': 'Action,Adventure', 'id': '51277'}> 0.1
<Hit {'genres': 'Action,Sci