In [23]:
from elasticsearch import helpers, Elasticsearch
import numpy as np
import pandas as pd


path = "C:/Users/13418/Desktop/practice_place/IR_place/wiki_movie_plots_deduped.csv"
df = pd.read_csv(path)
#print( df.head() )
print( df.shape )

# dataframe cols/features
def DFcols(df):
    cols = []
    for col in df:
        cols.append(col)
    #print( cols )
    return cols

cols = DFcols(df)
print( "columns: ", cols )

# sample of 1000 articles, randomly
num = 1000
sample = df.sample(num, random_state=6)
#print( sample.head() )
print( sample.shape )

def basicInfo(df, verbose=True):
    RY_range = list( set( df['Release Year'] ) )
    origin_range = list( set( df['Origin/Ethnicity'] ) )
    genre_range = list( set( df['Genre'] ) )
    gen = set()
    for i in genre_range:
        if '/' in i:
            tmp = i.split('/')
            for w in tmp:
                gen.add(w.strip())
        elif ',' in i:
            tmp = i.split(',')
            for w in tmp:
                gen.add(w.strip())
        else:
            gen.add(i)
    genre_range = list( gen )
    if verbose:
        print("Release Year: \t", RY_range)
        print("Origin/Ethnicity: \t", origin_range)
        print("Genre: \t", genre_range)
    return RY_range, origin_range, genre_range

print("the basic info about those movie collection:")
basicInfo(sample)
print()

(34886, 8)
columns:  ['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast', 'Genre', 'Wiki Page', 'Plot']
(1000, 8)
the basic info about those movie collection:
Release Year: 	 [1911, 1912, 1913, 1915, 1917, 1918, 1919, 1920, 1921, 1923, 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]
Origin/Ethnicity: 	 ['Bengali', 'Malaysian', 'Chinese', 'Kannada', 'Punjabi', 'Marathi', 'Turkish', 'Hong Kong', 'Bollywood', 'Canadian', 'American', 'British', 'South_Korean', 'Malayalam', 'Eg

In [24]:
from datetime import datetime

# connect to Elasticsearch
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])


# 
# indexing
# 
print( "indexing documents ..." )


# index name, doc type
my_index = 'ir_hw'
my_doc_type = 'movie'
print( "index name: ", my_index, "\t doc type: ", my_doc_type)

# the document is mapping as the following structure
mapping = {
    "properties": {
        "id":{
            "type":"long"
        },
        "Release Year": {
            "type": "keyword"
        },
        'Title':{
            "type": "text",
            "fields": {
                "field": { "type": "keyword" }                
            }
        },
        'Origin/Ethnicity':{
            "type": "text",
            "fields": {
                "field": { "type": "keyword" }                
            }
        },
        'Director': {
            "type": "text",
            "fields": {
                "field": { "type": "keyword" }                
            }
        },
        'Cast': {
            "type": "text",
            "fields": {
                "field": { "type": "keyword" }                
            }
        },
        'Genre': {
            "type": "text",
            "fields": {
                "field": { "type": "keyword" }                
            }
        },
        'Wiki Page': {
            "type": "keyword"
        },
        'Plot': {
            "type": "text",
            "fields": {
                "field": { "type": "keyword" }                
            }
        }
    }
}

# create the index with doc type, put the mapping
es.indices.delete(index=my_index, ignore=[400, 404])
es.indices.create(index=my_index, ignore=400)
es.indices.put_mapping(index=my_index, doc_type=my_doc_type, body=mapping, include_type_name = True)

# there may be some NaN values in the sample data
sample = sample.replace(np.nan, '', regex=True) # deal with NaN
# upload a sample of 1000 articles with full text to Elasticsearch
index_list = []
for ind, row in sample[:num].iterrows():
    my_doc = row.to_dict()
    es.index(index=my_index, doc_type=my_doc_type, id=ind, body=my_doc) #, ignore=400)
    index_list.append( ind )
#print( index_list[:5] )
print( "upload a sample of 1000 articles with full text to Elasticsearch" )
print()

indexing documents ...
index name:  ir_hw 	 doc type:  movie
upload a sample of 1000 articles with full text to Elasticsearch



In [25]:
print( index_list[:10] )
idx = index_list[9]
print( idx )
res = es.get(index=my_index, id=idx)
test_plot = res['_source']['Plot']
print(test_plot) 
print()

[26468, 20176, 18988, 8544, 10929, 26272, 7545, 3745, 11615, 25870]
25870
Vijay (Sunil Shetty) sole ambition is to become a collector like his father was. He is concentrating completely on his studies except for a few disturbances by Pooja (Anjali Jathar) who loves him and wants to marry him. Vijay always believed that his parents were in fact murdered and not killed in an accident, by their old faithful security officer Babu (Danny Denzongpa). Vijay starts searching for Babu, and finally when he finds him, Babu tells him that he was merely used as a pawn to cover up his parent's murder and the actual killer is roaming scot free, and only one man can lead Vijay to the killer and that is Girdhari (Lalit Tiwari) in Khandala. But before Vijay can find the truth, Girdhari is killed. Vijay and Babu start working in a casino owned by Shanti Prasad (Anupam Kher). There the local don Rana (Mohan Joshi) becomes their enemy, who is after Shanti Prasad's casino and Vijay's life. Finally when Vija

In [15]:

#
# Sentence Splitting, Tokenization and Normalization
#
print( "Sentence Splitting, Tokenization and Normalization ..." )

# remove stopwords and Tokenization 
def tokenization(es, inputText, analyzer="english"):
    #analyzer = ['english'] # stop
    res = es.indices.analyze(body={"analyzer" : analyzer,"text" : inputText})
    tokens = []
    for i in res['tokens']:
        #print(i['token'])
        tokens.append( i['token'] )
    return tokens

# Sentence Splitting
def senSplit(es, inputText, analyzer="english"):
    sen_dic = {}
    s_counter = 1
    sentence_delimiter = '. '
    sentences = inputText.split(sentence_delimiter)
    for sentence in sentences:
        sentence = tokenization(es, sentence, analyzer)
        if len(sentence) > 0:
            sen_dic[s_counter] = sentence
            s_counter += 1
    return sen_dic

# Normalization 


inputText = input("please input your query: \n")
print()
print(inputText)

Sentence Splitting, Tokenization and Normalization ...
please input your query: 
Balakrishnan (Pasupathy) is the owner of a barber shop in a remote village. His family consists of his wife Sridevi (Meena) and three children. Balakrishnan struggles to persuade customers to visit his barber shop while his competitor Shanmugham (Vadivelu), a former worker of Balakrishnan who has his own barber shop across from Balakrishnan's, uses cunning means to make business in haircutting. Balakrishnan's downfall in business causes him to neglect paying his children's school fees. Although he struggles to make a living, he still enjoys spending time with his family.

Balakrishnan (Pasupathy) is the owner of a barber shop in a remote village. His family consists of his wife Sridevi (Meena) and three children. Balakrishnan struggles to persuade customers to visit his barber shop while his competitor Shanmugham (Vadivelu), a former worker of Balakrishnan who has his own barber shop across from Balakrishn

In [16]:

#
# Selecting Keywords
#
print("Selecting Keywords ...")

# form word-set from your input text
def termSet(sen_dic):
    ws = set()
    for i in sen_dic:
        ws = ws.union( set(sen_dic[i]) )
    return ws

# calculate the term frequency for every sentence
def termFre(ws, sen):
    tf = dict.fromkeys(ws, 0)
    for i in sen:
        tf[i] = tf[i] + 1
    doc_len = len(sen)
    for i in tf:
        tf[i] = tf[i] / doc_len
    return tf

import math

# IDF, calculate the idf for every word/token
def termIDF(ws, sen_dic):
    N = len( sen_dic )
    idf = dict.fromkeys(ws, 0)
    for i in idf:
        c = 0
        for j in sen_dic:
            if i in sen_dic[j]:
                c = c + 1
                #rint(i, sen_dic[j])
        idf[i] = math.log( N/c )
        #print( i )
    return idf

# calculate the weight for every word in every document/sentence
# sen_dic, dict that includes many sentences split by inputText
def calWeight(sen_dic):
    ws = termSet(sen_dic)
    idf = termIDF(ws, sen_dic)
    weights = {}
    for i in sen_dic:
        sen = sen_dic[i]
        tf = termFre(ws, sen)
        wgt = {}
        for j in tf:
            w = tf[j] * idf[j]
            if w > 0: # only reserve the terms whose weight > 0
                wgt[j] = w
        # order by weight:
        #wgt = sorted(wgt.items(),key=lambda x:x[1],reverse=True)
        wgt = dict(sorted(wgt.items(), key=lambda item: item[1],reverse=True))
        #print(wgt)
        weights[i] = wgt        
    return weights

from collections import Counter
# select keywords
def selectKeys(weights, top=10):
    keys = set()
    for i in weights:
        c = Counter( weights[i] )
        L = len(weights[i])
        if L > top:
            L = top
        most_common = c.most_common(L)
        tmp = [key for key, val in most_common]
        keys = keys.union( set(tmp) )
    keys = list( keys )
    return keys


#
print( "processed your input text and select keys from it as follows: " )

sen_dic = senSplit(es, inputText, "stop")
"""
count = 0
for i in sen_dic:
    print( i, sen_dic[i] )
    count = count + len( sen_dic[i] )
print(count)
"""

weights = calWeight(sen_dic)
keys = selectKeys(weights)
keys = " ".join(keys)
print( keys )
print()

Selecting Keywords ...
processed your input text and select keys from it as follows: 
causes consists children visit he who former family haircutting s wife make neglect vadivelu while across uses downfall barber shop paying meena business owner remote balakrishnan his although school enjoys still three living time sridevi pasupathy him spending struggles village fees



In [17]:
#
# Stemming or Morphological Analysis
#
print( "Stemming or Morphological Analysis" )
setting2 = {
  "settings": {
    "analysis": {
      "filter": {
        "english_stop": {
          "type":       "stop",
          "stopwords":  "_english_"
        },
        "light_english_stemmer": {
          "type":       "stemmer",
          "language":   "light_english" 
        },
        "english_possessive_stemmer": {
          "type":       "stemmer",
          "language":   "possessive_english"
        }
      },
      "analyzer": {
        "english": {
          "tokenizer":  "standard",
          "filter": [
            "english_possessive_stemmer",
            "lowercase",
            "english_stop",
            "light_english_stemmer", 
            "asciifolding" 
          ]
        }
      }
    }
  }
}


# closr first, then add settings, then open
es.indices.close(index=my_index)
es.indices.put_settings(index=my_index, body=setting2 )
# es.indices.put_mapping(index=my_index, doc_type=my_doc_type, body=mapping, include_type_name = True)
es.indices.open(index=my_index)

print( "here is the used analysis filter & analyzer for following search: \t", setting2 )
print()

Stemming or Morphological Analysis
here is the used analysis filter & analyzer for following search: 	 {'settings': {'analysis': {'filter': {'english_stop': {'type': 'stop', 'stopwords': '_english_'}, 'light_english_stemmer': {'type': 'stemmer', 'language': 'light_english'}, 'english_possessive_stemmer': {'type': 'stemmer', 'language': 'possessive_english'}}, 'analyzer': {'english': {'tokenizer': 'standard', 'filter': ['english_possessive_stemmer', 'lowercase', 'english_stop', 'light_english_stemmer', 'asciifolding']}}}}}



In [19]:

#
# Searching
#
print( "Searching ..." )

queryText = keys
searchContent = [ 'Title', 'Plot', 'Director', 'Cast', 'Wiki Page']

# generate query to search, gievn input Text
def generateQuery(queryText, origin="", genre="", yearFrom=1900, yearTo=2022, searchContent=[ 'Title', 'Plot', 'Director', 'Cast', 'Wiki Page']):
    if len(queryText) == 0: # return all
        query_body = { "query":{ "match_all":{} } }
        return query_body
    
    # basic query
    query_body = { "query": {  "bool": {  "must": [ { "multi_match": { "query": queryText, "fields" : searchContent } } ],
          "filter": [ { "range": { "Release Year":{"gt":yearFrom, "lt":yearTo} }}  ] } } }
    
    # when user decide certain fields such as Origin/Ethnicity, Genre
    if len(origin) > 0:
        query_body["query"]["bool"]["filter"].append( { "match": { 'Origin/Ethnicity':  origin }} )
    if len(genre) > 0:
        query_body["query"]["bool"]["filter"].append( { "match": { 'Genre':  genre }} )
    return query_body

# search, print the top 10 recall results
def searching(es, my_index, querybody):
    result = es.search(index=my_index, body=querybody)
    recallNum = result['took']
    recallContent = result['hits']['hits']
    top = 10
    if top>recallNum:
        top = recallNum
    print( "find the top ", top, " most relevant results: \n" )
    for it in recallContent[:top]:
        print( "index: ", it['_id'], "\t relevant score: ", it['_score'] )
        content = it['_source']
        print( "Release Year: ", content['Release Year'], "\t Origin/Ethnicity: ", content['Origin/Ethnicity'] )
        print( "Genre: ",  content['Genre'] )
        print( "Title: ", content['Title'] )
        print( "Plot: ", content['Plot'][:100], "..." )
        print(  )
    return result

#print( queryText )
querybody = generateQuery(queryText)
searching(es, my_index, querybody)
print()
querybody

Searching ...
find the top  10  most relevant results: 

index:  30857 	 relevant score:  111.23846
Release Year:  2008 	 Origin/Ethnicity:  Tamil
Genre:  drama
Title:  Kuselan
Plot:  Balakrishnan (Pasupathy) is the owner of a barber shop in a remote village. His family consists of h ...

index:  27939 	 relevant score:  24.937393
Release Year:  2006 	 Origin/Ethnicity:  Malayalam
Genre:  unknown
Title:  Palunku
Plot:  Palunku tells the story of an industrious farmer Monichan and his family, which consists of his wife ...

index:  30771 	 relevant score:  23.944681
Release Year:  2007 	 Origin/Ethnicity:  Tamil
Genre:  drama
Title:  Viyabari
Plot:  Suryaprakash (S. J. Suryah) is a business man who wants to become richer than Bill Gates and in the  ...

index:  23112 	 relevant score:  23.771107
Release Year:  1979 	 Origin/Ethnicity:  Hong Kong
Genre:  unknown
Title:  The Dragon's Snake Fist
Plot:  Once upon a time there are two fighters Chu Man King and Master Wai who were from differ

{'query': {'bool': {'must': [{'multi_match': {'query': 'causes consists children visit he who former family haircutting s wife make neglect vadivelu while across uses downfall barber shop paying meena business owner remote balakrishnan his although school enjoys still three living time sridevi pasupathy him spending struggles village fees',
      'fields': ['Title', 'Plot', 'Director', 'Cast', 'Wiki Page']}}],
   'filter': [{'range': {'Release Year': {'gt': 1900, 'lt': 2022}}}]}}}

In [21]:

# 
# Engineering a Complete System
#

RY, orgin, genre = basicInfo(df, verbose=False)
yearFrom = RY[0]
yearTo = RY[-1]
origin = " ".join(orgin)

def analyzeToStr(es, text, analyzer="english"):
    re = es.indices.analyze(body={"analyzer" : analyzer, "text" : text })
    re = re['tokens']
    tmp = set()
    for it in re:
        tmp.add(it['token'])
    tmp = list(tmp)
    return " ".join(tmp)

genre = " ".join(genre)
"""

"""
genre = analyzeToStr(es, genre)
tmp = genre.split(' ')
gs = set()
for it in tmp:
    gs.add( it.strip() )
genre = " ".join(list( gs ))
#print(genre)

def process(text):
    tmp = text.split(',')
    return " ".join(tmp)

def showDetail(res):
    print("here is the detail about this article: ")
    print()
    detail = res['_source']
    for i in detail:
        print(detail[i])
    return

def retrieve(es, query, origin, genre, yearFrom, yearTo, genreBool=False):
    keys = ""
    if len(query) > 200:
        sen_dic = senSplit(es, query, "stop")
        weights = calWeight(sen_dic)
        keys = selectKeys(weights)
        keys = " ".join(keys)
    else:
        keys = analyzeToStr(es, query, analyzer="standard")
    print( keys )
    if genreBool:
        querybody = generateQuery(keys, origin, genre, yearFrom, yearTo)
    else:
        querybody = generateQuery(keys, origin=origin, yearFrom=yearFrom, yearTo=yearTo)
    
    result = searching(es, my_index, querybody)
    return 

def launch(es, query, origin, genre, yearFrom, yearTo):
    # year range
    yearFiled = input( "Do you want to set the range of Release Year? Y/N: " )
    if 'Y' in yearFiled:
        yearFrom = int( input("please enter from which year, for example: 2000. ") )
        yearTo = int( input("please enter to which year, for example: 2020. ") )
    # Origin/Ethnicity
    originBool = False
    originFiled = input( "Do you want to set certain Origin/Ethnicity? Y/N: " )
    if 'Y' in originFiled:
        origin = input("please enter which origin(s), for example: Hong Kong, American, British...")
        origin = process(origin)
        originBool = True    
    # Genre
    genreBool = False
    genreFiled = input( "Do you want to set certain Genre? Y/N: " )
    if 'Y' in genreFiled:
        genre = input("please enter which genre(s), for example: drama, action, comedy, war, romantic...")
        genre = process(genre)
        genreBool = True
    #query / inputText
    query = input( "please input your query text: " )
    print()
    print()
    #
    retrieve(es, query, origin, genre, yearFrom, yearTo, genreBool)
    
    # see detail of certain article
    detail = input( "Do you want to see the detail about the movie? Y/N: " )
    res = ""
    if 'Y' in detail:
        id_num = int( input("please enter the index of the movie(, for example: 621, 11030...): ") )
        res = es.get(index=my_index, id=id_num)
        showDetail(res)
    return


launch(es, query, origin, genre, yearFrom, yearTo)


Do you want to set the range of Release Year? Y/N: N
Do you want to set certain Origin/Ethnicity? Y/N: N
Do you want to set certain Genre? Y/N: N
please input your query text: In 1941, Joseph "Mac" McConnell, Jr. (Alan Ladd), a private in the Army medical corps near Fitchburg


army private 1941 near alan the ladd a mac joseph fitchburg medical mcconnell in jr corps
find the top  10  most relevant results: 

index:  6350 	 relevant score:  58.098392
Release Year:  1955 	 Origin/Ethnicity:  American
Genre:  biography
Title:  The McConnell Story
Plot:  In 1941, Joseph "Mac" McConnell, Jr. (Alan Ladd), a private in the Army medical corps near Fitchburg ...

index:  15668 	 relevant score:  14.1688
Release Year:  2009 	 Origin/Ethnicity:  American
Genre:  war, science fiction
Title:  Inglourious Basterds
Plot:  In 1941, SS colonel Hans Landa interrogates French dairy farmer Perrier La Padite as to the whereabo ...

index:  1402 	 relevant score:  13.308708
Release Year:  1932 	 Origin/Ethn

In [382]:
# preprocess text string by using certain analyzer
# analyzer = ['standard','simple','whitespace','stop','keyword','pattern','fingerprint']
def analyzeToStr(es, text, analyzer="stop"):
    re = es.indices.analyze(body={"analyzer" : analyzer, "text" : text })
    re = re['tokens']
    tmp = set()
    for it in re:
        tmp.add(it['token'])
    tmp = list(tmp)
    return " ".join(tmp)

print( analyzeToStr(es, test_plot, "standard") )

def tokenization1(inputText):
    analyzer = ['stop'] # stop
    text = [inputText]
    for analyze in analyzer:
        res = es.indices.analyze(body={"analyzer" : analyze,"text" : text})
        #print("======",analyze,"========")
        tokens = []
        for i in res['tokens']:
            #print(i['token'])
            tokens.append( i['token'] )
        #print( txt1 )
        #print("\n")
    return " ".join(tokens)


print( ind_list[:10] )
idx = ind_list[9]
print( idx )
res = es.get(index=my_index, id=idx)
test_plot = res['_source']['Plot']
print(test_plot) 
print()


yearFrom = 1960
yearTo = 2018
queryText = "British Hong Kong Steve Cheng, Victor Tam, Herman Yau"
queryText = "University of Central Arkansas student Chloe Steele"
origin = ["British", "Hong Kong"]
origin = "British Hong Kong"
genre = "unknown comedy"

# https://blog.csdn.net/u013429010/article/details/81746179
# https://www.cnblogs.com/wangkun122/articles/10736507.html
# https://blog.csdn.net/laoyang360/article/details/80468757
# https://blog.csdn.net/y472360651/article/details/76652021
# https://www.cnblogs.com/xiao987334176/p/10130712.html#autoid-1-5-1

#searching(es, my_index, query_body9)

query_body9 = {
  "query": { 
    "bool": { 
      "must": [
        { "multi_match": { "query": queryText, "fields" : searchContent } }  
      ],
      "filter": [ 
        { "match": { 'Origin/Ethnicity':  origin }},
        { "match": { 'Genre':  genre }},
        { "range": { "Release Year":{"gt":yearFrom, "lt":yearTo} }} 
      ]
    }
  }
}

def create_data_packet(s_counter, sentence):
    return {"sentence_id": s_counter, "sentence_text": sentence.replace('\n', '')}
        
analyzer = ['standard','simple','whitespace','stop','keyword','pattern','fingerprint']
text = ["HELLO WORLD. Today is the 2nd day of the week!!!!     it is Monday."]
text = ["HELLO today is A GREAT DAY"]

# res = es.delete_by_query(index=my_index, body=query_body)
#print(res)
# https://blog.csdn.net/u013487601/article/details/103262667

# https://github.com/AmoghM/ElasticSearch-in-Python/blob/master/Chapter9-Analyser.ipynb
# https://zhuanlan.zhihu.com/p/43072517
text1 = "HELLO today is A GREAT DAY"
text2 = "The quick fox jumped and the lazy dog kept snoring"
text3 = "buses, busses, simplicity, cuteness"
es.indices.analyze(body={
  "analyzer" : "english",
  "text" : "Eating an apple a day keeps doctor away"
})

{'tokens': [{'token': 'eat',
   'start_offset': 0,
   'end_offset': 6,
   'type': '<ALPHANUM>',
   'position': 0},
  {'token': 'appl',
   'start_offset': 10,
   'end_offset': 15,
   'type': '<ALPHANUM>',
   'position': 2},
  {'token': 'dai',
   'start_offset': 18,
   'end_offset': 21,
   'type': '<ALPHANUM>',
   'position': 4},
  {'token': 'keep',
   'start_offset': 22,
   'end_offset': 27,
   'type': '<ALPHANUM>',
   'position': 5},
  {'token': 'doctor',
   'start_offset': 28,
   'end_offset': 34,
   'type': '<ALPHANUM>',
   'position': 6},
  {'token': 'awai',
   'start_offset': 35,
   'end_offset': 39,
   'type': '<ALPHANUM>',
   'position': 7}]}