In [None]:
!pip install elasticsearch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import json
from tqdm import tqdm
import re
from copy import deepcopy



In [None]:
# Colab-only step: mount Google Drive at /content/drive so that files stored
# there can be opened with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the news corpus from Drive into `data`.
# The encoding is stated explicitly: without it `open()` falls back to the
# platform's locale encoding, which can mis-decode the non-ASCII text.
with open("/content/drive/MyDrive/IR_Spring2022/Copy of IR_data_news_12k.json", 'r', encoding="utf-8") as f:
    data = json.load(f)

In [None]:
# Flatten the corpus into single-field documents: every article contributes two
# entries to `data_tmp`, first its body and then its title, both stored under
# the same "content_title" key.
# `data` is looked up by the string form of 0 .. len(data) - 1.
data_tmp = []
for i in tqdm(range(len(data))):
    doc = data[str(i)]
    data_tmp.extend((
        {"content_title": doc['content']},
        {"content_title": doc['title']},
    ))
    

100%|██████████| 12202/12202 [00:00<00:00, 368681.55it/s]


In [None]:
def load_data_to_elastic(es, data, index_name):
    """Bulk-index ``data`` into ``index_name`` and print the bulk helper's result.

    The document at position ``i`` of ``data`` is sent with ``_id`` ``i + 1``.

    Args:
        es: Client object handed to ``helpers.bulk``.
        data: Sequence of document bodies; must support ``len()`` and integer
            indexing.
        index_name: Name of the target index, used both as each action's
            ``_index`` and as the ``index`` keyword of ``helpers.bulk``.

    Returns:
        None. Whatever ``helpers.bulk`` returns is printed.
    """
    actions = []
    for position in range(len(data)):
        actions.append({
            "_index": index_name,
            "_id": position + 1,  # ids are 1-based
            "_source": data[position],
        })

    outcome = helpers.bulk(es, actions, index=index_name, request_timeout=30)
    print(outcome)

***Section One***

In [None]:
# Index definition for the first spell-checking experiment.
sc_mapping = {
    "settings": {
        "index": {
            "analysis": {
                # Replace U+200C (zero-width non-joiner) with a plain space
                # before tokenization.
                "char_filter": {
                    "zero_width_spaces": {
                        "type": "mapping",
                        "mappings": ["\\u200C=>\\u0020"],
                    },
                },
                # "trigram": standard tokenizer followed by the shingle filter
                # defined below.
                "analyzer": {
                    "trigram": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "char_filter": ["zero_width_spaces"],
                        "filter": ["shingle"],
                    },
                },
                # Word shingles of size 2 to 3.
                "filter": {
                    "shingle": {
                        "type": "shingle",
                        "min_shingle_size": 2,
                        "max_shingle_size": 3,
                    },
                },
            },
        },
    },
    "mappings": {
        "properties": {
            # Single text field, with a "trigram" sub-field analyzed by the
            # custom analyzer above.
            "content_title": {
                "type": "text",
                "fields": {
                    "trigram": {
                        "type": "text",
                        "analyzer": "trigram",
                    },
                },
            },
        },
    },
}

In [None]:
# Name of the Elasticsearch index used by the Section One experiment
sc_index_name = 'ir_test_sc'

In [None]:
# Client shared by every cell below; it targets an Elasticsearch node on
# localhost port 9200 (the node must already be running).
es = Elasticsearch("http://localhost:9200")

In [None]:
# Echo the client so the cell output shows which host it is configured for
es

<Elasticsearch(['http://localhost:9200'])>

In [None]:
# Start from a clean slate: drop the index when it is already present so that
# re-running this cell does not fail on creation.
index_already_present = es.indices.exists(index=sc_index_name)
if index_already_present:
    es.indices.delete(index=sc_index_name)

# Create the index with the settings and mappings held in `sc_mapping`
es.indices.create(body=sc_mapping, index=sc_index_name)

In [None]:
# Bulk-index every entry of `data_tmp` into the Section One index
load_data_to_elastic(es,data_tmp,sc_index_name)

In [None]:
def get_suggestions(text, index_name):
    """Request phrase suggestions for ``text`` from ``index_name``.

    Sends a ``size=0`` search through the module-level ``es`` client whose
    ``suggest`` section holds one phrase suggester named ``simple_phrase``.

    Args:
        text: The phrase to get suggestions for.
        index_name: Index to run the suggester against.

    Returns:
        The search response converted with ``dict()``.
    """
    # Single candidate generator reading from the plain "content_title" field.
    candidate_generator = {
        "min_word_length": 3,
        "field": "content_title",
        "suggest_mode": "always",
        "prefix_length": 2,
    }

    # Each suggestion is checked with a match_phrase query on "content_title";
    # "prune" is switched on.
    collate_check = {
        "query": {
            "source": {
                "match_phrase": {"{{field_name}}": "{{suggestion}}"},
            },
        },
        "params": {"field_name": "content_title"},
        "prune": True,
    }

    phrase_options = {
        "smoothing": {"laplace": {"alpha": 0.5}},
        "field": "content_title.trigram",
        "size": 5,
        "confidence": 1,
        "real_word_error_likelihood": 0.95,
        "max_errors": 3,
        "direct_generator": [candidate_generator],
        "collate": collate_check,
    }

    suggest_body = {
        "text": text,
        "simple_phrase": {"phrase": phrase_options},
    }
    response = es.search(index=index_name, suggest=suggest_body, size=0)
    return dict(response)

In [None]:
# Persian query phrases passed to each suggester variant below.
# NOTE(review): the phrases appear to contain deliberate spelling mistakes so
# that the suggester has something to correct — confirm with the author.
texts = [
    "لیک برتر فوطبال",
    "تورنومنت شش جانبه",
    "طبعیض نژادی",
    "اردوی طیم امیذ",
    "جام ملب های آشیا",
    "کنره سیاسی آمریکا",
    "انغلاب اشلامی ایران",
    "فدراصیون فوتبال بلژیک",
    "لایهه مجلص خبرگان",
    "نحبگان دانشگاهی",
    "نمایند مجل",
    "فضاسازی کازب",
    "صلاح ایرادات شورای تگهبان",
    "تهریف های تاریخی",
    "قلیت های دینی",
    "تدارکان لازم",
]

In [None]:
# Run every query through the Section One suggester and print the whole
# "suggest" section of each response, followed by a separator line.
for text in texts:
    suggest_section = get_suggestions(text, sc_index_name)['suggest']
    print(suggest_section)
    print("========================")

***Section Two***

In [None]:
# Index definition for the second experiment: the "trigram" analyzer plus a
# "reverse" analyzer, each exposed as a sub-field of "content_title".
sc_mapping_v1 = {
    "settings": {
        "index": {
            "analysis": {
                # Replace U+200C (zero-width non-joiner) with a plain space
                # before tokenization.
                "char_filter": {
                    "zero_width_spaces": {
                        "type": "mapping",
                        "mappings": ["\\u200C=>\\u0020"],
                    },
                },
                "analyzer": {
                    # Standard tokenizer followed by the shingle filter below.
                    "trigram": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "char_filter": ["zero_width_spaces"],
                        "filter": ["shingle"],
                    },
                    # Standard tokenizer followed by the "reverse" token filter.
                    "reverse": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "char_filter": ["zero_width_spaces"],
                        "filter": ["reverse"],
                    },
                },
                # Word shingles of size 2 to 3.
                "filter": {
                    "shingle": {
                        "type": "shingle",
                        "min_shingle_size": 2,
                        "max_shingle_size": 3,
                    },
                },
            },
        },
    },
    "mappings": {
        "properties": {
            "content_title": {
                "type": "text",
                "fields": {
                    "trigram": {
                        "type": "text",
                        "analyzer": "trigram",
                    },
                    "reverse": {
                        "type": "text",
                        "analyzer": "reverse",
                    },
                },
            },
        },
    },
}

In [None]:
# Name of the Elasticsearch index used by the second experiment
sc_index_name_v1 = 'ir_test_sc_v1'

In [None]:
# Start from a clean slate: drop the index when it is already present so that
# re-running this cell does not fail on creation.
index_already_present = es.indices.exists(index=sc_index_name_v1)
if index_already_present:
    es.indices.delete(index=sc_index_name_v1)

# Create the index with the settings and mappings held in `sc_mapping_v1`
es.indices.create(body=sc_mapping_v1, index=sc_index_name_v1)

In [None]:
# Bulk-index every entry of `data_tmp` into the second experiment's index
load_data_to_elastic(es,data_tmp,sc_index_name_v1)

In [None]:
def get_suggestions_v1(text , index_name):
    """Request phrase suggestions for ``text`` using two candidate generators.

    Sends a ``size=0`` search through the module-level ``es`` client whose
    ``suggest`` section holds one phrase suggester named ``simple_phrase``.
    Candidates come from the ``content_title.trigram`` field and from the
    ``content_title.reverse`` field; the latter generator is wrapped in the
    ``reverse`` analyzer as pre- and post-filter.

    Args:
        text: The phrase to get suggestions for.
        index_name: Index to run the suggester against; it must define the
            ``reverse`` analyzer and both sub-fields named above.

    Returns:
        The search response converted with ``dict()``.
    """
    resp = es.search(index=index_name,suggest={
        "text": text,
        "simple_phrase": {
            "phrase": {
                "smoothing" : {
                    "laplace" : {
                        "alpha" : 0.7
                    }
                },
                "field": "content_title.trigram",
                "size": 5,
                "confidence": 1,
                "real_word_error_likelihood": 0.95,
                "max_errors" : 3,
                "direct_generator": [
                    # Generator 1: candidates from the trigram sub-field.
                    {
                        "field": "content_title.trigram",
                        "suggest_mode": "always",
                        "min_word_length":3,
                        "prefix_length":1
                    },
                    # Generator 2: candidates from the reversed-token sub-field.
                    {
                      "field" : "content_title.reverse",
                      "suggest_mode" : "popular",
                      "pre_filter" : "reverse",
                      "post_filter" : "reverse",
                      "min_word_length":3,
                      "prefix_length":1
                    }
                ],
                # Each suggestion is checked with a match_phrase query on
                # "content_title"; "prune" is switched on.
                "collate": {
                    "query": {
                        "source" : {
                            "match_phrase": {
                            "{{field_name}}" : "{{suggestion}}"
                            }
                        }
                    },
                    "params": {"field_name" : "content_title"},
                    "prune": True
                }
            }
        }
    },size=0)
    return dict(resp)

In [None]:
# Run every query through the second suggester and print only the
# "simple_phrase" entries of each response, followed by a separator line.
for text in texts:
    phrase_entries = get_suggestions_v1(text, sc_index_name_v1)['suggest']['simple_phrase']
    print(phrase_entries)
    print("========================")

***Section Three***

In [None]:
# Index definition for the third experiment: as the second one, with a
# "decimal_digit" filter added to both analyzers and an extra file-backed
# "synonym" analyzer.
sc_mapping_v2 = {
    "settings": {
        "index": {
            "analysis": {
                # Replace U+200C (zero-width non-joiner) with a plain space
                # before tokenization.
                "char_filter": {
                    "zero_width_spaces": {
                        "type": "mapping",
                        "mappings": ["\\u200C=>\\u0020"],
                    },
                },
                "analyzer": {
                    "trigram": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "char_filter": ["zero_width_spaces"],
                        "filter": ["decimal_digit", "shingle"],
                    },
                    "reverse": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "char_filter": ["zero_width_spaces"],
                        "filter": ["decimal_digit", "reverse"],
                    },
                    # Not attached to any field in the mappings below.
                    "synonym": {
                        "tokenizer": "whitespace",
                        "filter": ["synonym"],
                    },
                },
                "filter": {
                    # Word shingles of size 2 to 3.
                    "shingle": {
                        "type": "shingle",
                        "min_shingle_size": 2,
                        "max_shingle_size": 3,
                    },
                    # Synonym rules are read from a file.
                    # NOTE(review): the path is presumably resolved by the
                    # Elasticsearch node relative to its config directory, so
                    # the file must exist there — confirm.
                    "synonym": {
                        "type": "synonym",
                        "synonyms_path": "analysis/synonyms.txt",
                    },
                },
            },
        },
    },
    "mappings": {
        "properties": {
            "content_title": {
                "type": "text",
                "fields": {
                    "trigram": {
                        "type": "text",
                        "analyzer": "trigram",
                    },
                    "reverse": {
                        "type": "text",
                        "analyzer": "reverse",
                    },
                },
            },
        },
    },
}

In [None]:
# Name of the Elasticsearch index used by the third experiment
sc_index_name_v2 = 'ir_test_sc_v2'

In [None]:
# Start from a clean slate: drop the index when it is already present so that
# re-running this cell does not fail on creation.
index_already_present = es.indices.exists(index=sc_index_name_v2)
if index_already_present:
    es.indices.delete(index=sc_index_name_v2)

# Create the index with the settings and mappings held in `sc_mapping_v2`
es.indices.create(body=sc_mapping_v2, index=sc_index_name_v2)

In [None]:
# Bulk-index every entry of `data_tmp` into the third experiment's index
load_data_to_elastic(es,data_tmp,sc_index_name_v2)

In [None]:
def get_suggestions_v2(text, index_name):
    """Request phrase suggestions for ``text`` using three candidate generators.

    Sends a ``size=0`` search through the module-level ``es`` client whose
    ``suggest`` section holds one phrase suggester named ``simple_phrase``.
    Up to 20 suggestions are requested and no collate query is attached.

    Args:
        text: The phrase to get suggestions for.
        index_name: Index to run the suggester against; it must define the
            ``reverse`` and ``synonym`` analyzers referenced by the generators.

    Returns:
        The search response converted with ``dict()``.
    """
    # Candidates from the trigram sub-field.
    trigram_generator = {
        "field": "content_title.trigram",
        "suggest_mode": "always",
        "min_word_length": 3,
        "prefix_length": 1,
    }
    # Candidates from the reversed-token sub-field, wrapped in the "reverse"
    # analyzer as pre- and post-filter.
    reverse_generator = {
        "field": "content_title.reverse",
        "suggest_mode": "always",
        "pre_filter": "reverse",
        "post_filter": "reverse",
        "min_word_length": 3,
        "prefix_length": 1,
    }
    # Candidates from the plain field, post-filtered by the "synonym" analyzer.
    synonym_generator = {
        "field": "content_title",
        "suggest_mode": "popular",
        "post_filter": "synonym",
        "prefix_length": 3,
    }

    phrase_options = {
        "smoothing": {"laplace": {"alpha": 0.7}},
        "field": "content_title.trigram",
        "size": 20,
        "confidence": 1,
        "real_word_error_likelihood": 0.95,
        "max_errors": 3,
        "direct_generator": [
            trigram_generator,
            reverse_generator,
            synonym_generator,
        ],
    }

    suggest_body = {
        "text": text,
        "simple_phrase": {"phrase": phrase_options},
    }
    response = es.search(index=index_name, suggest=suggest_body, size=0)
    return dict(response)

In [None]:
# Run every query through the third suggester and print only the
# "simple_phrase" entries of each response, followed by a separator line.
for text in texts:
    phrase_entries = get_suggestions_v2(text, sc_index_name_v2)['suggest']['simple_phrase']
    print(phrase_entries)
    print("====================================")