In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
import fasttext
import fasttext.util
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, scan
from elasticsearch_dsl import Search
from elasticsearch_dsl import Q
import operator
import json
import requests
import collections

In [22]:
# Load the fastText model from the local file 'cc.tr.300.bin'. `ft` supplies the
# sentence vectors used further down, both for the indexed product names and
# for the search queries.
# NOTE(review): the file name suggests the pretrained 300-dimensional Turkish
# Common Crawl vectors, which would match the 300-dim `dense_vector` mapping
# declared for the index — confirm.
ft = fasttext.load_model('cc.tr.300.bin')



In [23]:
# Read the sales export, discard the 'Unnamed: 0' column and keep a single
# row per distinct ITEMNAME, then remove rows whose ITEMNAME is missing.
marketsales = (
    pd.read_csv('../logstash/MarketSales.csv')
    .drop(columns='Unnamed: 0')
    .drop_duplicates("ITEMNAME")
)
marketsales = marketsales[marketsales['ITEMNAME'].notna()]

# Lower-case every product name and turn embedded newlines into spaces
# before the names are embedded.
market_items = [
    name.lower().replace("\n", " ")
    for name in marketsales["ITEMNAME"].tolist()
]

# One fastText sentence vector per product, stored as a plain list and kept
# in the same order as `market_items`.
item_vector = [list(ft.get_sentence_vector(name)) for name in market_items]

In [35]:
# Count how many rows the raw (non-deduplicated) CSV holds for each ITEMNAME.
marketsales_df = pd.read_csv('../logstash/MarketSales.csv')
item_counts = marketsales_df.pivot_table(columns='ITEMNAME', aggfunc='size', fill_value=0)

# Sort ascending by count and expose the result as ITEMNAME / count columns.
freq_table = item_counts.sort_values().to_frame(name='count').reset_index()

# Show the first 30 rows out of (at most) the last 100000 rows of the table.
freq_table.tail(100000).head(30)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,ITEMNAME,count
0,8*4 150 ML DEO DİSCOVERY,1
1,ICIM SMART 200ML MUZLU SUT,1
2,ICIM KASAR PEYNIRI 200GR,1
3,HUGGIES YENIBEBEK 3-6KG 50AD.,1
4,HUGGIES MIDI 4-9KG 40 AD. 2012,1
5,HUGGIES DRY NITES SMALL FOR 4_7 GECE KULODU,1
6,HOBBY WAX PARLAK VE KOLAY SEK.YAG BAZLI 100 ML,1
7,HOBBY ULTRA GUCLU KIRMIZI 100 ML,1
8,HOBBY SAMPUAN 700 ML BUGDAY OZLU,1
9,HOBBY SAC SPREYI 250 ML GUCLU TUTUS,1


## __DATA INDEXING__

In [24]:
# Client for the local Elasticsearch node used by the indexing cell.
es = Elasticsearch(['http://localhost:9200'], http_auth=('elastic', 'changeme'))

# Field definitions: the product name as text, plus a 300-dimensional dense
# vector that stores the product's embedding.
field_mappings = {
    "product_name": {"type": "text"},
    "product_vector": {"type": "dense_vector", "dims": 300},
}
es_index = {"mappings": {"properties": field_mappings}}

# Create the index; HTTP 400 responses are ignored instead of raised
# (presumably so that re-running the cell on an existing index does not
# fail — confirm).
es.indices.create(
    index='marketsales_vectors_products',
    body=es_index,
    ignore=[400],
)

def getQuotes():
    """Yield one bulk-index action per product.

    Reads the module-level ``market_items`` (product names) and
    ``item_vector`` (their embeddings), pairing entries by position.

    Yields:
        dict: an action for the ``marketsales_vectors_products`` index with
        the keys ``_index``, ``_id``, ``product_name`` and ``product_vector``.

    Raises:
        IndexError: if ``item_vector`` is shorter than ``market_items``.
    """
    # A deterministic ``_id`` (the item's position) makes re-running the
    # indexing cell overwrite the documents it wrote before. Without an id,
    # Elasticsearch generates a fresh one per action, so every re-run would
    # index another copy of each product and searches would return duplicates.
    for position, name in enumerate(market_items):
        yield {
            "_index": 'marketsales_vectors_products',
            "_id": str(position),
            "product_name": name,
            "product_vector": item_vector[position],
        }
# Send every action produced by getQuotes() to Elasticsearch through the bulk
# helper, using a request timeout of 120. Being the last expression of the
# cell, the helper's return value is displayed as the cell output.
bulk(client=es, actions = getQuotes(), request_timeout = 120)

(9303, [])

## __SEMANTIC SEARCH__

In [40]:
def semantic_search(input_query, client=None, size=999):
    """Rank indexed products by cosine similarity to the query's fastText vector.

    Args:
        input_query (str): Free-text search string. It is lower-cased and its
            newlines are replaced by spaces before embedding, mirroring the
            normalisation applied to the product names when they were indexed.
        client: Optional object exposing an Elasticsearch-compatible
            ``search(index=..., body=..., size=...)`` method. When omitted, a
            client for ``localhost`` is created with the notebook's
            credentials, as before.
        size (int): Maximum number of hits to request. Defaults to 999.

    Returns:
        pandas.DataFrame: columns ``product`` (the hit's ``product_name``) and
        ``score`` (``cosineSimilarity + 1.0`` as computed by the script), in
        the order Elasticsearch returned the hits. Empty when the query is
        blank or nothing matched.
    """
    # Products were embedded from lower-cased, newline-free names; embed the
    # query the same way so both sides live in the same vector space.
    normalized_query = input_query.lower().replace("\n", " ")
    if not normalized_query.strip():
        # Nothing to embed. NOTE(review): a blank string presumably yields an
        # all-zero vector, for which cosine similarity is undefined — confirm.
        return pd.DataFrame({'product': [], 'score': []})

    # Plain Python floats serialise to JSON without relying on the client's
    # handling of numpy scalar types.
    query_embedding = [float(value) for value in ft.get_sentence_vector(normalized_query)]

    script_query = {
        "script_score": {
            "query": {
                "match_all": {}
            },
            "script": {
                # +1.0 shifts the cosine similarity so the score is never negative.
                "source": "cosineSimilarity(params.query_vector, 'product_vector') + 1.0",
                "params": {
                    "query_vector": query_embedding
                }
            }
        }
    }

    if client is None:
        client = Elasticsearch(hosts=["localhost"], http_auth=('elastic', 'changeme'))

    response = client.search(
        index='marketsales_vectors_products',
        body={
            # Only the name is used below; skip returning the stored vector
            # for every hit.
            "_source": ["product_name"],
            "query": script_query
        },
        size=size
    )

    all_hits = response['hits']['hits']
    return pd.DataFrame({
        'product': [hit['_source']['product_name'] for hit in all_hits],
        'score': [hit['_score'] for hit in all_hits],
    })

## __CLASSIC SEARCH__

In [41]:
def classic_search(input_query, client=None, size=999):
    """Run a keyword (``multi_match``) search on ``product_name``.

    Scores are divided by the integer part of the best hit's score, which
    brings the top result into the range [1, 2).

    Args:
        input_query (str): Text to match against ``product_name``.
        client: Optional object exposing an Elasticsearch-compatible
            ``search(index=..., body=..., size=...)`` method. When omitted, a
            client for ``localhost`` is created with the notebook's
            credentials, as before.
        size (int): Maximum number of hits to request. Defaults to 999.

    Returns:
        pandas.DataFrame: columns ``product`` and ``score`` (the scaled
        score), in the order Elasticsearch returned the hits. Empty when
        nothing matched.
    """
    if client is None:
        client = Elasticsearch(hosts=["localhost"], http_auth=('elastic', 'changeme'))

    results = client.search(
        index='marketsales_vectors_products',
        body={
            "query": {
                "multi_match": {
                    "query": input_query,
                    "type": "best_fields",
                    "fields": ["product_name"]
                }
            }
        },
        size=size
    )

    result_hits = results['hits']['hits']
    if not result_hits:
        # No match: there is no top score to scale by.
        return pd.DataFrame({'product': [], 'score': []})

    # Integer part of the best score. int() truncates like the previous
    # string slicing did, but also copes with exponent-formatted floats.
    # A top score below 1 would make the divisor 0, so fall back to 1 and
    # leave such scores unscaled instead of raising ZeroDivisionError.
    divisor = int(result_hits[0]['_score']) or 1

    return pd.DataFrame({
        'product': [hit['_source']['product_name'] for hit in result_hits],
        'score': [hit['_score'] / divisor for hit in result_hits],
    })

In [45]:
# Run the same query through both search strategies.
input_query = "biberiye"
semantic_search_result = semantic_search(input_query)
classic_search_result = classic_search(input_query)

# One print call: each item goes on its own line, and the '\n' item in the
# middle yields the two blank lines that separate the two reports.
print(
    "Semantic Search Results",
    semantic_search_result,
    '\n',
    "Classic Search Results",
    classic_search_result,
    sep='\n',
)

Semantic Search Results
                                   product     score
0                     bagdat biberiye 20gr  1.702160
1              tat biberiye tursusu 370 gr  1.671417
2             tamek biberiye tursusu 370gr  1.643300
3                    berrak biberiye 720ml  1.626514
4            penguen biberiye tursu 300 ml  1.624213
..                                     ...       ...
994         penguen domates salcasi 830 gr  1.396434
995         dimes 1 lt %100 meyve suyu nar  1.396295
996  duru sivi sabun 300 ml yaban mersini   1.396262
997            kebir tereyag 250 gr. tuzlu  1.396209
998                  h.sakir samp.2in1 nar  1.396205

[999 rows x 2 columns]


Classic Search Results
                           product     score
0             bagdat biberiye 20gr  1.087660
1            berrak biberiye 720ml  1.087660
2     tamek biberiye tursusu 370gr  0.995360
3      tat biberiye tursusu 370 gr  0.917500
4    penguen biberiye tursu 300 ml  0.917500
5   berrak biberiye t