In [57]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
import fasttext
import fasttext.util
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, scan
from elasticsearch_dsl import Search
from elasticsearch_dsl import Q
import operator
import json
import requests
import collections

### __READ DATA__

In [21]:
marketsales_df = pd.read_csv('/Users/onursahil/Documents/Developer/semantic_search_engine/logstash/MarketSales.csv')
marketsales_df = marketsales_df[marketsales_df['ITEMNAME'].notna()]
item_count = dict(collections.Counter(marketsales_df["ITEMNAME"].tolist()))
sorted_item_count = {k: v for k, v in sorted(item_count.items(), key=operator.itemgetter(1), reverse=True)}
del marketsales_df['Unnamed: 0']
marketsales_df.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,ID,ITEMCODE,ITEMNAME,FICHENO,DATE_,AMOUNT,PRICE,LINENETTOTAL,LINENET,BRANCHNR,...,CLIENTCODE,CLIENTNAME,BRANDCODE,BRAND,CATEGORY_NAME1,CATEGORY_NAME2,CATEGORY_NAME3,STARTDATE,ENDDATE,GENDER
0,11738.0,5863.0,SPRITE 1 LT LIMON AROMALI GAZOZ,18456,2017-01-07 00:00:00,1.0,2.0,2.0,1.85,52.0,...,467369,Sercan KIZILOK,156.0,SPRİTE,İÇECEK,GAZLI İÇECEK,GAZOZ,2017-01-08 16:16:11,2017-01-08 16:17:13,E
1,10537.0,8.0,TOZ SEKER,18105,2017-01-06 00:00:00,5.0,2.65,13.25,12.27,8.0,...,131464,İsmet ŞINGIR,,,İÇECEK,ÇAY KAHVE,SEKER TATLANDIRICI,2017-01-07 11:04:34,2017-01-07 11:05:37,E
2,11335.0,5979.0,FALIM SAKIZ 5LI NANE,18350,2017-01-03 00:00:00,1.0,0.4,0.4,0.37,40.0,...,656969,Yağız KUBAL,300.0,FALIM,GIDA,SAKIZ SEKERLEME,SAKIZ,2017-01-04 14:00:03,2017-01-04 14:01:01,E
3,11336.0,5979.0,FALIM SAKIZ 5LI NANE,18350,2017-01-03 00:00:00,1.0,0.4,0.4,0.37,40.0,...,656969,Yağız KUBAL,300.0,FALIM,GIDA,SAKIZ SEKERLEME,SAKIZ,2017-01-04 14:00:03,2017-01-04 14:01:01,E
4,10115.0,12808.0,FALIM SAKIZ 5LI CILEK,18005,2017-01-05 00:00:00,1.0,0.4,0.4,0.37,41.0,...,447336,Habibe AYSAN,300.0,FALIM,GIDA,SAKIZ SEKERLEME,SAKIZ,2017-01-06 14:00:30,2017-01-06 14:01:03,K


In [174]:
freq_table = marketsales_df.pivot_table(columns='ITEMNAME', aggfunc='size', fill_value=0).sort_values().to_frame()
freq_table.columns = ['count']
freq_table = freq_table.reset_index()
freq_table[2000:3000]

Unnamed: 0,ITEMNAME,count
2000,POPIFIKS STANDART HASTABEZI 11,3
2001,EQUAL ERKEK DEO SPREY 150ML,3
2002,"WELLA TUP SAC BOYASI 4,6 KIZIL VIYOLE",3
2003,AXE 250 ML AFRICA DUS JELI,3
2004,WELLA PRO-SERIES KOPUK 250ML MAXIMUM GUCLU,3
...,...,...
2995,ELIDOR 700ML SAMP.+200 ML SAC KREMI NEMLENDIRICI,5
2996,ELIDOR S.KREMI 700 ML RENK KORUYUCU,5
2997,BAGDAT KISNIS 40GR,5
2998,LIPTON PIRAMIT YESİL CAY 20LI NANE FERAHLIGI,5


In [22]:
marketsales_df.ITEMNAME.unique()

array(['SPRITE 1 LT LIMON AROMALI GAZOZ', 'TOZ SEKER',
       'FALIM SAKIZ 5LI NANE', ..., 'ORAL-B D.F.COCUK STAGES 3',
       'WEE BABY FIRCA SETI', 'YAREN ARTCRAFT ZEN CAY TAKIMI RENKLI'],
      dtype=object)

### __VECTOR EMBEDDINGS__

#### Fasttext Turkish sentence vector embedding example

In [14]:
# Load the fasttext model
ft = fasttext.load_model('cc.tr.300.bin')
ft



<fasttext.FastText._FastText at 0x7ffaafd39a60>

In [15]:
# Example sentence vector embedding
example_embedding = ft.get_sentence_vector('sprite 1 lt limon aromali gazoz')
print("Length of the vector: ", len(example_embedding))
list(example_embedding[:5])

Length of the vector:  300


[-0.005404768, 0.044188257, -0.0042387573, 0.07476722, 0.008084468]

#### Get unique items

#### Lower case all text

#### Remove newline characters from text

#### Create sentence vector and stack them in a list

In [16]:
market_items = list(marketsales_df.ITEMNAME.unique())
market_items = [item.lower() for item in market_items]
item_vector = []
for i in range(len(market_items)):
    if "\n" in market_items[i]:
        market_items[i] = market_items[i].replace("\n", " ")
    vector = ft.get_sentence_vector(market_items[i])
    item_vector.append(list(vector))

### __ELASTICSEARCH BULK INDEXING SENTENCE VECTORS__

In [17]:
es = Elasticsearch(['http://localhost:9200'], http_auth=('elastic', 'changeme'))

es_index = {
    "mappings": {
      "properties": {
        "message_text": {
          "type": "text"
        },
        "message_vector": {
          "type": "dense_vector",
          "dims": 300
        }
      }
    }
}

es.indices.create(index='marketsales_vectors', body=es_index, ignore=[400])

def getQuotes():
    for i in range(len(market_items)):
        yield {
            "_index": 'marketsales_vectors',
            "message_text" : market_items[i],
            "message_vector" : item_vector[i]
         }
bulk(client=es, actions = getQuotes(), request_timeout = 120)

(9303, [])

### __ELASTICSEARCH SEMANTIC QUERY SEARCH USING COSINE SIMILARITY__

In [175]:
input_query = "sallama cay"
query_embedding = list(ft.get_sentence_vector(input_query))

script_query = {
        "script_score": {
            "query": {
                "match_all": {}
            },
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'message_vector') + 1.0",
                "params": {
                    "query_vector": query_embedding
                }
            }
        }
    }



client = Elasticsearch(hosts=["localhost"], http_auth=('elastic', 'changeme'))

response = client.search(
    index='marketsales_vectors',
    body={
        "query": script_query
    },
    size=999
)

all_hits = response['hits']['hits']
# all_hits
for i in range(len(all_hits[:10])):
    score = all_hits[i]['_score']
    result_text = all_hits[i]['_source']
    result_text = result_text['message_text']
    vector_list = all_hits[i]['_source']
    vector_list = vector_list['message_vector']
    print("Text: " + result_text + "\n" + "Score: " + str(score) + "\n" + "Vector list: " + str(vector_list[:10]) + "\n\n")

Text: lipton demlik kusburnu poset cay 
Score: 1.7417022
Vector list: [-0.0311567485332489, 0.02950895205140114, -0.02347765862941742, 0.04442063719034195, 0.015275533311069012, -0.041355155408382416, 0.03210773691534996, -0.011130946688354015, -0.00044403152423910797, 0.025470400229096413]


Text: dogus karadeni̇z demlik poset cay 160 ad
Score: 1.6929257
Vector list: [-0.03983846306800842, 0.022559067234396935, -0.03428309038281441, 0.008995119482278824, 0.0007876456365920603, -0.0182204470038414, 0.007209262810647488, -0.030768129974603653, -0.025193659588694572, -0.005843133199959993]


Text: dogadan yesil cay balli 20 suzen cay
Score: 1.692445
Vector list: [-0.07033774256706238, 0.03250548988580704, -0.014419568702578545, 0.03177532181143761, 0.025982819497585297, -0.014540744945406914, -0.013237175531685352, -0.028703510761260986, 0.005484489258378744, 0.009685290046036243]


Text: lipton piramit yesi̇l cay 20li nane ferahligi
Score: 1.6818882
Vector list: [-0.030682513490319252, 

### __ELASTICSEARCH BM25 QUERY SEARCH__

In [176]:
client = Elasticsearch(hosts=["localhost"], http_auth=('elastic', 'changeme'))
results = client.search(
        index='marketsales',
        body={
          "query": {
            "multi_match" : {
              "query" : "sallama cay",
              "type": "best_fields",
              "fields" : ["FICHENO"] 
            }
          }
        },
        size=999
    )

result_hits = results['hits']['hits']
score = result_hits[0]['_score']
pos = str(score).find(".")
score_first = int(str(score)[:pos])
for i in range(len(result_hits[800:900])):
    score = result_hits[i]['_score']
    result_text = result_hits[i]['_source']
    result_text = result_text['FICHENO']
    print("Text: " + result_text + "\n" + "Score: " + str(score / score_first) + "\n")

Text: DOGUS FILIZ CAY 1 KG +TOMURCUK CAY
Score: 1.01638488

Text: DOGUS FILIZ CAY 1 KG +TOMURCUK CAY
Score: 1.01638488

Text: DOGUS FILIZ CAY 1 KG +TOMURCUK CAY
Score: 1.01638488

Text: DOGUS FILIZ CAY 1 KG +TOMURCUK CAY
Score: 1.01638488

Text: DOGUS FILIZ CAY 1 KG +TOMURCUK CAY
Score: 1.01638488

Text: DOGUS FILIZ CAY 1 KG +TOMURCUK CAY
Score: 1.01638488

Text: DOGUS FILIZ CAY 1 KG +TOMURCUK CAY
Score: 1.01638488

Text: DOGUS FILIZ CAY 1 KG +TOMURCUK CAY
Score: 1.01638488

Text: DOGUS FILIZ CAY 1 KG +TOMURCUK CAY
Score: 1.01638488

Text: DOGUS FILIZ CAY 1 KG +TOMURCUK CAY
Score: 1.01638488

Text: DOGUS FILIZ CAY 1 KG +TOMURCUK CAY
Score: 1.01638488

Text: DOGUS FILIZ CAY 1 KG +TOMURCUK CAY
Score: 1.01638488

Text: DOGUS FILIZ CAY 1 KG +TOMURCUK CAY
Score: 1.01638488

Text: DOGUS FILIZ CAY 1 KG +TOMURCUK CAY
Score: 1.01638488

Text: DOGUS FILIZ CAY 1 KG +TOMURCUK CAY
Score: 1.01638488

Text: DOGUS FILIZ CAY 1 KG +TOMURCUK CAY
Score: 1.01638488

Text: DOGUS FILIZ CAY 1 KG +TOMURCUK CAY

In [78]:
client = Elasticsearch(['http://localhost:9200'], http_auth=('elastic', 'changeme'))
search = Search(index='marketsales', using=client)
q = Q("multi_match", query='isirgan otu', fields=["FICHENO"])
s = search.query(q)
results_df = pd.DataFrame((d.to_dict() for d in s.scan()))
# results_df = results_df.drop_duplicates(subset='FICHENO')
results_df

Unnamed: 0,UNENETTOTAL,ENDDATE,LONGITUDE,STARTDATE,CATEGORY_NAME2,BRAND,DATE_,PRICE,LATITUDE,BRANCH,...,SALESMAN,REGION,ITEMCODE,host,FICHENO,BRANCHNR,column27,ITEMNAME,CLIENTCODE,column26
0,1.0,2017-01-20 18:15:57,37.8667,,MEYVE SEBZE,A25,38481,1.0,İç Anadolu,53.0,...,Konya Subesi,Konya,96955.0,bc774aa9699f,DERE OTU,0.93,K,5493.0,32.4833,2017-01-20 18:19:20
1,1.0,2017-01-06 15:24:10,38.4189,,MEYVE SEBZE,A25,17192,1.0,Ege,41.0,...,İzmir Subesi,İzmir,6294.0,bc774aa9699f,DERE OTU,0.93,K,5493.0,27.1287,2017-01-06 15:24:44
2,1.0,2017-01-20 18:51:30,41.0053,,MEYVE SEBZE,A25,38762,1.0,Marmara,40.0,...,İstanbul Subesi,İstanbul,98699.0,bc774aa9699f,DERE OTU,0.93,K,5493.0,28.977,2017-01-20 18:53:15
3,1.0,2017-01-20 17:06:58,37.1591,,MEYVE SEBZE,A25,38733,2.0,Güneydoğu Anadolu,71.0,...,Şanlıurfa Subesi,Şanlıurfa,98413.0,bc774aa9699f,DERE OTU,1.85,K,5493.0,38.7969,2017-01-20 17:08:17
4,1.0,2017-01-20 16:21:36,39.0626,,MEYVE SEBZE,A25,38722,1.0,Doğu Anadolu,17.0,...,Bingöl Subesi,Bingöl,98354.0,bc774aa9699f,DERE OTU,0.93,K,5493.0,40.7696,2017-01-20 16:22:18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1321,1.0,2017-03-31 18:30:43,37.7765,,MEYVE SEBZE,A25,155048,1.0,Ege,25.0,...,Denizli Subesi,Denizli,598934.0,bc774aa9699f,DERE OTU,0.93,E,5493.0,29.0864,2017-03-31 18:31:10
1322,1.0,2017-03-31 11:18:31,39.7767,,MEYVE SEBZE,A25,155363,1.0,İç Anadolu,32.0,...,Eskişehir Subesi,Eskişehir,599878.0,bc774aa9699f,DERE OTU,0.93,K,5493.0,30.5206,2017-03-31 11:19:02
1323,1.0,2017-03-31 11:21:13,41.4564,,MEYVE SEBZE,A25,155364,1.0,Karadeniz,81.0,...,Zonguldak Subesi,Zonguldak,599886.0,bc774aa9699f,DERE OTU,0.93,E,5493.0,31.7987,2017-03-31 11:21:38
1324,1.0,2017-03-31 15:45:20,40.65,,MEYVE SEBZE,A25,155423,1.0,Marmara,79.0,...,Yalova Subesi,Yalova,600095.0,bc774aa9699f,DERE OTU,0.93,E,5493.0,29.2667,2017-03-31 15:46:15


In [40]:
results_df.columns

Index(['UNENETTOTAL', 'ENDDATE', 'LONGITUDE', 'STARTDATE', 'CATEGORY_NAME2',
       'BRAND', 'DATE_', 'PRICE', 'LATITUDE', 'BRANCH', 'CATEGORY_NAME3',
       'path', 'UNENET', 'CATEGORY_NAME1', 'AMOUNT', 'BRANDCODE', 'CLIENTNAME',
       'message', 'ID', '@timestamp', 'CITY', '@version', 'SALESMAN', 'REGION',
       'ITEMCODE', 'host', 'FICHENO', 'BRANCHNR', 'column27', 'ITEMNAME',
       'CLIENTCODE', 'column26'],
      dtype='object')

#### __ELASTICSEARCH BASIC / ADVANCED QUERY TYPES__

In [54]:
# Basic Query - Limited Amount of Result
query_body = {
    "query": {
        "bool": {
            "must": {
                "match": {      
                    "FICHENO": "sakiz"
                }
            }
        }
    }
}
elastic_client = Elasticsearch(hosts=["localhost"], http_auth=('elastic', 'changeme'), size='999')
result = elastic_client.search(index="marketsales", body=query_body)
all_hits = result['hits']['hits']
print(all_hits)

In [51]:
# Advanced Query - Unlimited Amount of Result - Multi_Match + Multi_Field Result
client = Elasticsearch(['http://localhost:9200'], http_auth=('elastic', 'changeme'))
search = Search(index='marketsales', using=client)
q = Q("multi_match", query='baharat', fields=["FICHENO"])
s = search.query(q)
results_df = pd.DataFrame((d.to_dict() for d in s.scan()))
results_df

#### Bert-Base-Turkish-Cased Model Sentence Vector Embedding

In [325]:
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")
model = AutoModel.from_pretrained("dbmdz/bert-base-turkish-cased")

item_vector_bert = []
for item in market_items:
    inputs = tokenizer(item, return_tensors="pt")
    outputs = model(**inputs)
    last_hidden_states = outputs[0].tolist()
    vector = last_hidden_states[0][0]
    item_vector_bert.append(vector)

In [330]:
es = Elasticsearch(['http://localhost:9200'], http_auth=('elastic', 'changeme'))

es_index = {
    "mappings": {
      "properties": {
        "message_text": {
          "type": "text"
        },
        "message_vector": {
          "type": "dense_vector",
          "dims": 768
        }
      }
    }
}

es.indices.create(index='marketsales_vectors_bert', body=es_index, ignore=[400])

def getQuotes():
    for i in range(len(market_items)):
        yield {
            "_index": 'marketsales_vectors_bert',
            "message_text" : market_items[i],
            "message_vector" : item_vector_bert[i]
         }
bulk(client=es, actions = getQuotes(), request_timeout = 120)

(9303, [])

In [347]:
input_query = "dulasik deterjani"
inputs = tokenizer(input_query, return_tensors="pt")
outputs = model(**inputs)
last_hidden_states = outputs[0].tolist()
vector = last_hidden_states[0][0]

script_query = {
        "script_score": {
            "query": {
                "match_all": {}
            },
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'message_vector') + 1.0",
                "params": {
                    "query_vector": vector
                }
            }
        }
    }

client = Elasticsearch(hosts=["localhost"], http_auth=('elastic', 'changeme'))

response = client.search(
    index='marketsales_vectors_bert',
    body={
        "query": script_query
    }
)

all_hits = response['hits']['hits']
for i in range(len(all_hits)):
    result_text = all_hits[i]['_source']
    result_text = result_text['message_text']
    vector_list = all_hits[i]['_source']
    vector_list = vector_list['message_vector']
    print(result_text, vector_list[:10], "\n")

yedigun 250 ml cam amasya ruzgari [-0.34083807468414307, 1.1891112327575684, 0.22604696452617645, 0.19006207585334778, 2.008951425552368, -0.009400218725204468, -0.523705244064331, 0.5507699847221375, 0.1187591552734375, 0.3300856351852417] 

komili islak temizlik havlusu [-0.5299723744392395, 1.0744775533676147, 0.1541150063276291, 0.09172935783863068, 2.4276766777038574, -0.05137056112289429, -0.4042316675186157, 0.5230587720870972, 0.46855705976486206, 0.16884171962738037] 

duru sivi sabun 300 ml yaban mersini  [-0.44106411933898926, 1.1739500761032104, 0.2763999104499817, -0.1496976912021637, 2.063589572906494, 0.11687879264354706, -0.5081267356872559, 0.9592832326889038, 0.25482451915740967, 0.055020250380039215] 

sera firin torbasi [-0.20961275696754456, 1.0348691940307617, 0.2875964045524597, -0.12037423253059387, 2.3430209159851074, 0.17536422610282898, -0.47047317028045654, 0.6014859080314636, 0.1761721521615982, 0.23161175847053528] 

vileda sapli super paspas [-0.353038847