In [None]:
from google.colab import drive


In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# NOTE(review): recursive, forced delete of the Drive mount point. If Drive is still
# mounted at /content/drive this would presumably reach the mounted files as well --
# confirm the drive is unmounted before running this cell.
!rm -rf /content/drive

In [None]:
drive.flush_and_unmount()


Drive not mounted, so nothing to flush and unmount.


### Preprocessing
Use the baseline code to process the principal data.

In [None]:
!pip install parsivar
!pip install git+https://github.com/RoboEpics/roboepics-client.git

Collecting parsivar
  Downloading parsivar-0.2.3.tar.gz (36.2 MB)
[K     |████████████████████████████████| 36.2 MB 52 kB/s 
[?25hCollecting nltk==3.4.5
  Downloading nltk-3.4.5.zip (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 17.8 MB/s 
Building wheels for collected packages: parsivar, nltk
  Building wheel for parsivar (setup.py) ... [?25l[?25hdone
  Created wheel for parsivar: filename=parsivar-0.2.3-py3-none-any.whl size=36492971 sha256=4f5cdfb5bd7108cf9c20b9adff54e1dc26627b505dc6a27eee36c8427953ca3a
  Stored in directory: /root/.cache/pip/wheels/ae/67/7a/49cbf08f64d3f76a26eceaf0e481a40e233f05d4356875cbed
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.4.5-py3-none-any.whl size=1449923 sha256=a85071938ec609de045411987c81531cd33f0362d8bc8725e53fbc90b419cd10
  Stored in directory: /root/.cache/pip/wheels/48/8b/7f/473521e0c731c6566d631b281f323842bbda9bd819eb9a3ead
Successfully built parsivar nltk
Installing collecte

In [None]:
from roboepics_client.roboepics_client import RoboEpicsClient

problem_id = 4
problem_enter_id = None  # fill this value with your id

# roboepics_client = RoboEpicsClient(problem_id, problem_enter_id)

In [None]:
!gdown --id 1uYwzBe8nLhOQ2Q3rCScEXvljLkQqJrHc
!7z x data.7z
!ls


Downloading...
From: https://drive.google.com/uc?id=1uYwzBe8nLhOQ2Q3rCScEXvljLkQqJrHc
To: /content/data.7z
295MB [00:01, 176MB/s]

7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,4 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan         1 file, 295109903 bytes (282 MiB)

Extracting archive: data.7z
--
Path = data.7z
Type = 7z
Physical Size = 295109903
Headers Size = 309
Method = LZMA2:24
Solid = +
Blocks = 2

  0%      0% 1 - data/base_products.json                                  1% 1 - data/base_products.json                                  2% 1 - data/base_products.json                                  3% 1 - data/base_products

In [None]:
from __future__ import unicode_literals

import collections
import gc
import json
import re
import os

import numpy as np
import pandas as pd
import parsivar
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [None]:
def read_json(path, n_lines_to_read=None):
    """
    Lazily read a JSON-lines file, yielding one decoded object per line
    (generator function).

    Use this when the whole file does not fit in memory.

    Args:
        path: path of a text file holding one JSON document per line.
        n_lines_to_read: stop once this many lines have been consumed;
            ``None`` reads the whole file.

    Yields:
        The object decoded from each non-blank line.
    """

    # The data contains Persian text, so do not depend on the platform default encoding.
    with open(path, encoding="utf-8") as f:
        for i, line in enumerate(tqdm(f)):
            if n_lines_to_read == i:
                break
            if not line.strip():
                # A blank line is not a JSON document; skip it instead of crashing.
                continue
            yield json.loads(line)

In [None]:
parsivar_normalizer = parsivar.Normalizer(statistical_space_correction=True)

char_mappings = {
    "٥": "5",
    "А": "a",
    "В": "b",
    "Е": "e",
    "Н": "h",
    "Р": "P",
    "С": "C",
    "Т": "T",
    "а": "a",
    "г": "r",
    "е": "e",
    "к": "k",
    "м": "m",
    "о": "o",
    "р": "p",
    "ڈ": "د",
    "ڇ": "چ",
    # Persian numbers (will be raplaced by english one)
    "۰": "0",
    "۱": "1",
    "۲": "2",
    "۳": "3",
    "۴": "4",
    "۵": "5",
    "۶": "6",
    "۷": "7",
    "۸": "8",
    "۹": "9",
    ".": ".",
    # Arabic numbers (will be raplaced by english one)
    "٠": "0",
    "١": "1",
    "٢": "2",
    "٣": "3",
    "٤": "4",
    "٥": "5",
    "٦": "6",
    "٧": "7",
    "٨": "8",
    "٩": "9",
    # Special Arabic Characters (will be replaced by persian one)
    "ك": "ک",
    "ى": "ی",
    "ي": "ی",
    "ؤ": "و",
    "ئ": "ی",
    "إ": "ا",
    "أ": "ا",
    "آ": "ا",
    "ة": "ه",
    "ء": "ی",
    # French alphabet (will be raplaced by english one)
    "à": "a",
    "ä": "a",
    "ç": "c",
    "é": "e",
    "è": "e",
    "ê": "e",
    "ë": "e",
    "î": "i",
    "ï": "i",
    "ô": "o",
    "ù": "u",
    "û": "u",
    "ü": "u",
    # Comma (will be replaced by a dot, for floating point numbers)
    ",": ".",
    # Ampersand (will be replaced by the word "and")
    "&": " and ",
    # Vowels (will be removed)
    "ّ": "",  # tashdid
    "َ": "",  # a
    "ِ": "",  # e
    "ُ": "",  # o
    "ـ": "",  # tatvil
    # Spaces
    "‍": "",  # 0x9E -> ZERO WIDTH JOINER
    "‌": " ",  # 0x9D -> ZERO WIDTH NON-JOINER
    # Arabic Presentation Forms-A (will be replaced by persian one)
    "ﭐ": "ا",
    "ﭑ": "ا",
    "ﭖ": "پ",
    "ﭗ": "پ",
    "ﭘ": "پ",
    "ﭙ": "پ",
    "ﭞ": "ت",
    "ﭟ": "ت",
    "ﭠ": "ت",
    "ﭡ": "ت",
    "ﭺ": "چ",
    "ﭻ": "چ",
    "ﭼ": "چ",
    "ﭽ": "چ",
    "ﮊ": "ژ",
    "ﮋ": "ژ",
    "ﮎ": "ک",
    "ﮏ": "ک",
    "ﮐ": "ک",
    "ﮑ": "ک",
    "ﮒ": "گ",
    "ﮓ": "گ",
    "ﮔ": "گ",
    "ﮕ": "گ",
    "ﮤ": "ه",
    "ﮥ": "ه",
    "ﮦ": "ه",
    "ﮪ": "ه",
    "ﮫ": "ه",
    "ﮬ": "ه",
    "ﮭ": "ه",
    "ﮮ": "ی",
    "ﮯ": "ی",
    "ﮰ": "ی",
    "ﮱ": "ی",
    "ﯼ": "ی",
    "ﯽ": "ی",
    "ﯾ": "ی",
    "ﯿ": "ی",
    # Arabic Presentation Forms-B (will be removed)
    "ﹰ": "",
    "ﹱ": "",
    "ﹲ": "",
    "ﹳ": "",
    "ﹴ": "",
    "﹵": "",
    "ﹶ": "",
    "ﹷ": "",
    "ﹸ": "",
    "ﹹ": "",
    "ﹺ": "",
    "ﹻ": "",
    "ﹼ": "",
    "ﹽ": "",
    "ﹾ": "",
    "ﹿ": "",
    # Arabic Presentation Forms-B (will be replaced by persian one)
    "ﺀ": "ی",
    "ﺁ": "ا",
    "ﺂ": "ا",
    "ﺃ": "ا",
    "ﺄ": "ا",
    "ﺅ": "و",
    "ﺆ": "و",
    "ﺇ": "ا",
    "ﺈ": "ا",
    "ﺉ": "ی",
    "ﺊ": "ی",
    "ﺋ": "ی",
    "ﺌ": "ی",
    "ﺍ": "ا",
    "ﺎ": "ا",
    "ﺏ": "ب",
    "ﺐ": "ب",
    "ﺑ": "ب",
    "ﺒ": "ب",
    "ﺓ": "ه",
    "ﺔ": "ه",
    "ﺕ": "ت",
    "ﺖ": "ت",
    "ﺗ": "ت",
    "ﺘ": "ت",
    "ﺙ": "ث",
    "ﺚ": "ث",
    "ﺛ": "ث",
    "ﺜ": "ث",
    "ﺝ": "ج",
    "ﺞ": "ج",
    "ﺟ": "ج",
    "ﺠ": "ج",
    "ﺡ": "ح",
    "ﺢ": "ح",
    "ﺣ": "ح",
    "ﺤ": "ح",
    "ﺥ": "خ",
    "ﺦ": "خ",
    "ﺧ": "خ",
    "ﺨ": "خ",
    "ﺩ": "د",
    "ﺪ": "د",
    "ﺫ": "ذ",
    "ﺬ": "ذ",
    "ﺭ": "ر",
    "ﺮ": "ر",
    "ﺯ": "ز",
    "ﺰ": "ز",
    "ﺱ": "س",
    "ﺲ": "س",
    "ﺳ": "س",
    "ﺴ": "س",
    "ﺵ": "ش",
    "ﺶ": "ش",
    "ﺷ": "ش",
    "ﺸ": "ش",
    "ﺹ": "ص",
    "ﺺ": "ص",
    "ﺻ": "ص",
    "ﺼ": "ص",
    "ﺽ": "ض",
    "ﺾ": "ض",
    "ﺿ": "ض",
    "ﻀ": "ض",
    "ﻁ": "ط",
    "ﻂ": "ط",
    "ﻃ": "ط",
    "ﻄ": "ط",
    "ﻅ": "ظ",
    "ﻆ": "ظ",
    "ﻇ": "ظ",
    "ﻈ": "ظ",
    "ﻉ": "ع",
    "ﻊ": "ع",
    "ﻋ": "ع",
    "ﻌ": "ع",
    "ﻍ": "غ",
    "ﻎ": "غ",
    "ﻏ": "غ",
    "ﻐ": "غ",
    "ﻑ": "ف",
    "ﻒ": "ف",
    "ﻓ": "ف",
    "ﻔ": "ف",
    "ﻕ": "ق",
    "ﻖ": "ق",
    "ﻗ": "ق",
    "ﻘ": "ق",
    "ﻙ": "ک",
    "ﻚ": "ک",
    "ﻛ": "ک",
    "ﻜ": "ک",
    "ﻝ": "ل",
    "ﻞ": "ل",
    "ﻟ": "ل",
    "ﻠ": "ل",
    "ﻡ": "م",
    "ﻢ": "م",
    "ﻣ": "م",
    "ﻤ": "م",
    "ﻥ": "ن",
    "ﻦ": "ن",
    "ﻧ": "ن",
    "ﻨ": "ن",
    "ﻩ": "ه",
    "ﻪ": "ه",
    "ﻫ": "ه",
    "ﻬ": "ه",
    "ﻭ": "و",
    "ﻮ": "و",
    "ﻯ": "ی",
    "ﻰ": "ی",
    "ﻱ": "ی",
    "ﻲ": "ی",
    "ﻳ": "ی",
    "ﻴ": "ی",
    "ﻵ": "لا",
    "ﻶ": "لا",
    "ﻷ": "لا",
    "ﻸ": "لا",
    "ﻹ": "لا",
    "ﻺ": "لا",
    "ﻻ": "لا",
    "ﻼ": "لا",
}

valid_chars = [
    " ",
    #################
    "0",
    "1",
    "2",
    "3",
    "4",
    "5",
    "6",
    "7",
    "8",
    "9",
    "A",
    "B",
    "C",
    "D",
    "E",
    "F",
    "G",
    "H",
    "I",
    "J",
    "K",
    "L",
    "M",
    "N",
    "O",
    "P",
    "Q",
    "R",
    "S",
    "T",
    "U",
    "V",
    "W",
    "X",
    "Y",
    "Z",
    "a",
    "b",
    "c",
    "d",
    "e",
    "f",
    "g",
    "h",
    "i",
    "j",
    "k",
    "l",
    "m",
    "n",
    "o",
    "p",
    "q",
    "r",
    "s",
    "t",
    "u",
    "v",
    "w",
    "x",
    "y",
    "z",
    "آ",
    "ئ",
    "ا",
    "ب",
    "ت",
    "ث",
    "ج",
    "ح",
    "خ",
    "د",
    "ذ",
    "ر",
    "ز",
    "س",
    "ش",
    "ص",
    "ض",
    "ط",
    "ظ",
    "ع",
    "غ",
    "ف",
    "ق",
    "ل",
    "م",
    "ن",
    "ه",
    "و",
    "پ",
    "چ",
    "ژ",
    "ک",
    "گ",
    "ی",
]


def _replace_rep(t):
    "Replace repetitions at the character level: ccc -> c"

    def __replace_rep(m):
        c, cc = m.groups()
        return f"{c}"

    re_rep = re.compile(r"(\S)(\1{2,})")
    return re_rep.sub(__replace_rep, t)


def _replace_wrep(t):
    "Replace word repetitions: word word word -> word"

    def __replace_wrep(m):
        c, cc = m.groups()
        return f"{c}"

    re_wrep = re.compile(r"(\b\w+\W+)(\1{2,})")
    return re_wrep.sub(__replace_wrep, t)


def _normalize_text(x):
    """
    Normalize a single sentence.

    The input is converted to ``str``, run through the `parsivar` normalizer,
    lower-cased, mapped character by character through `char_mappings`,
    stripped of every character not listed in `valid_chars`, and finally
    has its whitespace and repeated words collapsed.
    """

    text = parsivar_normalizer.normalize(str(x))  # apply `parsivar` normalizations
    # half spaces and new line characters become plain spaces
    text = re.sub(r"[\u200c\r\n]", " ", text).lower()
    # substitute bad characters with appropriate ones
    text = "".join(char_mappings.get(ch, ch) for ch in text)
    # keep only the valid characters; everything else becomes a space
    allowed = "".join(valid_chars)
    text = re.sub(r"[^{}]".format(allowed), " ", text)
    # put spaces around latin words first, then around numbers
    for token_pattern in (r"[a-z]+", r"[0-9]+"):
        text = re.sub(token_pattern, r" \g<0> ", text)
    text = re.sub(r"\s+", " ", text)  # squeeze runs of white space into one space
    # NOTE: the character-level `_replace_rep` step is deliberately not applied here;
    # only word-level repetitions are collapsed.
    text = _replace_wrep(text)
    return text.strip()


def normalize_texts(X, use_tqdm=False):
    """
    Normalize every sentence in `X` and return the results as a list.

    When `use_tqdm` is true the iteration is wrapped in a `tqdm` progress bar.
    """

    sentences = tqdm(X) if use_tqdm else X
    return [_normalize_text(sentence) for sentence in sentences]

In [None]:
class JSONListWriter:
    """
    Auxiliary class that writes a sequence of objects into a file as JSON,
    one item per line (items are separated by ``delimiter``, with no trailing
    delimiter after the last item).

    Intended to be used as a context manager::

        with JSONListWriter(path) as writer:
            writer.write_item({"a": 1})
    """

    def __init__(self, file_path):
        self.fd = None
        self.file_path = file_path
        self.delimiter = "\n"
        # Initialised here as well as in `open` so the attribute always exists.
        self.first_item_written = False

    def open(self):
        """Open (and truncate) the target file and return the writer itself."""
        self.fd = open(self.file_path, "w", encoding="utf-8")
        self.first_item_written = False
        return self

    def close(self):
        """Close the underlying file; does nothing when no file is open."""
        if self.fd is not None:
            self.fd.close()
            self.fd = None

    def write_item(self, obj):
        """
        Serialise `obj` with ``json.dumps`` and append it to the file.

        Raises:
            ValueError: if the writer has not been opened.
        """
        if self.fd is None:
            raise ValueError(
                "JSONListWriter is not open; call open() or use it in a `with` block"
            )
        if self.first_item_written:
            self.fd.write(self.delimiter)
        self.fd.write(json.dumps(obj))
        self.first_item_written = True

    def __enter__(self):
        return self.open()

    def __exit__(self, type, value, traceback):
        self.close()

In [None]:
# locations of the input files
data_folder = "./data"


def _data_file(file_name):
    """Return the path of `file_name` inside `data_folder`."""
    return os.path.join(data_folder, file_name)


products_path = _data_file("base_products.json")
products_normalized_path = _data_file("base_products_normalized.json")

search_log_train_path = _data_file("search_log_train.json")
click_log_train_path = _data_file("click_log_train.json")

queries_test_offline_path = _data_file("queries_test_offline.json")
queries_test_offline_normalized_path = _data_file("queries_test_offline_normalized.json")

# locations of the aggregated files that will be created
search_clicks_file_path = _data_file("searches_clicks_joined_train.json")

search_click_merged_path = _data_file("searches_merged_train.json")

In [None]:
def make_base_product_names(products_path: str, products_normalized_path: str):
    """
    Assign each base product a name built from its sellers' product names.

    For every product read from `products_path`, the `name1` and `name2` of all
    its sellers are split into words and de-duplicated; the result is stored
    under ``product_name`` and its normalized form under
    ``product_name_normalized``. Products whose name ends up empty (no sellers)
    are left out. The enriched products are written to
    `products_normalized_path`, one JSON object per line.
    """

    with JSONListWriter(products_normalized_path) as file:
        for product in read_json(products_path):
            seller_names = " ".join(
                seller["name1"] + " " + seller["name2"]
                for seller in product["sellers"]
            )
            # `dict.fromkeys` removes duplicate words while keeping first-seen
            # order, so the generated name is the same on every run (a `set`
            # would give an arbitrary, run-dependent order).
            unique_words = dict.fromkeys(seller_names.split())
            pr_name = " ".join(unique_words)

            if pr_name == "":
                # no usable seller names -> exclude this product
                continue

            product["product_name"] = pr_name
            product["product_name_normalized"] = _normalize_text(pr_name)

            file.write_item(product)


def aggregate_clicks(search_path, click_path, tag, valid_base_ids):
    """
    Aggregate the clicks of each search record and inject them into the record.

    The joined records are written, one JSON object per line, to
    ``searches_clicks_joined_{tag}.json`` inside `data_folder`.

    Args:
        search_path: JSON-lines file of search records (read with `read_json`).
        click_path: JSON-lines file of click records (read with `read_json`).
        tag: suffix of the output file name (e.g. ``"train"``).
        valid_base_ids: container of valid product ids; results not in it are dropped.

    A search is skipped when none of its clicks falls on a remaining result.
    A summary of what was dropped is printed at the end.
    """

    # search_log_id -> list of clicked base_product_ids (duplicates are kept)
    search_clicks_dict = collections.defaultdict(list)
    for click_row in read_json(click_path):
        search_clicks_dict[click_row["search_log_id"]].append(
            click_row["base_product_id"]
        )

    # Built from `tag`, matching the file name `aggregate_searches(tag)` reads.
    joined_path = os.path.join(data_folder, f"searches_clicks_joined_{tag}.json")

    invalid_results, invalid_clicks, invalid_searches = 0, 0, 0

    with JSONListWriter(joined_path) as file:  # write the result in a new file
        for search_row in read_json(search_path):
            search_id = search_row["_id"]
            search_results = search_row["result"]

            # omit results that are not valid products
            results = [r for r in search_results if r in valid_base_ids]
            results_set = set(results)

            # `.get` (not indexing) so the defaultdict does not grow with empty entries
            all_clicks = search_clicks_dict.get(search_id, [])
            # omit clicks on products that are not among the kept results
            clicks = [c for c in all_clicks if c in results_set]

            invalid_results += len(search_results) - len(results)
            invalid_clicks += len(all_clicks) - len(clicks)

            if not clicks:
                invalid_searches += 1
                continue

            search_row["raw_query"] = search_row["raw_query"].strip()
            # store the normalized raw_query next to the raw one
            search_row["raw_query_normalized"] = _normalize_text(
                search_row["raw_query"]
            )
            search_row["result"] = results
            search_row["clicks"] = clicks

            file.write_item(search_row)

    print(
        f"invalid searches: {invalid_searches}, "
        + f"invalid results: {invalid_results}, "
        + f"invalid clicks: {invalid_clicks}"
    )


def aggregate_searches(tag):
    """
    Aggregate the joined search records on their raw query.

    Reads ``searches_clicks_joined_{tag}.json`` from `data_folder` and writes
    one record per distinct raw query to ``searches_merged_{tag}.json`` in the
    same folder (JSON lines).

    For every raw query three counters are kept:
      * results: how many times each product was shown for the query
      * clicks:  how many times each product was clicked for the query
      * pages:   how many times each result page was viewed for the query
    Each counter is emitted as two parallel lists (keys and counts), ordered
    from most to least frequent.
    """

    search_clicks_path = os.path.join(data_folder, f"searches_clicks_joined_{tag}.json")
    # The output name follows `tag` as well, so non-"train" runs do not
    # overwrite the train file.
    merged_path = os.path.join(data_folder, f"searches_merged_{tag}.json")

    groups = {}
    normalized_query_mapping = {}
    for search in read_json(search_clicks_path):
        raw_query = search["raw_query"]
        normalized_query_mapping[raw_query] = search["raw_query_normalized"]

        counters = groups.get(raw_query)
        if counters is None:
            counters = groups[raw_query] = {
                "results": collections.Counter(),
                "pages": collections.Counter(),
                "clicks": collections.Counter(),
            }

        counters["results"].update(search["result"])
        counters["pages"].update([search["page"]])
        counters["clicks"].update(search["clicks"])

    new_df = []
    for raw_query, counters in tqdm(groups.items()):
        results_counter = counters["results"].most_common()  # sort based on views
        pages_counter = counters["pages"].most_common()  # sort based on views
        clicks_counter = counters["clicks"].most_common()  # sort based on clicks

        new_df.append(
            {
                "raw_query": raw_query,
                "raw_query_normalized": normalized_query_mapping[raw_query],
                "results": [k for k, v in results_counter],
                "result_counts": [v for k, v in results_counter],
                "pages": [k for k, v in pages_counter],
                "page_counts": [v for k, v in pages_counter],
                "clicks": [k for k, v in clicks_counter],
                "click_counts": [v for k, v in clicks_counter],
            }
        )
    print("Number of unique queries after merge:", len(new_df))

    pd.DataFrame(new_df).to_json(
        merged_path,
        orient="records",
        lines=True,
    )


def normalize_test_queries(queries_test_path, queries_test_normalized_path):
    """
    Normalize every query read from `queries_test_path` and write the
    normalized queries, one per line, to `queries_test_normalized_path`.
    """
    # `read_json` is a generator, so nothing is read until the writer is open.
    normalized_queries = map(_normalize_text, read_json(queries_test_path))
    with JSONListWriter(queries_test_normalized_path) as writer:
        for normalized_query in normalized_queries:
            writer.write_item(normalized_query)

In [None]:
make_base_product_names(products_path, products_normalized_path)

print("\nProduct names created and saved in:", products_normalized_path)

1985152it [22:14, 1487.30it/s]


Product names created and saved in: ./data/base_products_normalized.json





In [None]:
# ids of all products that survived name generation
valid_base_ids = {
    product["_id"] for product in read_json(products_normalized_path)
}
print("\nList of valid products created")

1769291it [00:36, 49002.78it/s]



List of valid products created


In [None]:
normalize_test_queries(queries_test_offline_path, queries_test_offline_normalized_path)
print("Test queries are normalized")

38724it [00:12, 3146.71it/s]

Test queries are normalized





In [None]:
%reset-f

-----------------------------------------------------------------------

### Make new features from the previous data

In [None]:
import matplotlib.pyplot as plt
import itertools
from datetime import datetime
import pandas as pd
import numpy as np
import json
import re
import warnings
import gc 
from tqdm import tqdm, trange

In [None]:
#read data and makeing dataframes
def read_lowmem(path):
  """
  Load a JSON-lines file into a pandas DataFrame.

  Every line of `path` is decoded with ``json.loads``; the decoded items are
  collected in a list and turned into a DataFrame in one go.
  """
  # The files hold Persian text, so do not depend on the platform default encoding.
  with open(path, encoding="utf-8") as f:
    records = [json.loads(line) for line in tqdm(f)]
  return pd.DataFrame(records)

In [None]:
 
base_products_normalized = read_lowmem('/content/drive/MyDrive/data_days/base_products_normalized.json')
searches_clicks_joined_train = read_lowmem('/content/drive/MyDrive/data_days/searches_clicks_joined_train.json')
queries_test_offline_normalized = read_lowmem("/content/drive/MyDrive/data_days/queries_test_offline_normalized.json")
queries_test_offline = read_lowmem("/content/drive/MyDrive/data_days/queries_test_offline.json")
#search = read_lowmem('/content/data/search_log_train.json') 


In [None]:
searches_merged_train =read_lowmem("/content/drive/MyDrive/data_days/searches_merged_train.json")

116621it [00:06, 16938.73it/s]


In [None]:
queries_test_offline_normalized = read_lowmem("/content/drive/MyDrive/data_days/queries_test_offline_normalized.json")
queries_test_offline = read_lowmem("/content/drive/MyDrive/data_days/queries_test_offline.json")
base_products_normalized = read_lowmem('/content/drive/MyDrive/data_days/base_products_normalized.json')



38724it [00:00, 265221.35it/s]
38724it [00:00, 252077.33it/s]
1769291it [00:51, 34106.35it/s]


In [None]:
click =read_lowmem("/content/drive/MyDrive/data_days/click_log_train.json")

In [None]:
#read data and makeing dataframes
def read_lowmem(path):
  """
  Load a JSON-lines file into a pandas DataFrame.

  Every line of `path` is decoded with ``json.loads``; the decoded items are
  collected in a list and turned into a DataFrame in one go.

  NOTE(review): this repeats a definition from an earlier cell; the two should
  be kept identical or one of them removed.
  """
  # The files hold Persian text, so do not depend on the platform default encoding.
  with open(path, encoding="utf-8") as f:
    records = [json.loads(line) for line in tqdm(f)]
  return pd.DataFrame(records)
base_products_normalized = read_lowmem('/content/data/base_products_normalized.json')
click = read_lowmem('/content/data/click_log_train.json') 
queries_test_offline_normalized = read_lowmem("/content/data/queries_test_offline_normalized.json")
queries_test_offline = read_lowmem("/content/data/queries_test_offline.json")
#search = read_lowmem('/content/data/search_log_train.json') 


1769291it [00:46, 37848.60it/s]
6317934it [00:29, 211050.06it/s]
38724it [00:00, 250448.68it/s]
38724it [00:00, 244561.63it/s]


In [None]:
search_log_train= read_lowmem('/content/drive/MyDrive/data_days/search_log_train.json')

3694579it [00:48, 76408.68it/s] 


In [None]:
click= read_lowmem('/content/drive/MyDrive/data_days/click_log_train.json')

6317934it [00:35, 177211.45it/s]


In [None]:
# Count how many click rows each base product received.
#click.drop_duplicates(subset=['base_product_id' , 'search_log_id'] ,keep='last' , inplace=True) #Eliminate extra rows
df_nclick = (
    click['base_product_id']
    .value_counts()
    # pandas >= 2.0 names the counts Series "count"; later cells look the counts
    # up as df_nclick['base_product_id'], so pin the column name explicitly.
    .rename('base_product_id')
    # ...and keep the index unnamed, as it was on older pandas.
    .rename_axis(None)
    .to_frame()
)
# the raw click log is large and no longer needed
del click
gc.collect()
df_nclick.head()

In [None]:
df_nclick .head()

Unnamed: 0,base_product_id
mmmzj,25125
mmtqd,21470
mmmbo,20736
mmmbi,20017
mmmbw,19833


In [None]:
base_products_normalized.head()

Unnamed: 0,_id,category_name,sellers,product_name,product_name_normalized
0,yxwrs,ویدیو پروژکتور اپتما,[{'name1': 'پروژکتور Optoma HD142X 1080p 3000 ...,رزولوشن HD142X#دیتا HD##Video 1920x1080 3D، مد...,رزولوشن hd 142 x دیتا hd video 1920 x 1080 3 d...
1,mbeby,تقویت کننده مژه و ابرو,"[{'name1': 'ژل حالت دهنده ابرو و مژه essence',...",گرم##ژل Lash (ESSENCE And ESSENCE لیفت اصلی#ژل...,گرم ژل lash essence and essence لیفت اصلی ژل m...
2,yovgc,مام و اسپری,"[{'name1': 'استیک مردانه وی ایت رکسونا V8', 'n...",40 رکسونا#Rexona #استیک گرم##دئودورانت وی مدل ...,40 رکسونا rexona استیک گرم دیودورانت وی مدل مر...
3,yokff,مام و اسپری,[{'name1': 'مام صابونی رکسونا زنانه کاتون درای...,Algodon##استیک 40 Dry##استیک الگودون rexona (R...,algodon استیک 40 dry استیک الگودون rexona rexo...
4,uduyq,گوشی هوآوی,"[{'name1': 'Huawei Ascend Y550', 'name2': 'Hua...",Y550##گوشی مدل اسند موبایل Ascend Y550#Huawei ...,y 550 گوشی مدل اسند موبایل ascend y 550 huawei...


### Delete extra words
Remove the duplicate words from each normalized product name.

In [None]:
def eli_dup(string):
  """
  Remove duplicate words from a whitespace-separated string.

  The first occurrence of every word is kept and the original word order is
  preserved, so the result is the same on every run (a `set` would return the
  words in an arbitrary order). Words are joined back with single spaces.
  """
  return ' '.join(dict.fromkeys(string.split()))

In [None]:
base_products_normalized['product_name_normalized'] = base_products_normalized['product_name_normalized'].apply(eli_dup)
base_products_normalized['product_name'] = base_products_normalized['product_name'].apply(eli_dup)



```
# This is formatted as code
```

### Sort by number of clicks and category name
Sort `base_products_normalized` by category name and number of clicks.

In [None]:
###########################################################################################

In [None]:
# Attach the click count of every product (0 for products that were never clicked).
# `Series.map` does the id -> count lookup in one vectorised pass instead of a
# Python-level `.loc` lookup per row.
base_products_normalized['click_number'] = (
    base_products_normalized['_id']
    .map(df_nclick['base_product_id'])
    .fillna(0)
    .astype('int64')
)
base_products_normalized.sort_values(by=["category_name","click_number" ] ,ascending=False, inplace=True)
# NOTE(review): without drop=True the old index is kept as an "index" column, so
# re-running this cell fails -- confirm whether that column is needed downstream.
base_products_normalized.reset_index(inplace = True)

del df_nclick
gc.collect()

--------------------------------------------------------------------

### Define necessary functions


In [None]:
# Helpers used to clean up badly typed search queries.
# `correction` does two things:
#   1. drops the words listed in `invalid_words` (filler words, stray single
#      letters and punctuation) and collapses repeated spaces:
#        "ab  c   d" -> "ab c d"
#   2. puts a space wherever the script changes inside a token:
#        "پوکوx3" -> "پوکو x3"
invalid_words = ["خرید", "دانلود",' ' ,"برای","/" , "\\" , "ا","ض","ص","ث","ق","ف","غ","ع","ه","خ","ح","ج","چ","پ","ش","س","ی","ب","ل","ت","ن","م","ک","گ",";","'" , "ظ","ط","ز","ر","ذ","د","ئ","و","."]
# de-duplicate while keeping the listed order (still a list, as before)
invalid_words = list(dict.fromkeys(invalid_words))

def split(word):
    """Return the characters of `word` as a list."""
    return list(word)


def correction (text):
  """
  Clean a query string.

  Words equal to an entry of the module-level `invalid_words` (read at call
  time) and empty words produced by repeated spaces are removed. Then a single
  space is inserted between
    * a latin letter or digit followed by a Persian letter, and
    * a Persian letter or digit followed by a latin letter.

  Every adjacent pair of characters is examined, including pairs near the end
  of the string.
  """
  banned = set(invalid_words)
  kept_words = [word for word in text.split(" ") if word and word not in banned]

  chars = split(" ".join(kept_words))
  last_position = len(chars) - 1
  pieces = []
  for position, char in enumerate(chars):
    pieces.append(char)
    if position == last_position:
      break
    following = chars[position + 1]
    latin_then_persian = (
      char in "aqzwsxedcrfvtgbyhnujmikolp1234567890"
      and following in "ضشظصسطثیزقبرفلذغادعتئهنخمحکجچپگکو"
    )
    persian_then_latin = (
      char in "ضشظصسطثیزقبرفلذغادعتئهنخمحکجچپگکو1234567890"
      and following in "aqzwsxedcrfvtgbyhnujmikolp"
    )
    if latin_then_persian or persian_then_latin:
      pieces.append(" ")

  return "".join(pieces)

---------------------------------------------

## Apply the correction function to the normalized search queries

In [None]:
#      tedad bishtar ghalat emlayii 

In [None]:
queries_test_offline.head(10)

Unnamed: 0,0
0,تلویزیون 40 ایتچ
1,قهوه سازDelongi
2,بند فلزی mi band 5
3,قالی زمردمشهد
4,پوکوx3 pro
5,Xiaomi redmi note 9s
6,4000d
7,لباس آنا
8,سنگ چشم ببر
9,گلگیر جلو پژو ۴۰۵ شرکتی


In [None]:
base_products_normalized.head(3)

Unnamed: 0,_id,category_name,sellers,product_name,product_name_normalized
0,yxwrs,ویدیو پروژکتور اپتما,[{'name1': 'پروژکتور Optoma HD142X 1080p 3000 ...,1920x1080 HD مدل سینمای ویدئو DLP اصلی Video خ...,1920 x 1080 hd مدل سینمای ویدیو dlp اصلی video...
1,mbeby,تقویت کننده مژه و ابرو,"[{'name1': 'ژل حالت دهنده ابرو و مژه essence',...",حجم کننده (essence) Mascara) ژل ESSENCE and MA...,حجم کننده essence mascara ژل essence and masca...
2,yovgc,مام و اسپری,"[{'name1': 'استیک مردانه وی ایت رکسونا V8', 'n...",رکسونا دئودرانت Men Deodorant دئودورانت مدل گر...,رکسونا دیودرانت men deodorant دیودورانت مدل گر...


In [None]:
base_products_normalized.head()

Unnamed: 0,_id,category_name,sellers,product_name,product_name_normalized
0,yxwrs,ویدیو پروژکتور اپتما,[{'name1': 'پروژکتور Optoma HD142X 1080p 3000 ...,1920x1080 HD مدل سینمای ویدئو DLP اصلی Video خ...,1920 x 1080 hd مدل سینمای ویدیو dlp اصلی video...
1,mbeby,تقویت کننده مژه و ابرو,"[{'name1': 'ژل حالت دهنده ابرو و مژه essence',...",حجم کننده (essence) Mascara) ژل ESSENCE and MA...,حجم کننده essence mascara ژل essence and masca...
2,yovgc,مام و اسپری,"[{'name1': 'استیک مردانه وی ایت رکسونا V8', 'n...",رکسونا دئودرانت Men Deodorant دئودورانت مدل گر...,رکسونا دیودرانت men deodorant دیودورانت مدل گر...
3,yokff,مام و اسپری,[{'name1': 'مام صابونی رکسونا زنانه کاتون درای...,حجم رکسونا دئودرانت 40ml Deodorant رول Dry مدل...,حجم رکسونا دیودرانت 40 ml deodorant رول dry مد...
4,uduyq,گوشی هوآوی,"[{'name1': 'Huawei Ascend Y550', 'name2': 'Hua...",موبایل Ascend Huawei گوشی اسند HUAWEI ASCEND م...,موبایل ascend huawei گوشی اسند huawei ascend م...


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
searches_clicks_joined_train.head(2)

Unnamed: 0,_id,raw_query,page,result,datetime,raw_query_normalized,clicks
0,609d7e3a6c2e88f6b241b089,گوشی موبایل,3,"[mmmmm, mmmmy, mmmmu, mmmmk, mmmmb, mmmma, mmm...",2021-05-13T19:30:02.664000,گوشی موبایل,"[mmmmb, mmmmb, mmmmb]"
1,609d7e4802589dff17211ce4,گوشی,38,"[mmmmn, mmmmi, mmmmo, mmmmx, mmmms, mmmmw, mmm...",2021-05-13T19:30:16.244000,گوشی,[mmmmw]


In [None]:
searches_merged_train.head(2)

Unnamed: 0,raw_query,raw_query_normalized,results,result_counts,pages,page_counts,clicks,click_counts
0,گوشی موبایل,گوشی موبایل,"[mmmbf, mmmmq, mmmbn, mmmbi, mmmbg, mmmbo, mmm...","[1783, 1765, 1753, 1731, 1725, 1662, 1607, 148...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 12,...","[2230, 1060, 721, 525, 388, 269, 199, 142, 119...","[mmmbf, mmmbn, mmmbg, mmmbi, mmmmq, mmmbc, mmm...","[441, 369, 314, 286, 272, 245, 243, 229, 185, ..."
1,گوشی,گوشی,"[mmmbf, mmmmq, mmmbn, mmmbi, mmmbg, mmmbo, mmm...","[10878, 10827, 10770, 10716, 10710, 10608, 103...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14,...","[12352, 5683, 3823, 2653, 2070, 1291, 1039, 79...","[mmmbf, mmmbn, mmmmq, mmmbi, mmmbg, mmmbo, mmm...","[3184, 2749, 1819, 1727, 1709, 1485, 1336, 128..."


In [None]:
base_products_normalized.head()

Unnamed: 0,_id,category_name,sellers,product_name,product_name_normalized
0,yxwrs,ویدیو پروژکتور اپتما,[{'name1': 'پروژکتور Optoma HD142X 1080p 3000 ...,1920x1080 HD مدل سینمای ویدئو DLP اصلی Video خ...,1920 x 1080 hd مدل سینمای ویدیو dlp اصلی video...
1,mbeby,تقویت کننده مژه و ابرو,"[{'name1': 'ژل حالت دهنده ابرو و مژه essence',...",حجم کننده (essence) Mascara) ژل ESSENCE and MA...,حجم کننده essence mascara ژل essence and masca...
2,yovgc,مام و اسپری,"[{'name1': 'استیک مردانه وی ایت رکسونا V8', 'n...",رکسونا دئودرانت Men Deodorant دئودورانت مدل گر...,رکسونا دیودرانت men deodorant دیودورانت مدل گر...
3,yokff,مام و اسپری,[{'name1': 'مام صابونی رکسونا زنانه کاتون درای...,حجم رکسونا دئودرانت 40ml Deodorant رول Dry مدل...,حجم رکسونا دیودرانت 40 ml deodorant رول dry مد...
4,uduyq,گوشی هوآوی,"[{'name1': 'Huawei Ascend Y550', 'name2': 'Hua...",موبایل Ascend Huawei گوشی اسند HUAWEI ASCEND م...,موبایل ascend huawei گوشی اسند huawei ascend م...


In [None]:
base_products_normalized.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1769291 entries, 0 to 1769290
Data columns (total 5 columns):
 #   Column                   Dtype 
---  ------                   ----- 
 0   _id                      object
 1   category_name            object
 2   sellers                  object
 3   product_name             object
 4   product_name_normalized  object
dtypes: object(5)
memory usage: 67.5+ MB


In [None]:
base_products_normalized['category_name'].value_counts()

کتاب و مجلات                               73853
کیف و کاور گوشی و تبلت                     53176
ساعت مچی عقربه‌ ای و دیجیتالی              39647
سایر لوازم یدکی خودرو                      34473
سایر ابزار دستی و تجهیزات کارگاهی          32575
                                           ...  
پرینتر پاناسونیک                               1
کامپیوتر دسکتاپ تارکس                          1
سیستم‌های نظارتی و امنیتی و لوازم جانبی        1
دکوراسیون منزل                                 1
مسیریاب خودرو                                  1
Name: category_name, Length: 1554, dtype: int64

In [None]:
Category = pd.DataFrame(base_products_normalized['category_name'].value_counts() )

In [None]:
Category.reset_index(inplace=True)

In [None]:
base_products_normalized.head()

Unnamed: 0,_id,category_name,sellers,product_name,product_name_normalized
0,yxwrs,ویدیو پروژکتور اپتما,[{'name1': 'پروژکتور Optoma HD142X 1080p 3000 ...,1920x1080 HD مدل سینمای ویدئو DLP اصلی Video خ...,1920 x 1080 hd مدل سینمای ویدیو dlp اصلی video...
1,mbeby,تقویت کننده مژه و ابرو,"[{'name1': 'ژل حالت دهنده ابرو و مژه essence',...",حجم کننده (essence) Mascara) ژل ESSENCE and MA...,حجم کننده essence mascara ژل essence and masca...
2,yovgc,مام و اسپری,"[{'name1': 'استیک مردانه وی ایت رکسونا V8', 'n...",رکسونا دئودرانت Men Deodorant دئودورانت مدل گر...,رکسونا دیودرانت men deodorant دیودورانت مدل گر...
3,yokff,مام و اسپری,[{'name1': 'مام صابونی رکسونا زنانه کاتون درای...,حجم رکسونا دئودرانت 40ml Deodorant رول Dry مدل...,حجم رکسونا دیودرانت 40 ml deodorant رول dry مد...
4,uduyq,گوشی هوآوی,"[{'name1': 'Huawei Ascend Y550', 'name2': 'Hua...",موبایل Ascend Huawei گوشی اسند HUAWEI ASCEND م...,موبایل ascend huawei گوشی اسند huawei ascend م...


### Method 1

In [None]:
# Build one text "document" per category: all normalized product names of the
# category joined with spaces. The separator matters -- without it the last word
# of one product fuses with the first word of the next one.
# Categories are ordered from most to least frequent.
category_order = base_products_normalized['category_name'].value_counts().index

# A single grouped pass instead of filtering the whole frame once per category.
names_by_category = (
    base_products_normalized
    .groupby('category_name', sort=False)['product_name_normalized']
    .agg(' '.join)
)

dontknow = [names_by_category[category] for category in tqdm(category_order)]




100%|██████████| 1554/1554 [04:04<00:00,  6.35it/s]


In [None]:
dontknow[0]

'باستان کتاب دانستنی از هایی ایرانپژوهشی و کتاب منظ د ازا سنت باغ سازی در پاریقل کتاب کدو زن قلیختم کتاب الجنان سوره ان منتخب با همراه مفاتیحکتاب بی از های جونز قلنبه جونی مهمانی مجموعه بالشهای کتاب سفری اسمان کامل بر کن نیایش مجموعهتعقیب گریز و کتاب از مدرسه جاسوسی جنوب در مجموعهو کتاب نوجوان تربیت بهترین برای از تغذیه کودک کودکان کلیدهای مجموعه نوجوانانکتاب ها اربابنقاب کتاب از نقره ای مجیستریوم مجموعهصلاح جات کتاب امنه اهایکتاب سنگین استقلال جنگ چچن بهایمخملک جنازه اشپزشعبده باز فیل اقایاسلام ایینهاین دوچرخه جنون داردمادرید کتاب ریال راهکریم کتاب قرانو کتاب حلزون از سوفی مجموعهکتاب چه سوی شمایل گوارا انکتاب دریایی مصور دزدان دانشنامهکتاب قلعه حیواناتترسناک کتاب واقعیت کسی دارن را از تر چه شانی کتاب جن از قورباغه ربایی زده ارمنته مجموعهدارویی راهنمای کتاب پزشکی راهنمای خا خانوادهکتاب روایتی کیمیا دختر سرگذشت بر دوست خواندهکتاب جادو کوهدایره کتاب ارتباط نخستین بدن انسان المعارف بدایره کتاب با ارتباط جانوران نخستین المعارفکتاب اهن قراضه نان مجموعه خشک دمپایی کهنهکتاب 3 های کاردستی اوری

In [None]:
len(dontknow)

1554

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
invalid_words = ["خرید", "دانلود",' ' ,"برای","/" , "\\" , "ا","ض","ص","ث","ق","ف","غ","ع","ه","خ","ح","ج","چ","پ","ش","س","ی","ب","ل","ت","ن","م","ک","گ",";","'" , "ظ","ط","ز","ر","ذ","د","ئ","و","."]


In [None]:
vectorizer = TfidfVectorizer(lowercase=True, use_idf=True ,stop_words= invalid_words )

In [None]:
vectorizer.fit(dontknow)

In [None]:
queries_test_offline_normalized.head()

Unnamed: 0,0
0,تلویزیون 40 ایتچ
1,قهوه ساز delongi
2,بند فلزی mi band 5
3,قالی زمردمشهد
4,پوکو x 3 pro


In [None]:
responce = vectorizer.transform(['پوکو x 3 pro' ] )
for col in responce.nonzero()[1]:
    print (feature_names[col], ' - ', responce[0, col])

پوکو  -  0.9460017872024833
pro  -  0.32416140826709655


In [None]:
responce = vectorizer.transform(['بند فلزی mi band 5'] )
for col in responce.nonzero()[1]:
    print (feature_names[col], ' - ', responce[0, col])

فلزی  -  0.3711638743627793
بند  -  0.4525721914225984
mi  -  0.48579221642179754
band  -  0.6491700180869071


In [None]:
responce = vectorizer.transform(['قهوه ساز delongi' ] )
for col in responce.nonzero()[1]:
    print (feature_names[col], ' - ', responce[0, col])

قهوه  -  0.27923514063934785
ساز  -  0.3104619548700001
delongi  -  0.9086479575778629


In [None]:
search_click_merge_2.iloc[247:249]

Unnamed: 0,raw_query_normalized,r_c_merged,cl_c_merged,tf-idf_priority
247,5 3,"[(mjsnl, 20), (mmbdb, 20), (mmanm, 17), (mmugi...","[(mmbdb, 23), (mmyib, 9), (mmbrs, 2), (mmanm, ...",[100]
248,5 g,"[(mmmfl, 941), (mmmmp, 940), (mmytg, 926), (mm...","[(mmmau, 576), (mmmmp, 373), (mmmfl, 307), (mm...",[]


In [None]:
responce = vectorizer2.transform(['lug piop']  , )
for col in responce.nonzero()[1]:
    print (feature_names[col], ' - ', responce[0, col])

lug  -  1.0


### Method 2: one TF-IDF document per product

In [None]:
dontknow2 =  []
for i in tqdm(base_products_normalized['product_name_normalized'])  :
  dontknow2.append(i)




100%|██████████| 1769291/1769291 [00:03<00:00, 544854.57it/s] 


In [None]:
# Fit a TF-IDF model in which every product name is its own document.
vectorizer2 = TfidfVectorizer(lowercase=True, use_idf=True ,stop_words= invalid_words )
vectorizer2.fit(dontknow2)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old name on older installations.
if hasattr(vectorizer2, 'get_feature_names_out'):
  feature_names = vectorizer2.get_feature_names_out()
else:
  feature_names = vectorizer2.get_feature_names()


In [None]:
responce = vectorizer2.transform(['پوکو x 3 pro' ] )
for col in responce.nonzero()[1]:
    print (feature_names[col], ' - ', responce[0, col])

پوکو  -  0.8899441920423984
pro  -  0.45606944103941327


In [None]:
responce = vectorizer2.transform(['بند فلزی mi band 5'] )
for col in responce.nonzero()[1]:
    print (feature_names[col], ' - ', responce[0, col])

فلزی  -  0.459435878265007
بند  -  0.4592482489749901
mi  -  0.5012877094937298
band  -  0.5715945695042921


In [None]:
responce = vectorizer2.transform(['قهوه ساز delongi' ] )
for col in responce.nonzero()[1]:
    print (feature_names[col], ' - ', responce[0, col])

قهوه  -  0.37157890150997963
ساز  -  0.3882606417335175
delongi  -  0.8433165443849148


### Save the data in the workspace to free memory


In [None]:
base_products_normalized.to_csv("base_products_normalized.csv")
queries_test_offline_normalized.to_csv("queries_test_offline_normalized.csv")


In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
!cp queries_test_offline_normalized.csv /content/drive/MyDrive
!cp base_products_normalized.csv /content/drive/MyDrive

cp: cannot create regular file '/content/drive/MyDrive': No such file or directory
cp: cannot create regular file '/content/drive/MyDrive': No such file or directory


In [None]:
%reset -f # reset notebook due to memory limitations


Don't know how to reset  #, please run `%reset?` for details
Don't know how to reset  reset, please run `%reset?` for details
Don't know how to reset  notebook, please run `%reset?` for details
Don't know how to reset  due, please run `%reset?` for details
Don't know how to reset  to, please run `%reset?` for details
Don't know how to reset  memory, please run `%reset?` for details
Don't know how to reset  limitations, please run `%reset?` for details


In [None]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import json

In [None]:
base_products_normalized = pd.read_csv("/content/base_products_normalized.csv" )
base_products_normalized.drop(["index" ,"Unnamed: 0" ] , axis=1 ,inplace=True)
base_products_normalized.head(2)

Unnamed: 0,_id,category_name,sellers,product_name,product_name_normalized,click_number
0,mguae,یخچال، فریزر و تاپینگ فروشگاهی,"[{'name1': 'فریزر صندوقی R.B.C دو در', 'name2'...",در فریزر صندوقی دو R.B.C,r در b فریزر صندوقی c دو,68
1,mujip,یخچال، فریزر و تاپینگ فروشگاهی,"[{'name1': 'یخچال ایستکول TM-9580-HS', 'name2'...",ایستکول سفید دار- مدل TM9580-CS فوت هتلی ‎ فوت...,دار ایستکول سفید tm مدل فوت هتلی hs eastcool ف...,58


### Dictionary
Build a dictionary from the words that appear in `product_name_normalized`.

In [None]:
base_products_normalized.head()

Unnamed: 0,_id,category_name,sellers,product_name,product_name_normalized,price,date_added
0,yxwrs,ویدیو پروژکتور اپتما,[{'name1': 'پروژکتور Optoma HD142X 1080p 3000 ...,پروژکتور دیتا HD142X ویدئو 3000 3D، Video opto...,پروژکتور دیتا video 3000 3 dlp optoma 142 خانگ...,31852600,1528652000.0
1,mbeby,تقویت کننده مژه و ابرو,"[{'name1': 'ژل حالت دهنده ابرو و مژه essence',...",brow Brow مژه (essence) BROW گرم و اورجینال ma...,brow مژه گرم و اورجینال mascara کننده miss and...,60000,1593461000.0
2,yovgc,مام و اسپری,"[{'name1': 'استیک مردانه وی ایت رکسونا V8', 'n...",وی REXONA گرم تعریق ایت Rexona دئودرانت Tuning...,وی deodorant گرم تعریق ایت men دیودرانت وزن مد...,0,1588267000.0
3,yokff,مام و اسپری,[{'name1': 'مام صابونی رکسونا زنانه کاتون درای...,میلی Women کاتن کاتون لیتر گرم (Rexona تعریق م...,میلی deodorant کاتن کاتون لیتر گرم تعریق ml ما...,49000,1586876000.0
4,uduyq,گوشی هوآوی,"[{'name1': 'Huawei Ascend Y550', 'name2': 'Hua...",هوآوی اسند ASCEND Huawei Ascend موبایل Y550 گو...,اسند y موبایل ascend گوشی مدل huawei g وای هوا...,0,1493062000.0


In [None]:
df = base_products_normalized.copy()

In [None]:
df['list'] = df['product_name_normalized'].apply(lambda x:x.split(" "))
df.head()

Unnamed: 0,_id,category_name,sellers,product_name,product_name_normalized,price,date_added,list
0,yxwrs,ویدیو پروژکتور اپتما,[{'name1': 'پروژکتور Optoma HD142X 1080p 3000 ...,پروژکتور دیتا HD142X ویدئو 3000 3D، Video opto...,پروژکتور دیتا video 3000 3 dlp optoma 142 خانگ...,31852600,1528652000.0,"[پروژکتور, دیتا, video, 3000, 3, dlp, optoma, ..."
1,mbeby,تقویت کننده مژه و ابرو,"[{'name1': 'ژل حالت دهنده ابرو و مژه essence',...",brow Brow مژه (essence) BROW گرم و اورجینال ma...,brow مژه گرم و اورجینال mascara کننده miss and...,60000,1593461000.0,"[brow, مژه, گرم, و, اورجینال, mascara, کننده, ..."
2,yovgc,مام و اسپری,"[{'name1': 'استیک مردانه وی ایت رکسونا V8', 'n...",وی REXONA گرم تعریق ایت Rexona دئودرانت Tuning...,وی deodorant گرم تعریق ایت men دیودرانت وزن مد...,0,1588267000.0,"[وی, deodorant, گرم, تعریق, ایت, men, دیودرانت..."
3,yokff,مام و اسپری,[{'name1': 'مام صابونی رکسونا زنانه کاتون درای...,میلی Women کاتن کاتون لیتر گرم (Rexona تعریق م...,میلی deodorant کاتن کاتون لیتر گرم تعریق ml ما...,49000,1586876000.0,"[میلی, deodorant, کاتن, کاتون, لیتر, گرم, تعری..."
4,uduyq,گوشی هوآوی,"[{'name1': 'Huawei Ascend Y550', 'name2': 'Hua...",هوآوی اسند ASCEND Huawei Ascend موبایل Y550 گو...,اسند y موبایل ascend گوشی مدل huawei g وای هوا...,0,1493062000.0,"[اسند, y, موبایل, ascend, گوشی, مدل, huawei, g..."


In [None]:
df.iloc[4 , 4]

'اسند y موبایل ascend گوشی مدل huawei g وای هواوی 4 550'

In [None]:
df1 = df[['category_name', 'list']].copy()


In [None]:
df1.iloc[4 , 1]

['اسند',
 'y',
 'موبایل',
 'ascend',
 'گوشی',
 'مدل',
 'huawei',
 'g',
 'وای',
 'هواوی',
 '4',
 '550']

In [None]:
df1.head()

Unnamed: 0,category_name,list
0,ویدیو پروژکتور اپتما,"[پروژکتور, دیتا, video, 3000, 3, dlp, optoma, ..."
1,تقویت کننده مژه و ابرو,"[brow, مژه, گرم, و, اورجینال, mascara, کننده, ..."
2,مام و اسپری,"[وی, deodorant, گرم, تعریق, ایت, men, دیودرانت..."
3,مام و اسپری,"[میلی, deodorant, کاتن, کاتون, لیتر, گرم, تعری..."
4,گوشی هوآوی,"[اسند, y, موبایل, ascend, گوشی, مدل, huawei, g..."


In [None]:
# Flatten the per-product word lists into one row per (category, word) pair so
# that later per-word searches run faster.
from itertools import chain
words_per_product = df1['list'].str.len()
df_extended = pd.DataFrame({
    'category_name': df1['category_name'].to_numpy().repeat(words_per_product),
    'list': list(chain.from_iterable(df1['list'])),
})

In [None]:
df_extended[df_extended['list'] == 'اینچ']

Unnamed: 0,category_name,list
765,سایر ماشین‌های اداری,اینچ
1784,تلویزیون سامسونگ,اینچ
1953,تلویزیون جی پلاس,اینچ
2725,سایر تلویزیون و لوازم جانبی,اینچ
6894,تلویزیون جی پلاس,اینچ
...,...,...
15286538,تلویزیون جی پلاس,اینچ
15286755,نمایشگر سامسونگ,اینچ
15286800,نمایشگر سامسونگ,اینچ
15287149,سایر ابزار دستی و تجهیزات کارگاهی,اینچ


In [None]:
# Sorted list of the distinct words. De-duplicating first means only the unique
# words are sorted, instead of sorting every token occurrence and then sorting again.
lst = sorted(set(df_extended["list"]))

In [None]:
lst[0:5]

['0', '00', '000', '0000', '00000']

In [None]:
# First-character index of the vocabulary: one empty bucket per Persian letter,
# lowercase Latin letter, digit and '.', followed by a catch-all 'else' bucket.
# Every key gets its own list object.
dic = {
    key: []
    for key in (
        ['ا', 'ب', 'پ', 'ت', 'ث', 'ج', 'چ', 'ح', 'خ', 'د', 'ذ',
         'ر', 'ز', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ',
         'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن', 'و', 'ه', 'ی', 'آ']
        + list('abcdefghijklmnopqrstuvwxyz')
        + list('1234567890')
        + ['.', 'else']
    )
}

In [None]:
df_lst = pd.DataFrame({'product' : lst})
df_lst.head()

Unnamed: 0,product
0,0
1,0
2,0
3,0
4,0


In [None]:
# Put every vocabulary word into the bucket of its first character. Words whose
# first character has no bucket of its own go to dic['else']. A direct lookup
# replaces the scan over all keys for every word.
fallback_bucket = dic['else']
for i in df_lst['product']:
  if not i:
    # An empty token has no first character; it is useless as a dictionary word.
    continue
  dic.get(i[0], fallback_bucket).append(i)

In [None]:
sum = 0
for i in dic.keys():
  sum = sum + len(dic[i])
sum   # it must be 278902         

282233

In [None]:
# gc is not imported anywhere after the namespace reset, so import it here.
import gc

# Drop the large intermediates (and the `sum` counter that shadowed the builtin).
del sum , df  , df1 , df_lst
gc.collect()

325

#### Matching
This function is used to find the dictionary words that are most similar to the words of the search query.

### Algorithm
In this section we build our algorithm.

First of all, we build a dataframe.
The dataframe contains each `category_name`
and all of the words that exist in that category.

In [None]:
df_listed_name = pd.DataFrame(df_extended.groupby('category_name')['list'].apply(list))

In [None]:
df_listed_name.head(3)

Unnamed: 0_level_0,list
category_name,Unnamed: 1_level_1
آب انارگیری,"[negin, دستی, اب, انار, گیری, مدل, نگین, گیر, ..."
آب سردکن و تصفیه آب و لوازم جانبی,"[تصفیه, فیلتر, اب, سطلی, سوبو, a, کوچک, 928, ت..."
آب مرکبات‌گیری,"[tefal, press, مرکبات, اب, پرتقال, zp, citrus,..."


In [None]:
del df_extended
gc.collect()

208

Now we count how many times each word is repeated in each category of `df_listed_name`.

In [None]:
from collections import Counter
def con (x):
  """Return a plain dict mapping each item of ``x`` to its number of occurrences."""
  return dict(Counter(x))


In [None]:
df_listed_name['dic'] =  df_listed_name['list'].apply(con)


In [None]:
df_listed_name.reset_index(inplace=True)

In [None]:
df_listed_name.head()

Unnamed: 0,category_name,list,dic
0,آب انارگیری,"[negin, دستی, اب, انار, گیری, مدل, نگین, گیر, ...","{'negin': 1, 'دستی': 8, 'اب': 10, 'انار': 9, '..."
1,آب سردکن و تصفیه آب و لوازم جانبی,"[تصفیه, فیلتر, اب, سطلی, سوبو, a, کوچک, 928, ت...","{'تصفیه': 791, 'فیلتر': 478, 'اب': 805, 'سطلی'..."
2,آب مرکبات‌گیری,"[tefal, press, مرکبات, اب, پرتقال, zp, citrus,...","{'tefal': 1, 'press': 62, 'مرکبات': 269, 'اب':..."
3,آب معدنی,"[اب, 6, لیتری, معدنی, عددی, 5, باکس, واتا, 1, ...","{'اب': 70, '6': 17, 'لیتری': 35, 'معدنی': 66, ..."
4,آباژور و چراغ خواب,"[چراغ, خواب, نگهبان, سگ, اباژور, چراغ, lol, خو...","{'چراغ': 792, 'خواب': 760, 'نگهبان': 3, 'سگ': ..."


In [None]:
!pip install parsivar
!wget https://www.dropbox.com/s/tlyvnzv1ha9y1kl/spell.zip
!pip install pyspellchecker
!mkdir /usr/local/lib/python3.7/dist-packages/parsivar/resource/spell 
!unzip spell.zip -d /usr/local/lib/python3.7/dist-packages/parsivar/resource/spell

Collecting parsivar
  Downloading parsivar-0.2.3.tar.gz (36.2 MB)
[K     |████████████████████████████████| 36.2 MB 56 kB/s 
[?25hCollecting nltk==3.4.5
  Downloading nltk-3.4.5.zip (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 37.8 MB/s 
Building wheels for collected packages: parsivar, nltk
  Building wheel for parsivar (setup.py) ... [?25l[?25hdone
  Created wheel for parsivar: filename=parsivar-0.2.3-py3-none-any.whl size=36492971 sha256=621230b05814f54b04c70bbc935eb53782839e48961f4db0783ff1cc15c41aff
  Stored in directory: /root/.cache/pip/wheels/ae/67/7a/49cbf08f64d3f76a26eceaf0e481a40e233f05d4356875cbed
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.4.5-py3-none-any.whl size=1449922 sha256=d28eab5c73b5ff78c5674fbcf6f8a528e0de4f233a1e577c7a569c7aba8b84f2
  Stored in directory: /root/.cache/pip/wheels/48/8b/7f/473521e0c731c6566d631b281f323842bbda9bd819eb9a3ead
Successfully built parsivar nltk
Installing collecte

In [None]:
from parsivar import SpellCheck
myspell_checker = SpellCheck()
spell_checker = myspell_checker.spell_corrector("کیبرد")      #good enough
spell_checker

'کی برد'

#### Matching
This function is used to find the dictionary words that are most similar to the words of the search query.

In [None]:
  #splell cheker add konam va be matchin ezaf konam     test konam کیبردبلوتوثی dar spell chi mishe

In [None]:
def split(word ):
    """Return the characters of ``word`` as a list."""
    return list(word)
def matching3 (tex ):
  """Correct each space-separated word of ``tex`` against the vocabulary index ``dic``.

  For every word, the candidates are the words in ``dic`` bucketed under the
  word's first character. Each candidate gets a position-wise similarity score
  ``com`` (percentage of the query's characters that match at the same index),
  with a bonus for equal length and a penalty for extra length. The function
  also tries to split a word into "<known prefix> <rest>" via ``matching2``.

  Returns the corrected words joined by single spaces.

  Raises ZeroDivisionError for an empty token (``100 / l`` with ``l == 0``) and
  KeyError when a word's first character is not a key of ``dic``.

  NOTE(review): ``extera`` and ``extera_2`` are initialised once, outside the
  per-word loop, so values found for an earlier word remain visible while
  later words are processed - confirm whether that is intended.
  NOTE(review): ``len_tex``, ``m`` and ``q`` are assigned but never read.
  """
  texted = tex.split(" ")
  len_tex = len(texted)
  c_text = []
  extera = ''       # best "prefix rest" split found so far (see try-block below)
  extera_2 = ''     # best-scoring candidate so far, used as a last-resort fallback
  m=0
  for text in texted:
    text_r = text   # keep the original string; `text` becomes a list of characters
    text = split(text)
    l = len(text)
    q=0
    p = 100 / l     # score contribution of one matching character
    com_f = 0       # best score seen so far for this word
    ten =0          # length of the longest fully matched prefix candidate so far
    search = dic[text[0]]
    #print(search)
    correct_text = ""
    for s in search:
      com = 0 
      te = 0        # +1 per matching position, -1 per mismatching position
     
      
      l_s = len(s)
      i = -1
      # Compare the query and the candidate character by character over their
      # common length (i and j advance together).
      for j in range(l_s):
         i+=1
         if i >= l :
           break
         if text[j] == s[j] :
           com+=p
           te +=1
         else:
           te-=1
      # Same length and roughly at most one mismatching character: small bonus.
      if l == l_s  and com>= (l-1)*p-5 :
        #print(l,'--',l_s ,'--',com)
        com+=3
      # Candidate longer than the query: subtract one step per extra character.
      if l_s > l :
        com = com - (l_s-l)*p
      # (Near-)perfect score: accept this candidate and stop searching.
      if com >= 99 :
        correct_text=s
        break
      # If every character of the candidate matched (so the candidate is a prefix
      # of the query), it has at least 3 characters and it is longer than the
      # previous such prefix, insert a space after the prefix and let matching2
      # validate the two parts. Any exception is swallowed; in that case `extera`
      # keeps the unvalidated split string assigned just before the call.
      try:
          if l_s == te and te>ten and te>=3   :       # te>4 was an alternative threshold, because of the noisy dictionary
              ten = te
              text_r_ = split(text_r)
              text_r_.insert(l_s , " ")
              extera = "".join(text_r_)
              
              extera = matching2(extera)
      except:
        pass     
     
      # Remember the best candidate that reaches the acceptance threshold of 70.
      if com > com_f and com >=70:               # 50 or 45 give different answers
       com_f = com
       correct_text = s
       #print(l_s,s)
       
      # On the last candidate of the bucket (compared by value): if nothing reached
      # the threshold but a split is available, emit the first part now and keep
      # the second part as this word's correction.
      if s == search[-1] and extera!='' and com_f<70:
        correct_text = extera.split(' ')
        c_text.append(correct_text[0])
        correct_text = correct_text[1]
      # Track the best-scoring candidate regardless of the threshold (fallback).
      if com> com_f:
        com_f = com
        extera_2 = s
       
    # Nothing was accepted: fall back to the best-scoring candidate (may be '').
    if correct_text == '':
        correct_text = extera_2
    c_text.append(correct_text)   
      
  c_text = " ".join(c_text) 

  return c_text

  

In [None]:
def split(word):
    """Return the characters of ``word`` as a list."""
    return list(word)
def matching (tex):
  """Replace every word of ``tex`` by the most similar word of the vocabulary index ``dic``.

  ``tex`` is split on single spaces. For each word the candidates are the words
  stored in ``dic`` under the word's first character; a first character that
  has no bucket of its own is looked up in ``dic['else']``, which is where such
  words are stored when the index is built. A candidate's score is the
  percentage of the query's characters that equal the candidate's character at
  the same position:

  * the first candidate scoring >= 99 is returned immediately;
  * otherwise the first candidate with the highest score >= 45 is used;
  * if no candidate reaches 45 the word is replaced by "".

  Empty tokens (leading, trailing or repeated spaces, or an empty string) are
  kept as empty strings instead of raising ZeroDivisionError.

  Returns the corrected words joined by single spaces.
  """
  c_text = []
  fallback_bucket = dic.get('else', [])

  for text in tex.split(" "):
    if not text:
      # Nothing to match; keep the position so the spacing of the query survives.
      c_text.append("")
      continue

    l = len(text)
    p = 100 / l          # score contribution of one matching character
    com_f = 0            # best score seen so far

    search = dic.get(text[0], fallback_bucket)
    correct_text = ""
    for s in search:
      com = 0
      # Position-wise comparison over the common length of query and candidate.
      for query_char, candidate_char in zip(text, s):
        if query_char == candidate_char:
          com += p
      if com >= 99:
        correct_text = s
        break
      if com > com_f and com >= 45:               # 50 or 45 give different answers
        com_f = com
        correct_text = s

    c_text.append(correct_text)

  return " ".join(c_text)

In [None]:
def matching2 (tex ):
  """Strict variant of ``matching``: keep a word only if a dictionary word matches all of it.

  Each space-separated word of ``tex`` is compared position by position with
  the words stored in ``dic`` under its first character. The first candidate
  whose score is exactly 100 is taken immediately; otherwise the first
  candidate with the highest score >= 99 is used. Words without such a
  candidate become "". The results are joined by single spaces.

  Raises ZeroDivisionError for an empty token and KeyError when the first
  character of a word is not a key of ``dic``.
  """
  corrected_words = []
  for word in tex.split(" "):
    chars = list(word)
    step = 100 / len(chars)      # score contribution of one matching character
    best_score = 0
    chosen = ""
    for candidate in dic[chars[0]]:
      score = 0
      for query_char, candidate_char in zip(chars, candidate):
        if query_char == candidate_char:
          score += step
      if score == 100:
        chosen = candidate
        break
      # Catches full matches whose accumulated float score is not exactly 100.
      if score >= 99 and score > best_score:
        best_score = score
        chosen = candidate
    corrected_words.append(chosen)
  return " ".join(corrected_words)

In this block we apply `matching` and the Parsivar spell checker to correct the words of each query.
We use this structure for every word `i`:

    d = matching(i)
    if d != i:
        d = myspell_checker.spell_corrector(i)
        d = matching(d)

In [None]:
queries_test_offline_normalized = read_lowmem("/content/drive/MyDrive/data_days/queries_test_offline_normalized.json" )
#queries_test_offline_normalized.drop(['Unnamed: 0'] , inplace=True , axis=1)
queries_test_offline_normalized.head(2)

38724it [00:00, 235814.05it/s]


Unnamed: 0,0
0,تلویزیون 40 ایتچ
1,قهوه ساز delongi


In [None]:
def sent (text):
  """Correct a whole query word by word.

  ``text`` is converted to ``str`` and split on single spaces. Each word is
  first passed through ``matching``; if that changes the word, the original
  word is run through the Parsivar spell checker and the spell-checked text is
  matched again. The corrected words are joined by single spaces.
  """
  corrected = []
  for word in str(text).split(" "):
    candidate = matching(word)
    if candidate != word:
      # The dictionary did not contain the word as typed: spell-check, then re-match.
      candidate = matching(myspell_checker.spell_corrector(word))
    corrected.append(candidate)
  return " ".join(corrected)

In [None]:
#this block used to  obtain time of process 
#so all of the query search take 9 minute to finish
ss = queries_test_offline_normalized[0:100].copy()

In [None]:
ss.columns = ['0']

In [None]:
ss

Unnamed: 0,0
0,تلویزیون 40 ایتچ
1,قهوه ساز delongi
2,بند فلزی mi band 5
3,قالی زمردمشهد
4,پوکو x 3 pro
...,...
95,ماژول پاور بانک
96,مورم ایرانسل
97,nokia 5310
98,اداپتور 12 ولت


In [None]:
ss['1'] = ss['0'].apply(lambda x:sent(x))

In [None]:
ss[ss['0']!=ss['1']]

Unnamed: 0,0,1
0,تلویزیون 40 ایتچ,تلویزیون 40 اینچ
3,قالی زمردمشهد,قالی زمرد مشهد
10,فیس برواش,فیس بر واش
35,اسباب بازز,اسباب با ز
37,شیاعومی نوت 8,شیا عامیانه نوت 8
45,مونوپولب,مونوپولی
51,چراغ پارش,چراغ پارس
70,دسته پابجی موبایا,دسته پابجی موبایل
71,جامایع ظرقشویی,جامایع ظرفشویی
74,صندل ودمپایی دخترانه زنانه,صندل و دمپایی دخترانه زنانه


In [None]:
del ss
gc.collect()

563

In [None]:
queries_test_offline_normalized.columns = ['0']

In [None]:
queries_test_offline_normalized['2'] = queries_test_offline_normalized['0'].apply(lambda x:sent(x))

In [None]:
queries_test_offline_normalized.head()

Unnamed: 0,0,2
0,تلویزیون 40 ایتچ,تلویزیون 40 اینچ
1,قهوه ساز delongi,قهوه ساز delongi
2,بند فلزی mi band 5,بند فلزی mi band 5
3,قالی زمردمشهد,قالی زمرد مشهد
4,پوکو x 3 pro,پوکو x 3 pro


Now we compute the priority of the words of each query.

In [None]:
def priority (search):
  """Return position-based priority weights for the words of a search query.

  The query is split on single spaces. The weights form a geometric sequence
  with ratio 1.3 that is scaled to add up to 100 and ordered from largest to
  smallest, so the first word of the query gets the highest priority.

  Returns a list with one float per word.
  """
  ratio = 1.3
  word_count = len(search.split(' '))

  # Sum of ratio**0 .. ratio**(word_count - 1); used to scale the weights to 100.
  total = 0
  for exponent in range(word_count):
    total = total + pow(ratio, exponent)

  # Build the weights from smallest to largest, then flip them.
  weights = [100 / total]
  for _ in range(1, word_count):
    weights.append(weights[-1] * ratio)
  return Reverse(weights)

def Reverse(lst):
    """Return a new list with the elements of ``lst`` in reverse order."""
    return list(reversed(lst))

Compute the priority of each query once, so that our algorithm does not need to
recompute it every time.

In [None]:
#            queries_test_offline_normalized['3'] = queries_test_offline_normalized['2'].apply(priority)

In [None]:
queries_test_offline_normalized.head(3)

Unnamed: 0,0,2
0,تلویزیون 40 ایتچ,تلویزیون 40 اینچ
1,قهوه ساز delongi,قهوه ساز delongi
2,بند فلزی mi band 5,بند فلزی mi band 5


### Use TF-IDF for priority

In [None]:
# Vocabulary of vectorizer2, indexed by column. get_feature_names() was removed
# in scikit-learn 1.2; use its replacement when it exists.
if hasattr(vectorizer2, 'get_feature_names_out'):
  feature_names = vectorizer2.get_feature_names_out()
else:
  feature_names = vectorizer2.get_feature_names()
def tfidf_priority (text) :
  """Return one TF-IDF based weight per space-separated word of ``text``.

  The query is vectorised with ``vectorizer2``. Every word of the query that
  equals one of the non-zero TF-IDF terms gets that term's weight; all other
  words get 0. If the vectoriser yields no non-zero term at all, the
  position-based ``priority(text)`` is returned instead.
  """
  tfidf_row = vectorizer2.transform([text])

  # term -> weight for every term with a non-zero TF-IDF value in this query
  weight_by_term = {}
  for col in tfidf_row.nonzero()[1]:
    weight_by_term.setdefault(feature_names[col], tfidf_row[0, col])

  if not weight_by_term:
    return priority(text)

  return [weight_by_term.get(token, 0) for token in text.split(' ')]

In [None]:
responce = vectorizer2.transform(['پوکو x 3 pro' ] )
for col in responce.nonzero()[1]:
    print (feature_names[col], ' - ', responce[0, col] , col)

پوکو  -  0.8899441920423984 269772
pro  -  0.45606944103941327 164616


In [None]:
tfidf_priority('قالی زمرد مشهد')

[0.6216100066688267, 0.5564919890738858, 0.5512872805586678]

In [None]:
queries_test_offline_normalized['3'] = queries_test_offline_normalized['2'].apply(tfidf_priority)

In [None]:
queries_test_offline_normalized.head()

Unnamed: 0,0,2,3
0,تلویزیون 40 ایتچ,تلویزیون 40 اینچ,"[0.6445719650479241, 0.5490981558649192, 0.531..."
1,قهوه ساز delongi,قهوه ساز delongi,"[0.37157890150997963, 0.3882606417335175, 0.84..."
2,بند فلزی mi band 5,بند فلزی mi band 5,"[0.4592482489749901, 0.459435878265007, 0.5012..."
3,قالی زمردمشهد,قالی زمرد مشهد,"[0.6216100066688267, 0.5564919890738858, 0.551..."
4,پوکو x 3 pro,پوکو x 3 pro,"[0.8899441920423984, 0, 0, 0.45606944103941327]"


In [None]:
queries_test_offline_normalized.to_csv('queries_tfidf1.csv')

In [None]:
!cp queries_tfidf1.csv /content/drive/MyDrive/data_days

In [None]:
queries_test_offline_normalized.head(10)

Unnamed: 0,0,2,3
0,تلویزیون 40 ایتچ,تلویزیون 40 اینچ,"[0.6445719650479241, 0.5490981558649192, 0.531..."
1,قهوه ساز delongi,قهوه ساز delongi,"[0.37157890150997963, 0.3882606417335175, 0.84..."
2,بند فلزی mi band 5,بند فلزی mi band 5,"[0.4592482489749901, 0.459435878265007, 0.5012..."
3,قالی زمردمشهد,قالی زمرد مشهد,"[0.6216100066688267, 0.5564919890738858, 0.551..."
4,پوکو x 3 pro,پوکو x 3 pro,"[0.8899441920423984, 0, 0, 0.45606944103941327]"
5,xiaomi redmi note 9 s,xiaomi redmi note 9 s,"[0.5709551046579694, 0.5892773172308707, 0.571..."
6,4000 d,4000 d,"[1.0, 0]"
7,لباس انا,لباس انا,"[0.5710109318623898, 0.8209424557748523]"
8,سنگ چشم ببر,سنگ چشم ببر,"[0.5088307384548557, 0.4944385358457975, 0.704..."
9,گلگیر جلو پژو 405 شرکتی,گلگیر جلو پژو 405 شرکتی,"[0.5538573951207876, 0.3678219081081493, 0.381..."


In [None]:
a = [2,3,4]
a = [element * 2 for element in a]
max(a)

8

In [None]:
def x100 (L):
  """Rescale the weights in ``L`` so that they add up to 100.

  Returns a new list; ``L`` is not modified. An empty list gives an empty
  list. If all weights are 0 there is nothing to scale by, so the 100 points
  are shared equally between the entries instead of dividing by zero.
  """
  total = 0
  for weight in L:
    total += weight

  if total == 0:
    return [100 / len(L) for _ in L]

  return [weight * 100 / total for weight in L]

In [None]:
queries_test_offline_normalized['3'] = queries_test_offline_normalized['3'].apply(x100)

In [None]:
queries_test_offline_normalized[30:40]

Unnamed: 0,0,2,3
30,لیدوکایین,لیدوکایین,[100.0]
31,چسب ترمیم شیشه,چسب ترمیم شیشه,"[31.66854253864878, 39.81686379526179, 28.5145..."
32,جارو برقی lg,جارو برقی lg,"[37.26937893010778, 28.25724047418974, 34.4733..."
33,گوشی a 52 5 g,گوشی a 52 5 g,"[35.865937549859446, 0.0, 64.13406245014056, 0..."
34,هندزفری سیم دار,هندزفری سیم دار,"[40.53479207916956, 32.10032500612229, 27.3648..."
35,اسباب بازز,اسباب با ز,"[59.1413623833811, 40.85863761661891, 0.0]"
36,بازی پی اس فر,بازی پی اس فر,"[20.737757795524587, 24.054700460799495, 23.99..."
37,شیاعومی نوت 8,شیا عامیانه نوت 8,"[37.39718954819271, 40.49986066924627, 22.1029..."
38,کاور نوکیا سامسونگ b 310 e,کاور نوکیا سامسونگ b 310 e,"[21.59619568769221, 27.994933072078307, 18.339..."
39,شیایومی x 3,شیایومی x 3,"[100.0, 0.0, 0.0]"


From the previous section we know how many times each word is repeated in each category. Now we apply a new function to all of the search queries, so that we can find out which category each word of a query belongs to.

In [None]:
df_listed_name.head()

Unnamed: 0,category_name,list,dic
0,آب انارگیری,"[negin, دستی, اب, انار, گیری, مدل, نگین, گیر, ...","{'negin': 1, 'دستی': 8, 'اب': 10, 'انار': 9, '..."
1,آب سردکن و تصفیه آب و لوازم جانبی,"[تصفیه, فیلتر, اب, سطلی, سوبو, a, کوچک, 928, ت...","{'تصفیه': 791, 'فیلتر': 478, 'اب': 805, 'سطلی'..."
2,آب مرکبات‌گیری,"[tefal, press, مرکبات, اب, پرتقال, zp, citrus,...","{'tefal': 1, 'press': 62, 'مرکبات': 269, 'اب':..."
3,آب معدنی,"[اب, 6, لیتری, معدنی, عددی, 5, باکس, واتا, 1, ...","{'اب': 70, '6': 17, 'لیتری': 35, 'معدنی': 66, ..."
4,آباژور و چراغ خواب,"[چراغ, خواب, نگهبان, سگ, اباژور, چراغ, lol, خو...","{'چراغ': 792, 'خواب': 760, 'نگهبان': 3, 'سگ': ..."


In [None]:
lenght_df_listed_name = len(df_listed_name['list'])
def query_category (text):
  """Return, for every word of a query, the category in which that word occurs most often.

  ``text`` is split on single spaces. For each word, the per-category word
  counts stored in ``df_listed_name['dic']`` are compared and the name of the
  category with the highest count is chosen; on a tie the first such category
  wins. A word that occurs in no category gets ''.

  The best count and category are reset for every word. Previously they were
  carried over from one word to the next, so a later word silently inherited
  the category of an earlier, more frequent word.

  Returns a list with one category name per word.
  """
  # Pull the two columns out once instead of indexing the frame in the inner loop.
  category_names = list(df_listed_name['category_name'])
  word_counts_per_category = list(df_listed_name['dic'])

  cat = []
  for j in text.split(" "):
    cn = 0
    catee = ''
    for category_name, word_counts in zip(category_names, word_counts_per_category):
      c = word_counts.get(j, 0)
      if c > cn:
        cn = c
        catee = category_name
    cat.append(catee)
  return cat


In [None]:
df_listed_name.head(2)


Unnamed: 0,category_name,list,dic
0,آب انارگیری,"[negin, وابمیوه, دستی, اب, hand, juicer, انار,...","{'negin': 1, 'وابمیوه': 1, 'دستی': 8, 'اب': 10..."
1,آب سردکن و تصفیه آب و لوازم جانبی,"[کوچک, اب, 928, a, فیلتر, سوبو, سطلی, تصفیه, ک...","{'کوچک': 7, 'اب': 807, '928': 1, 'a': 19, 'فیل..."


In [None]:
 df_listed_name['dic'][0]['negin']

1

In [None]:
Er()
################################################################################ehra nashavad
lenght_df_listed_name = len(df_listed_name['list'])

def query_category2 (text , P):
  """Return the category in which the highest-priority word of a query occurs most often.

  ``text`` is split on single spaces and ``P`` holds one priority weight per
  word. The word with the largest weight (the first one on a tie) is looked up
  in the per-category word counts of ``df_listed_name['dic']``; the name of
  the category with the highest count is returned, or '' if the word occurs
  in no category.

  NOTE(review): the original cell was disabled and could not run (it used the
  undefined names ``number_list`` and ``p`` and printed every category without
  comparing counts). The selection rule here mirrors ``query_category`` -
  confirm that this is the intended behaviour.
  """
  T = text.split(" ")
  max_index = list(P).index(max(P))   # position of the most important word
  cat = T[max_index]

  cn = 0
  catee = ''
  for category_name, word_counts in zip(df_listed_name['category_name'], df_listed_name['dic']):
    c = word_counts.get(cat, 0)
    if c > cn:
      cn = c
      catee = category_name
  return catee

In [None]:
queries_test_offline_normalized.head(2)

Unnamed: 0,0,2,3
0,تلویزیون 40 ایتچ,تلویزیون 40 اینچ,"[37.35209843269948, 31.819516639317843, 30.828..."
1,قهوه ساز delongi,قهوه ساز delongi,"[23.177961545819624, 24.218517755677862, 52.60..."


In [None]:
queries_test_offline_normalized['4'] = queries_test_offline_normalized['2'].apply(query_category)

In [None]:
queries_test_offline_normalized.head()

Unnamed: 0,0,2,3,4
0,تلویزیون 40 ایتچ,تلویزیون 40 اینچ,"[37.35209843269948, 31.819516639317847, 30.828...","[سایر تلویزیون و لوازم جانبی, سایر تلویزیون و ..."
1,قهوه ساز delongi,قهوه ساز delongi,"[23.177961545819624, 24.218517755677862, 52.60...","[قهوه, قهوه, قهوه]"
2,بند فلزی mi band 5,بند فلزی mi band 5,"[23.059650310254522, 23.06907150200736, 25.170...","[لوازم جانبی ساعت های معمولی و هوشمند, ماشین، ..."
3,قالی زمردمشهد,قالی زمرد مشهد,"[35.9439031562781, 32.17852664520085, 31.87757...","[فرش, انگشتر, انگشتر]"
4,پوکو x 3 pro,پوکو x 3 pro,"[66.11702661620122, 0.0, 0.0, 33.88297338379879]","[کیف و کاور گوشی و تبلت, کیف و کاور گوشی و تبل..."


In [None]:
queries_test_offline_normalized.to_csv('queries_tfidf_finall2.csv')

In [None]:
!cp queries_tfidf_finall2.csv  /content/drive/MyDrive/data_days

### Now we use the base algorithm

In [None]:
searches_merged_train.head(2)

Unnamed: 0,raw_query,raw_query_normalized,results,result_counts,pages,page_counts,clicks,click_counts
0,گوشی موبایل,گوشی موبایل,"[mmmbf, mmmmq, mmmbn, mmmbi, mmmbg, mmmbo, mmm...","[1783, 1765, 1753, 1731, 1725, 1662, 1607, 148...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 12,...","[2230, 1060, 721, 525, 388, 269, 199, 142, 119...","[mmmbf, mmmbn, mmmbg, mmmbi, mmmmq, mmmbc, mmm...","[441, 369, 314, 286, 272, 245, 243, 229, 185, ..."
1,گوشی,گوشی,"[mmmbf, mmmmq, mmmbn, mmmbi, mmmbg, mmmbo, mmm...","[10878, 10827, 10770, 10716, 10710, 10608, 103...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14,...","[12352, 5683, 3823, 2653, 2070, 1291, 1039, 79...","[mmmbf, mmmbn, mmmmq, mmmbi, mmmbg, mmmbo, mmm...","[3184, 2749, 1819, 1727, 1709, 1485, 1336, 128..."


In [None]:
base_products_normalized.head(2)

Unnamed: 0,_id,category_name,sellers,product_name,product_name_normalized
0,yxwrs,ویدیو پروژکتور اپتما,[{'name1': 'پروژکتور Optoma HD142X 1080p 3000 ...,1920x1080 HD مدل سینمای ویدئو DLP اصلی Video خ...,1920 x 1080 hd مدل سینمای ویدیو dlp اصلی video...
1,mbeby,تقویت کننده مژه و ابرو,"[{'name1': 'ژل حالت دهنده ابرو و مژه essence',...",حجم کننده (essence) Mascara) ژل ESSENCE and MA...,حجم کننده essence mascara ژل essence and masca...


In [None]:
base_products_normalized['sellers']  [0]

[{'availability': True,
  'date_added': '2018-06-10T17:27:45.418776+00:00',
  'name1': 'پروژکتور Optoma HD142X 1080p 3000 Lumens 3D DLP صفحه اصلی سینمای خانگی',
  'name2': 'Optoma HD142X 1080p 3000 Lumens 3D DLP Home Theater Projector',
  'price': 31852600},
 {'availability': False,
  'date_added': '2018-05-24T16:59:16.465159+00:00',
  'name1': 'ویدئو پروژکتور اوپتوما optoma hd142x : خانگی، 3D، رزولوشن 1920x1080 HD',
  'name2': '',
  'price': 12645000},
 {'availability': False,
  'date_added': '2019-02-17T00:49:49.573764+00:00',
  'name1': 'Video Projector Optoma HD142X',
  'name2': 'دیتا پروژکتور اپتما HD142X',
  'price': 12800000},
 {'availability': False,
  'date_added': '2019-09-24T19:29:08.728217+00:00',
  'name1': 'ویدئو پروژکتور اپتما OPTOMA HD142X',
  'name2': '',
  'price': 15900000},
 {'availability': False,
  'date_added': '2020-01-25T20:08:23.497684+00:00',
  'name1': 'پروژکتور اوپتوما مدل HD142X',
  'name2': '',
  'price': 12814000},
 {'availability': False,
  'date_added'

In [None]:
base_products_normalized['sellers'][0][0]['price']

31852600

In [None]:
# Price of the first seller entry of every product, stored as a new column.
_price= []
price = [sellers[0]['price'] for sellers in base_products_normalized['sellers']]
base_products_normalized['price'] = price

In [None]:
# Date on which the first seller entry of every product was added, cut down to
# 'YYYY-MM-DDTHH:MM:SS' (the first 19 characters). Slicing at date.find('.')
# dropped the last character whenever a timestamp had no fractional seconds,
# because find() returns -1 in that case.
date_added = [
    sellers[0]['date_added'][:19]
    for sellers in base_products_normalized['sellers']
]
base_products_normalized['date_added'] = date_added

In [None]:
base_products_normalized.head(2)

Unnamed: 0,_id,category_name,sellers,product_name,product_name_normalized,price,date_added
0,yxwrs,ویدیو پروژکتور اپتما,[{'name1': 'پروژکتور Optoma HD142X 1080p 3000 ...,1920x1080 HD مدل سینمای ویدئو DLP اصلی Video خ...,1920 x 1080 hd مدل سینمای ویدیو dlp اصلی video...,31852600,2018-06-10T17:27:45
1,mbeby,تقویت کننده مژه و ابرو,"[{'name1': 'ژل حالت دهنده ابرو و مژه essence',...",حجم کننده (essence) Mascara) ژل ESSENCE and MA...,حجم کننده essence mascara ژل essence and masca...,60000,2020-06-29T20:02:26


In [None]:
from datetime import datetime

date_time_str = '2018-06-10T17:27:45'
#date_time_str = '18/09/19 01:55:19'

date_time_obj = datetime.strptime(date_time_str, '%Y-%m-%dT%H:%M:%S')
timestamp = datetime.timestamp(date_time_obj)

timestamp

1528651665.0

In [None]:
def str_date (text) :
  """Convert a 'YYYY-MM-DDTHH:MM:SS' string (or a datetime) to a POSIX timestamp.

  Args:
    text: either a string in the '%Y-%m-%dT%H:%M:%S' format or an already
      parsed ``datetime`` object.

  Returns:
    float: seconds since the epoch. Naive values are interpreted in the
    machine's local timezone, as ``datetime.timestamp`` does.

  Raises:
    ValueError: if a string does not match the expected format.
  """
  # datetime.timestamp() needs a datetime instance; calling it on the raw string
  # (parsing was commented out) raises TypeError.
  if isinstance(text, datetime):
    moment = text
  else:
    moment = datetime.strptime(text, '%Y-%m-%dT%H:%M:%S')
  return datetime.timestamp(moment)

In [None]:
base_products_normalized['date_added'] = base_products_normalized['date_added'].apply(str_date)

In [None]:
base_products_normalized.head(2)

Unnamed: 0,_id,category_name,sellers,product_name,product_name_normalized,price,date_added
0,yxwrs,ویدیو پروژکتور اپتما,[{'name1': 'پروژکتور Optoma HD142X 1080p 3000 ...,1920x1080 HD مدل سینمای ویدئو DLP اصلی Video خ...,1920 x 1080 hd مدل سینمای ویدیو dlp اصلی video...,31852600,1528652000.0
1,mbeby,تقویت کننده مژه و ابرو,"[{'name1': 'ژل حالت دهنده ابرو و مژه essence',...",حجم کننده (essence) Mascara) ژل ESSENCE and MA...,حجم کننده essence mascara ژل essence and masca...,60000,1593461000.0


In [None]:
search_click_merge_2 = pd.read_csv('/content/drive/MyDrive/data_days/search_click_merge_2.csv')

In [None]:
search_click_merge_2.drop(['Unnamed: 0'] , axis = 1 , inplace=True)

In [None]:
searches_merged_train.head(2)

Unnamed: 0,raw_query,raw_query_normalized,results,result_counts,pages,page_counts,clicks,click_counts
0,گوشی موبایل,گوشی موبایل,"[mmmbf, mmmmq, mmmbn, mmmbi, mmmbg, mmmbo, mmm...","[1783, 1765, 1753, 1731, 1725, 1662, 1607, 148...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 12,...","[2230, 1060, 721, 525, 388, 269, 199, 142, 119...","[mmmbf, mmmbn, mmmbg, mmmbi, mmmmq, mmmbc, mmm...","[441, 369, 314, 286, 272, 245, 243, 229, 185, ..."
1,گوشی,گوشی,"[mmmbf, mmmmq, mmmbn, mmmbi, mmmbg, mmmbo, mmm...","[10878, 10827, 10770, 10716, 10710, 10608, 103...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14,...","[12352, 5683, 3823, 2653, 2070, 1291, 1039, 79...","[mmmbf, mmmbn, mmmmq, mmmbi, mmmbg, mmmbo, mmm...","[3184, 2749, 1819, 1727, 1709, 1485, 1336, 128..."


In [None]:
search_click_merge_2.head()

Unnamed: 0,raw_query_normalized,r_c_merged,cl_c_merged
0,0 خرید لوازم اشپزخونه اسباب بازی,"[('mchww', 1), ('mkmiq', 1), ('mksxg', 1), ('m...","[('mmisn', 2), ('mmmze', 2), ('mrpgi', 2), ('m..."
1,088,"[('ktxjg', 1), ('ktxji', 1), ('ktxjo', 1), ('m...","[('yqknb', 4), ('mkatq', 2), ('mkztd', 2), ('m..."
2,1 core samsung,"[('bbsjn', 2), ('ubsur', 2), ('upnlg', 2), ('u...","[('mmmkv', 4), ('upnlg', 2), ('mcpdi', 1), ('m..."
3,1 more,"[('magmu', 96), ('maobx', 96), ('magms', 94), ...","[('maobx', 46), ('maajh', 25), ('magmg', 21), ..."
4,1 more true stylish,"[('maajh', 1), ('magmg', 1), ('maohv', 1), ('m...","[('magmg', 4), ('yyirq', 4), ('mzawc', 2)]"


In [None]:
type(search_click_merge_2.iloc[0,1])

str

In [None]:
eval(search_click_merge_2.iloc[0,1])[0]

('mchww', 1)

In [None]:
def str_lis(text) :
  """Parse the textual repr of a Python literal (e.g. "[('id', 1)]") back into an object.

  The list columns are stored in the CSV as their ``repr``; this turns one cell
  back into the list it came from.

  Raises:
    ValueError / SyntaxError: if ``text`` is not a plain Python literal.
  """
  # literal_eval only accepts literals, so a crafted CSV cell cannot execute
  # arbitrary code the way eval() would.
  import ast
  return ast.literal_eval(text)
search_click_merge_2['r_c_merged'] = search_click_merge_2['r_c_merged'].apply(str_lis)
search_click_merge_2['cl_c_merged'] = search_click_merge_2['cl_c_merged'].apply(str_lis)

In [None]:
search_click_merge_2.head()

Unnamed: 0,raw_query_normalized,r_c_merged,cl_c_merged
0,0 خرید لوازم اشپزخونه اسباب بازی,"[(mchww, 1), (mkmiq, 1), (mksxg, 1), (mmbnf, 1...","[(mmisn, 2), (mmmze, 2), (mrpgi, 2), (mylai, 2..."
1,088,"[(ktxjg, 1), (ktxji, 1), (ktxjo, 1), (mcick, 1...","[(yqknb, 4), (mkatq, 2), (mkztd, 2), (mkatt, 1)]"
2,1 core samsung,"[(bbsjn, 2), (ubsur, 2), (upnlg, 2), (uxtdk, 2...","[(mmmkv, 4), (upnlg, 2), (mcpdi, 1), (mmmzn, 1..."
3,1 more,"[(magmu, 96), (maobx, 96), (magms, 94), (maohl...","[(maobx, 46), (maajh, 25), (magmg, 21), (magmu..."
4,1 more true stylish,"[(maajh, 1), (magmg, 1), (maohv, 1), (mazlc, 1...","[(magmg, 4), (yyirq, 4), (mzawc, 2)]"


In [None]:
search_click_merge_2[search_click_merge_2['raw_query_normalized'] =='گوشی موبایل' ]

Unnamed: 0,raw_query_normalized,r_c_merged,cl_c_merged
84261,گوشی موبایل,"[('mmmbf', 1808), ('mmmmq', 1790), ('mmmbn', 1...","[('mmmbf', 443), ('mmmbn', 374), ('mmmbg', 315..."


In [None]:
search_click_merge_2.iloc[0,1][0][0]

'mchww'

In [None]:
base_products_normalized.head(2)

Unnamed: 0,_id,category_name,sellers,product_name,product_name_normalized,price,date_added
0,yxwrs,ویدیو پروژکتور اپتما,[{'name1': 'پروژکتور Optoma HD142X 1080p 3000 ...,1920x1080 HD مدل سینمای ویدئو DLP اصلی Video خ...,1920 x 1080 hd مدل سینمای ویدیو dlp اصلی video...,31852600,1528652000.0
1,mbeby,تقویت کننده مژه و ابرو,"[{'name1': 'ژل حالت دهنده ابرو و مژه essence',...",حجم کننده (essence) Mascara) ژل ESSENCE and MA...,حجم کننده essence mascara ژل essence and masca...,60000,1593461000.0


In [None]:
base_products_normalized['product_name_normalized'] = base_products_normalized['product_name_normalized'].apply(eli_dup)
base_products_normalized['product_name'] = base_products_normalized['product_name'].apply(eli_dup)

In [None]:
base_products_normalized.to_csv("base_products_normalized_finall.csv")
!cp base_products_normalized_finall.csv /content/drive/MyDrive/data_days

In [None]:
base_products_normalized.head()

Unnamed: 0,_id,category_name,sellers,product_name,product_name_normalized,price,date_added
0,yxwrs,ویدیو پروژکتور اپتما,[{'name1': 'پروژکتور Optoma HD142X 1080p 3000 ...,پروژکتور دیتا HD142X ویدئو 3000 3D، Video opto...,پروژکتور دیتا video 3000 3 dlp optoma 142 خانگ...,31852600,1528652000.0
1,mbeby,تقویت کننده مژه و ابرو,"[{'name1': 'ژل حالت دهنده ابرو و مژه essence',...",brow Brow مژه (essence) BROW گرم و اورجینال ma...,brow مژه گرم و اورجینال mascara کننده miss and...,60000,1593461000.0
2,yovgc,مام و اسپری,"[{'name1': 'استیک مردانه وی ایت رکسونا V8', 'n...",وی REXONA گرم تعریق ایت Rexona دئودرانت Tuning...,وی deodorant گرم تعریق ایت men دیودرانت وزن مد...,0,1588267000.0
3,yokff,مام و اسپری,[{'name1': 'مام صابونی رکسونا زنانه کاتون درای...,میلی Women کاتن کاتون لیتر گرم (Rexona تعریق م...,میلی deodorant کاتن کاتون لیتر گرم تعریق ml ما...,49000,1586876000.0
4,uduyq,گوشی هوآوی,"[{'name1': 'Huawei Ascend Y550', 'name2': 'Hua...",هوآوی اسند ASCEND Huawei Ascend موبایل Y550 گو...,اسند y موبایل ascend گوشی مدل huawei g وای هوا...,0,1493062000.0


In [None]:
search_click_merge_2['tf_idf_priority'] = search_click_merge_2['raw_query_normalized'].apply(tfidf_priority)

In [None]:
def x100 (L, scale=100):
  """Rescale the numbers in ``L`` so that they add up to ``scale`` (100 by default).

  Args:
    L: iterable of numbers (re-iterated, so pass a list/tuple).
    scale: value the rescaled elements should sum to.

  Returns:
    list: ``element * scale / sum(L)`` for every element; an empty list for
    empty input, and all zeros when the elements sum to zero.
  """
  # Explicit accumulator (the original shadowed the builtin ``sum``).
  total = 0
  for value in L:
    total += value

  if total == 0:
    # Nothing to normalise against; previously a non-empty all-zero list
    # raised ZeroDivisionError and aborted the whole .apply().
    return [0.0 for _ in L]

  return [element * scale / total for element in L]

In [None]:
search_click_merge_2['tf_idf_priority'] = search_click_merge_2['tf_idf_priority'].apply(x100)

In [None]:
search_click_merge_2.head(3)

Unnamed: 0,raw_query_normalized,r_c_merged,cl_c_merged,tf_idf_priority
0,0 خرید لوازم اشپزخونه اسباب بازی,"[(mchww, 1), (mkmiq, 1), (mksxg, 1), (mmbnf, 1...","[(mmisn, 2), (mmmze, 2), (mrpgi, 2), (mylai, 2...","[0.0, 0.0, 22.89784759305035, 38.6704375708426..."
1,088,"[(ktxjg, 1), (ktxji, 1), (ktxjo, 1), (mcick, 1...","[(yqknb, 4), (mkatq, 2), (mkztd, 2), (mkatt, 1)]",[100.0]
2,1 core samsung,"[(bbsjn, 2), (ubsur, 2), (upnlg, 2), (uxtdk, 2...","[(mmmkv, 4), (upnlg, 2), (mcpdi, 1), (mmmzn, 1...","[0.0, 56.93819411118658, 43.06180588881341]"


In [None]:
search_click_merge_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86535 entries, 0 to 86534
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   raw_query_normalized  86535 non-null  object
 1   r_c_merged            86535 non-null  object
 2   cl_c_merged           86535 non-null  object
 3   tf_idf_priority       86535 non-null  object
 4   tf-idf_priority       86535 non-null  object
dtypes: object(5)
memory usage: 3.3+ MB


In [None]:
search_click_merge_2.to_csv('search_click_merge_2_finall2.csv')
!cp search_click_merge_2_finall2.csv /content/drive/MyDrive/data_days

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
search_click_merge_2 = pd.read_csv('/content/drive/MyDrive/data_days/search_click_merge_2_finall2.csv')
search_click_merge_2.drop(['Unnamed: 0'] , axis = 1 , inplace = True)

In [None]:
search_click_merge_2.head(2)

Unnamed: 0,raw_query_normalized,r_c_merged,cl_c_merged,tf_idf_priority,tf-idf_priority
0,0 خرید لوازم اشپزخونه اسباب بازی,"[('mchww', 1), ('mkmiq', 1), ('mksxg', 1), ('m...","[('mmisn', 2), ('mmmze', 2), ('mrpgi', 2), ('m...","[0, 0, 0.4355138004489335, 0.7355062157288711,...","[0.0, 0.0, 22.89784759305035, 38.6704375708426..."
1,088,"[('ktxjg', 1), ('ktxji', 1), ('ktxjo', 1), ('m...","[('yqknb', 4), ('mkatq', 2), ('mkztd', 2), ('m...",[1.0],[100.0]


In [None]:
def str_lis(text) :
  """Parse the textual repr of a Python literal (e.g. "[('id', 1)]") back into an object.

  Columns holding lists come back from ``read_csv`` as strings; this restores
  one cell to the list it was saved from.

  Raises:
    ValueError / SyntaxError: if ``text`` is not a plain Python literal.
  """
  # literal_eval only accepts literals, so a crafted CSV cell cannot execute
  # arbitrary code the way eval() would.
  import ast
  return ast.literal_eval(text)
search_click_merge_2['r_c_merged'] = search_click_merge_2['r_c_merged'].apply(str_lis)
search_click_merge_2['cl_c_merged'] = search_click_merge_2['cl_c_merged'].apply(str_lis)
search_click_merge_2['tf_idf_priority'] = search_click_merge_2['tf_idf_priority'].apply(str_lis)


In [None]:
base_products_normalized = pd.read_csv('/content/drive/MyDrive/data_days/base_products_normalized_finall.csv')
base_products_normalized.drop(['Unnamed: 0'] , axis = 1 , inplace = True)

In [None]:
base_products_normalized.head(2)

Unnamed: 0,_id,category_name,sellers,product_name,product_name_normalized,price,date_added
0,yxwrs,ویدیو پروژکتور اپتما,[{'name1': 'پروژکتور Optoma HD142X 1080p 3000 ...,پروژکتور دیتا HD142X ویدئو 3000 3D، Video opto...,پروژکتور دیتا video 3000 3 dlp optoma 142 خانگ...,31852600,1528652000.0
1,mbeby,تقویت کننده مژه و ابرو,"[{'name1': 'ژل حالت دهنده ابرو و مژه essence',...",brow Brow مژه (essence) BROW گرم و اورجینال ma...,brow مژه گرم و اورجینال mascara کننده miss and...,60000,1593461000.0


In [None]:
base_products_normalized.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1769291 entries, 0 to 1769290
Data columns (total 7 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   _id                      object 
 1   category_name            object 
 2   sellers                  object 
 3   product_name             object 
 4   product_name_normalized  object 
 5   price                    int64  
 6   date_added               float64
dtypes: float64(1), int64(1), object(5)
memory usage: 94.5+ MB


In [None]:
base_products_normalized[base_products_normalized['_id'] == 'mchww'].iloc[0,5]

193000

In [None]:
search_click_merge_2.iloc[0]['raw_query_normalized']

'0 خرید لوازم اشپزخونه اسباب بازی'

In [None]:
search_click_merge_2.iloc[247:249]

Unnamed: 0,raw_query_normalized,r_c_merged,cl_c_merged,tf_idf_priority,tf-idf_priority
247,5 3,"[(mjsnl, 20), (mmbdb, 20), (mmanm, 17), (mmugi...","[(mmbdb, 23), (mmyib, 9), (mmbrs, 2), (mmanm, ...","[56.52173913043479, 43.47826086956522]","[56.52173913043479, 43.47826086956522]"
248,5 g,"[(mmmfl, 941), (mmmmp, 940), (mmytg, 926), (mm...","[(mmmau, 576), (mmmmp, 373), (mmmfl, 307), (mm...","[56.52173913043479, 43.47826086956522]","[56.52173913043479, 43.47826086956522]"


In [None]:
base_products_normalized.head()

Unnamed: 0,_id,category_name,sellers,product_name,product_name_normalized,price,date_added
0,yxwrs,ویدیو پروژکتور اپتما,[{'name1': 'پروژکتور Optoma HD142X 1080p 3000 ...,پروژکتور دیتا HD142X ویدئو 3000 3D، Video opto...,پروژکتور دیتا video 3000 3 dlp optoma 142 خانگ...,31852600,1528652000.0
1,mbeby,تقویت کننده مژه و ابرو,"[{'name1': 'ژل حالت دهنده ابرو و مژه essence',...",brow Brow مژه (essence) BROW گرم و اورجینال ma...,brow مژه گرم و اورجینال mascara کننده miss and...,60000,1593461000.0
2,yovgc,مام و اسپری,"[{'name1': 'استیک مردانه وی ایت رکسونا V8', 'n...",وی REXONA گرم تعریق ایت Rexona دئودرانت Tuning...,وی deodorant گرم تعریق ایت men دیودرانت وزن مد...,0,1588267000.0
3,yokff,مام و اسپری,[{'name1': 'مام صابونی رکسونا زنانه کاتون درای...,میلی Women کاتن کاتون لیتر گرم (Rexona تعریق م...,میلی deodorant کاتن کاتون لیتر گرم تعریق ml ما...,49000,1586876000.0
4,uduyq,گوشی هوآوی,"[{'name1': 'Huawei Ascend Y550', 'name2': 'Hua...",هوآوی اسند ASCEND Huawei Ascend موبایل Y550 گو...,اسند y موبایل ascend گوشی مدل huawei g وای هوا...,0,1493062000.0


In [None]:
lengh

86535

In [None]:
# Build the training table: for every query, take its first `sample` clicked products
# and record the product's name, price, date added, click count and the
# query/name conformity score.
lengh = len(search_click_merge_2['raw_query_normalized'])
#lengh = 10
sample =2

# Index the catalogue by product id ONCE. The previous version filtered the whole
# catalogue with a boolean mask for every clicked product, which is what made this
# cell run for hours. Keeping the first row per id mirrors the old `.iloc[0, ...]`.
products_by_id = (
    base_products_normalized
    .drop_duplicates(subset='_id', keep='first')
    .set_index('_id')
)

df = {"raw_query_normalized":[],"product_name":[] ,"product_id":[] ,"Conformity":[],"number":[] ,"price":[] ,"time_added":[] }
# Clicked ids that are absent from the catalogue are skipped and collected here.
missing_product_ids = []

queries = search_click_merge_2.iloc[:lengh]
rows = zip(queries['raw_query_normalized'], queries['cl_c_merged'], queries['tf_idf_priority'])
for query, clicked, tf_idf in tqdm(rows, total=len(queries)):
  # each entry of `clicked` is indexed as (product_id, click_count)
  for product in clicked[:sample]:
    product_id = product[0]
    if product_id not in products_by_id.index:
      missing_product_ids.append(product_id)
      continue

    # Compute everything before appending so the column lists can never end up
    # with different lengths if a lookup or the scoring raises.
    product_name = products_by_id.at[product_id, 'product_name_normalized']
    p = products_by_id.at[product_id, 'price']
    D = products_by_id.at[product_id, 'date_added']
    per = list_text_comp(product_name ,query,tf_idf )

    df['product_id'].append(product_id)
    df["raw_query_normalized"].append(query)
    df['number'].append(product[1])
    df['price'].append(p)
    df['time_added'].append(D)
    df["product_name"].append(product_name)
    df["Conformity"].append(per)



 66%|██████▌   | 56941/86535 [3:09:37<1:44:59,  4.70it/s]

In [None]:
df_2 = pd.DataFrame.from_dict(df )  

In [None]:
df_2.to_csv("train_data2.csv")
!cp train_data2.csv /content/drive/MyDrive/data_days

In [None]:
df.to_csv("train_data.csv")
!cp train_data.csv /content/drive/MyDrive/data_days

In [None]:
df.head()

Unnamed: 0,raw_query_normalized,product_name,product_id,Conformity,number,price,time_added
0,0 خرید لوازم اشپزخونه اسباب بازی,اسباب سازی خونه لگو بازی بزرگ ظرف,mmisn,0.730966,2,178000,1616528000.0
1,088,تراز مدل 088 k دیوالت لیزری dw,yqknb,1.0,4,3500000,1600753000.0
2,1 core samsung,با black 18 رجیستر گیگابایت سیمکارت رم دو simc...,mmmkv,1.400791,4,1740000,1607188000.0
3,1 more,وان سیم هندزفری comfobuds مدل true ess wireles...,maobx,1.0,46,888000,1618845000.0
4,1 more true stylish,earbuds colorbuds headphone وان ورزش گلوبال سی...,magmg,1.727744,4,1490000,1607842000.0


In [None]:
search_click_merge_2.head(2)

Unnamed: 0,raw_query_normalized,r_c_merged,cl_c_merged,tf_idf_priority,tf-idf_priority,tf_idf_priority_normalized
0,0 خرید لوازم اشپزخونه اسباب بازی,"[(mchww, 1), (mkmiq, 1), (mksxg, 1), (mmbnf, 1...","[(mmisn, 2), (mmmze, 2), (mrpgi, 2), (mylai, 2...","[0, 0, 0.4355138004489335, 0.7355062157288711,...","[0.0, 0.0, 22.89784759305035, 38.6704375708426...","[0.0, 0.0, 0.2289784759305035, 0.3867043757084..."
1,088,"[(ktxjg, 1), (ktxji, 1), (ktxjo, 1), (mcick, 1...","[(yqknb, 4), (mkatq, 2), (mkztd, 2), (mkatt, 1)]",[1.0],[100.0],[1.0]


In [None]:
def strr ( text):
  """Return ``text`` converted to its ``str`` form."""
  as_string = str(text)
  return as_string

df['raw_query_normalized'] = df['raw_query_normalized'].apply(strr)

In [None]:
df

Unnamed: 0,raw_query_normalized,product_name,product_id,Conformity,number,price,time_added
0,0 خرید لوازم اشپزخونه اسباب بازی,اسباب سازی خونه لگو بازی بزرگ ظرف,mmisn,0.730966,2,178000,1.616528e+09
1,088,تراز مدل 088 k دیوالت لیزری dw,yqknb,1.000000,4,3500000,1.600753e+09
2,1 core samsung,با black 18 رجیستر گیگابایت سیمکارت رم دو simc...,mmmkv,1.400791,4,1740000,1.607188e+09
3,1 more,وان سیم هندزفری comfobuds مدل true ess wireles...,maobx,1.000000,46,888000,1.618845e+09
4,1 more true stylish,earbuds colorbuds headphone وان ورزش گلوبال سی...,magmg,1.727744,4,1490000,1.607842e+09
...,...,...,...,...,...,...,...
86530,ییخچال فریزر,و مدل brb 24 w فوت بست فریزر 2410 یخچال,mmvhf,1.000000,1,7600000,1.611223e+09
86531,ییخچال فریزر دوو,2 d بای mw و رنگ دوو مدل ساید فریزر s 3340 سفی...,mjetu,1.414026,1,32900000,1.585521e+09
86532,ییسیم,طنین gp 23 مدل wireless بی واکی سیم تاکی tanin...,mznoj,0.000000,1,1270000,1.534679e+09
86533,ییونتوس,خرید 2020 2019 یوونتوس اول لباس جدید,mzooy,0.000000,4,85000,1.570265e+09


In [None]:
df.to_csv("train_data.csv")
!cp train_data.csv /content/drive/MyDrive/data_days

In [None]:
  df = pd.DataFrame.from_dict(df)  
df.head()

Unnamed: 0,raw_query_normalized,product_name,product_id,Conformity,number,price,time_added
0,0 خرید لوازم اشپزخونه اسباب بازی,اسباب سازی خونه لگو بازی بزرگ ظرف,mmisn,38.431715,2,178000,1616528000.0
1,0 خرید لوازم اشپزخونه اسباب بازی,اسباب ست لوازم و دمپایی بازی ارایشی,mmmze,61.329562,2,66000,1613644000.0
2,088,تراز مدل 088 k دیوالت لیزری dw,yqknb,100.0,4,3500000,1600753000.0
3,088,تراز مدل 088 k دیوالت لیزری dw خطی,mkatq,100.0,2,2670000,1616204000.0
4,1 core samsung,با black 18 رجیستر گیگابایت سیمکارت رم دو simc...,mmmkv,100.0,4,1740000,1607188000.0


In [None]:
df[20:25]

Unnamed: 0,raw_query_normalized,product_name,product_id,Conformity,number,price,time_added
20,10 دلار,10 سوپر تک کمیاب بانکی دلار 1995 امریکا,mwytb,1.349544,2,889999,1610611000.0
21,10 پرو,note lite mi دوربین camshield 10 pro نیلکین دا...,muaow,0.570732,1,229000,1609981000.0
22,1000 تست,تست تیزهوشان 1000 تحلیلی استعداد ششم مهروماه,mrjoq,1.405323,3,76000,1618071000.0
23,1000 تست استعداد تحلیلی,تست تیزهوشان 1000 تحلیلی استعداد ششم مهروماه,mrjoq,1.960836,5,76000,1618071000.0
24,1000 تست تحلیلی,تست تیزهوشان 1000 تحلیلی استعداد ششم مهروماه,mrjoq,1.697242,3,76000,1618071000.0


In [None]:
search_click_merge_2[20:25]

Unnamed: 0,raw_query_normalized,r_c_merged,cl_c_merged,tf_idf_priority,tf-idf_priority
20,10 دلار,"[(kmhjb, 1), (kmhjm, 1), (kmhjt, 1), (kmhju, 1...","[(mwytb, 2), (umlxs, 2), (uxuhx, 2), (yaebn, 2...","[0.4633883649752082, 0.8861553042247184]","[34.33667065030409, 65.6633293496959]"
21,10 پرو,"[(kixxz, 1), (mebvc, 1), (mhpwt, 1), (mitqj, 1...","[(muaow, 1)]","[0.5707321215278217, 0.8211363135657513]","[41.00474636379352, 58.995253636206485]"
22,1000 تست,"[(mrjgk, 2), (mrjgm, 1), (mrjgu, 1), (mrjgy, 1...","[(mrjoq, 3), (mrjgk, 2), (mrjxn, 2), (mrjgm, 1...","[0.6234984293580716, 0.7818246021890191]","[44.36691176061316, 55.63308823938683]"
23,1000 تست استعداد تحلیلی,"[(mrjcr, 3), (kanvn, 2), (mrjgm, 2), (mrjgy, 2...","[(mrjoq, 5), (mrjgm, 2), (mrjxn, 2), (kaiui, 1...","[0.3522905365601275, 0.44174835995120076, 0.58...","[17.966344469908762, 22.528573379790274, 29.98..."
24,1000 تست تحلیلی,"[(kaiui, 1), (kaiuo, 1), (kanvn, 1), (kchgn, 1...","[(mrjoq, 3), (kchgn, 2), (mrjcr, 2), (mrjgk, 2...","[0.4355340773678108, 0.5461300955135148, 0.715...","[25.661279469676185, 32.17749823956488, 42.161..."


In [None]:
def x100 (L, scale=1):
  """Rescale the numbers in ``L`` so that they add up to ``scale`` (1 by default).

  Note: despite the name, this definition normalises to a sum of 1, not 100.
  It replaces the percentage version of ``x100`` defined earlier in the notebook.

  Args:
    L: iterable of numbers (re-iterated, so pass a list/tuple).
    scale: value the rescaled elements should sum to.

  Returns:
    list: ``element * scale / sum(L)`` for every element; an empty list for
    empty input, and all zeros when the elements sum to zero.
  """
  # Explicit accumulator (the original shadowed the builtin ``sum``).
  total = 0
  for value in L:
    total += value

  if total == 0:
    # Nothing to normalise against; previously a non-empty all-zero list
    # raised ZeroDivisionError and aborted the whole .apply().
    return [0.0 for _ in L]

  return [element * scale / total for element in L]
search_click_merge_2['tf_idf_priority_normalized'] = search_click_merge_2['tf_idf_priority'].apply(x100)

In [None]:
search_click_merge_2.head()

Unnamed: 0,raw_query_normalized,r_c_merged,cl_c_merged,tf_idf_priority,tf-idf_priority,tf_idf_priority_normalized
0,0 خرید لوازم اشپزخونه اسباب بازی,"[(mchww, 1), (mkmiq, 1), (mksxg, 1), (mmbnf, 1...","[(mmisn, 2), (mmmze, 2), (mrpgi, 2), (mylai, 2...","[0, 0, 0.4355138004489335, 0.7355062157288711,...","[0.0, 0.0, 22.89784759305035, 38.6704375708426...","[0.0, 0.0, 0.2289784759305035, 0.3867043757084..."
1,088,"[(ktxjg, 1), (ktxji, 1), (ktxjo, 1), (mcick, 1...","[(yqknb, 4), (mkatq, 2), (mkztd, 2), (mkatt, 1)]",[1.0],[100.0],[1.0]
2,1 core samsung,"[(bbsjn, 2), (ubsur, 2), (upnlg, 2), (uxtdk, 2...","[(mmmkv, 4), (upnlg, 2), (mcpdi, 1), (mmmzn, 1...","[0, 0.7975853582779084, 0.6032060976653837]","[0.0, 56.93819411118658, 43.06180588881341]","[0.0, 0.5693819411118659, 0.4306180588881341]"
3,1 more,"[(magmu, 96), (maobx, 96), (magms, 94), (maohl...","[(maobx, 46), (maajh, 25), (magmg, 21), (magmu...","[0, 1.0]","[0.0, 100.0]","[0.0, 1.0]"
4,1 more true stylish,"[(maajh, 1), (magmg, 1), (maohv, 1), (mazlc, 1...","[(magmg, 4), (yyirq, 4), (mzawc, 2)]","[0, 0.5513701358672192, 0.5431145306784414, 0....","[0.0, 31.912718919683797, 31.434893243675848, ...","[0.0, 0.31912718919683797, 0.3143489324367585,..."


In [None]:
# Return a score for how well product_name matches the search query (sum of the priorities of the matched query tokens).

def list_text_comp(product_name , text , priority):
  """Score how well ``product_name`` covers the tokens of the query ``text``.

  Both strings are split on single spaces. For every query token, the weight
  ``priority[position_of_token_in_query]`` is added once per identical token
  found in the product name.

  Returns:
    The accumulated weight (``0`` when nothing matches).
  """
  query_tokens = text.split(" ")
  name_tokens = product_name.split(" ")

  score = 0
  for position, query_token in enumerate(query_tokens):
    matches = [name_token for name_token in name_tokens if name_token == query_token]
    for _ in matches:
      # the weight is looked up only on a match, exactly as before
      score += priority[position]

  return score
  
  

In [None]:
# NOTE(review): unfinished draft. The `try` below has no matching `except`/`finally`
# and the final loop only computes `counter` without using it, so this cell does not
# compile as written. A different, zero-argument `finall4` is defined later in the
# notebook and takes over the name.
def finall4(text , priority , list_df) :
  try :
    # de-duplicate the category names
    list_df = set(list_df)
    list_df = list(list_df)
    len_df=len(list_df)
    # collect the catalogue rows of every requested category
    dff = []
    for i in range(len_df):
      k = base_products_normalized[base_products_normalized['category_name'] == list_df[i]].copy()
      dff.append(k)

    dff = pd.concat(dff)
    dff.reset_index(inplace=True)
    base_products_normalized_lenght = len(dff["product_name_normalized"])
    # largest token weight; presumably meant as the match threshold (never used in this draft)
    P = max(priority)

    j =0
    counter = 0
    d=0
    c = 0 
    k = 0
    top10 = {"ID":[],"click":[],"Conformity":[]};

    # score every candidate product name against the query
    for i in range(base_products_normalized_lenght):
      counter = list_text_comp(dff["product_name_normalized"][i] , text ,priority)
      

In [None]:

def finall3(text , priority,list_df):
  """Rank the products of the given categories by how well their name matches a query.

  Args:
    text: normalized query string.
    priority: per-token weights of the query, passed on to ``list_text_comp``.
    list_df: iterable of category names (duplicates allowed) to search in.

  Returns:
    DataFrame with columns ``ID``, ``click`` and ``Conformity``, holding every
    product whose score reaches the threshold, sorted by Conformity then click
    (both descending). If anything goes wrong (no categories, empty ``priority``,
    missing column, non-string product name, ...) the query is printed and a
    single placeholder row ``('mmmmm', 0, 0)`` is returned instead.
  """
  try :
    # De-duplicate while keeping first-seen order; a set made the concatenation
    # order (and therefore the order of tied results) vary between runs.
    categories = list(dict.fromkeys(list_df))
    frames = [
        base_products_normalized[base_products_normalized['category_name'] == category]
        for category in categories
    ]
    dff = pd.concat(frames)  # raises on an empty category list -> fallback below

    # Threshold: the first token's weight for a single category, otherwise the
    # second token's weight (priority[0] would give a faster answer). One-token
    # queries have no priority[1]; previously that raised IndexError and sent
    # every such multi-category query to the fallback.
    if len(categories) == 1 or len(priority) < 2:
      P = priority[0]
    else:
      P = priority[1]

    top10 = {"ID":[],"click":[],"Conformity":[]}
    candidates = zip(dff["_id"], dff["click_number"], dff["product_name_normalized"])
    for Id, click, name in candidates:
      counter = list_text_comp(name , text ,priority)
      if counter >= P:
        # All three values are already in hand, so the lists always stay aligned.
        top10["ID"].append(Id)
        top10["click"].append(click)
        top10["Conformity"].append(counter)
  except Exception:
    # Deliberate best-effort: one bad query must not abort a multi-hour run.
    # (A bare `except:` would also swallow KeyboardInterrupt.)
    top10 = {"ID":['mmmmm'],"click":[0],"Conformity":[0]}
    print(text)

  df = pd.DataFrame.from_dict(top10)
  df.sort_values(by=["Conformity" , "click"] ,ascending=False, inplace=True)
  return df
  

  

test

In [None]:
text = "aspire 7"
list_df = queries_test_offline_normalized[queries_test_offline_normalized['2'] == text].iloc[0,3]
list_df

['قطعات داخلی لپ\u200cتاپ ایسر']

In [None]:
finall3(text , priority(text),list_df)

Unnamed: 0,ID,click,Conformity
9,kgtyp,1,100.000000
11,kgtyl,1,100.000000
93,kkxoi,0,100.000000
94,kkxig,0,100.000000
103,kjbsb,0,100.000000
...,...,...,...
384,kffut,0,56.521739
385,kffuk,0,56.521739
386,mcpwq,0,56.521739
387,kffua,0,56.521739


In [None]:
base_products_normalized[base_products_normalized['_id'] == 'mmmmm']

Unnamed: 0,_id,category_name,sellers,product_name,product_name_normalized,click_number
5563,mmmmm,گوشی شیائومی,[{'name1': 'گوشی موبایل شیائومی Redmi Note 9 M...,"3 capacity ۶۴گیگابایت رسمی 64GB شرکتی, مگاپیکس...",3 capacity رسمی مگاپیکسل polar 2003 مدل خاکستر...,1458


In [None]:
queries_test_offline_normalized.head(2)

Unnamed: 0,0,2,3,4
0,تلویزیون 40 ایتچ,تلویزیون 40 اینچ,"[42.35588972431078, 32.581453634085214, 25.062...","[سایر تلویزیون و لوازم جانبی, سایر تلویزیون و ..."
1,قهوه ساز delongi,قهوه ساز delongi,"[42.35588972431078, 32.581453634085214, 25.062...","[قهوه, قهوه, قهوه]"


In [None]:
def finall4() :
  """Run ``finall3`` for every offline test query and collect the top-10 product ids.

  Reads ``queries_test_offline_normalized`` columns '2' (normalized query),
  '3' (token priorities) and '4' (category names).

  Returns:
    DataFrame with one row per query and integer columns 0..9 holding the ten
    best product ids; rows with fewer than ten hits are padded with NaN.
  """
  n_results = 10
  offline = queries_test_offline_normalized
  triples = zip(offline['2'], offline['3'], offline['4'])

  # Collect plain rows and build the frame once: DataFrame.append inside the loop
  # copied the whole frame on every iteration and no longer exists in pandas 2.x.
  rows = []
  for query, priority, categories in tqdm(triples, total=len(offline)):
    ids = list(finall3(query, priority, categories)[0:n_results]["ID"])
    ids += [float('nan')] * (n_results - len(ids))
    rows.append(ids)

  return pd.DataFrame(rows, columns=list(range(n_results)))

In [None]:
queries_test_offline_normalized.iloc[17]

0                                  aspire 7
2                                  aspire 7
3    [56.52173913043479, 43.47826086956522]
4                 [قطعات داخلی لپ‌تاپ ایسر]
Name: 17, dtype: object

In [None]:
result = finall4()

  2%|▏         | 629/38724 [06:33<10:06:10,  1.05it/s]




  2%|▏         | 725/38724 [07:32<3:26:46,  3.06it/s]

8


  7%|▋         | 2607/38724 [26:00<5:07:56,  1.95it/s]




 10%|█         | 3899/38724 [38:44<3:12:55,  3.01it/s]




 14%|█▍        | 5542/38724 [54:51<4:11:52,  2.20it/s]

21


 15%|█▌        | 5821/38724 [57:46<2:39:30,  3.44it/s]

6


 22%|██▏       | 8573/38724 [1:25:44<3:27:23,  2.42it/s]

21


 25%|██▌       | 9779/38724 [1:38:06<2:20:51,  3.42it/s]

9


 26%|██▌       | 9894/38724 [1:39:19<5:34:47,  1.44it/s]

6


 27%|██▋       | 10538/38724 [1:45:45<2:38:37,  2.96it/s]

8


 29%|██▊       | 11123/38724 [1:51:33<2:42:28,  2.83it/s]

21


 31%|███       | 11850/38724 [1:58:33<4:10:13,  1.79it/s]




 36%|███▌      | 13880/38724 [2:19:17<2:35:56,  2.66it/s]

21


 37%|███▋      | 14292/38724 [2:23:36<4:57:15,  1.37it/s]

21


 39%|███▊      | 14934/38724 [2:29:52<2:37:45,  2.51it/s]




 42%|████▏     | 16343/38724 [2:44:06<3:30:04,  1.78it/s]

1


 42%|████▏     | 16371/38724 [2:44:21<2:37:05,  2.37it/s]




 43%|████▎     | 16513/38724 [2:45:51<3:46:52,  1.63it/s]




 45%|████▍     | 17239/38724 [2:53:07<1:41:08,  3.54it/s]

5 4


 47%|████▋     | 18068/38724 [3:01:31<1:50:18,  3.12it/s]

21


 47%|████▋     | 18291/38724 [3:03:40<1:38:14,  3.47it/s]




 48%|████▊     | 18470/38724 [3:05:30<4:21:07,  1.29it/s]




 48%|████▊     | 18662/38724 [3:07:22<3:53:01,  1.43it/s]

1


 49%|████▉     | 19060/38724 [3:11:26<2:32:33,  2.15it/s]




 53%|█████▎    | 20356/38724 [3:24:29<2:27:52,  2.07it/s]




 53%|█████▎    | 20563/38724 [3:26:36<3:29:04,  1.45it/s]

21


 55%|█████▍    | 21231/38724 [3:32:58<2:55:09,  1.66it/s]

8


 55%|█████▌    | 21444/38724 [3:35:09<2:28:44,  1.94it/s]




 56%|█████▋    | 21838/38724 [3:39:00<4:15:31,  1.10it/s]




 57%|█████▋    | 21969/38724 [3:40:18<1:22:08,  3.40it/s]




 58%|█████▊    | 22571/38724 [3:46:06<2:26:00,  1.84it/s]

8


 59%|█████▉    | 22818/38724 [3:48:35<3:30:27,  1.26it/s]




 61%|██████    | 23447/38724 [3:54:48<3:03:17,  1.39it/s]




 61%|██████    | 23672/38724 [3:56:51<1:38:46,  2.54it/s]

9


 62%|██████▏   | 23957/38724 [3:59:40<59:22,  4.15it/s]  




 62%|██████▏   | 24158/38724 [4:01:42<1:58:00,  2.06it/s]




 69%|██████▊   | 26599/38724 [4:25:45<1:50:54,  1.82it/s]




 69%|██████▉   | 26892/38724 [4:28:37<1:13:20,  2.69it/s]

21


 70%|███████   | 27251/38724 [4:32:12<1:42:47,  1.86it/s]




 77%|███████▋  | 29943/38724 [4:59:37<1:38:51,  1.48it/s]




 80%|████████  | 31102/38724 [5:10:54<1:18:28,  1.62it/s]




 85%|████████▌ | 33009/38724 [5:29:40<52:17,  1.82it/s]  

21


 89%|████████▉ | 34510/38724 [5:44:36<22:24,  3.13it/s]

6


 89%|████████▉ | 34523/38724 [5:44:43<27:25,  2.55it/s]

21


 90%|█████████ | 34914/38724 [5:48:25<42:06,  1.51it/s]

6


 92%|█████████▏| 35542/38724 [5:54:50<34:57,  1.52it/s]

6


 95%|█████████▍| 36608/38724 [6:05:10<23:45,  1.48it/s]





 95%|█████████▍| 36661/38724 [6:05:41<16:51,  2.04it/s]





 96%|█████████▌| 37231/38724 [6:11:17<07:55,  3.14it/s]

21
21


 99%|█████████▉| 38240/38724 [6:21:23<07:29,  1.08it/s]

5 4
5 4


100%|██████████| 38724/38724 [6:25:51<00:00,  1.67it/s]



In [None]:
result.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,mymqh,mgswy,mgswa,mgswb,mpiki,morhq,moetg,morhc,ylzgf,mjovc
1,mybod,myzov,mbfvx,mzghk,mbfvo,mdtxh,mdtsm,mzxuw,mwiat,mzghb
2,mkami,mkamo,mkamw,mlabv,mkhiy,mkamr,mkwxt,bmznr,mvbeh,mtcbn
3,unifg,ungic,mwpeu,mpaaq,mifgj,mosir,mwyth,mjlrz,ynlsf,ucrtw
4,mmiqz,mmipl,mibnd,mmnet,mktqn,mkdov,mnkeh,mnyzb,mkdol,mmobw


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,mymqh,mgswy,mgswa,mgswb,mpiki,morhq,moetg,morhc,ylzgf,mjovc
1,mybod,myzov,mbfvx,mzghk,mbfvo,mdtxh,mdtsm,mzxuw,mwiat,mzghb
2,mkami,mkamo,mkamw,mlabv,mkhiy,mkamr,mkwxt,bmznr,mvbeh,mtcbn
3,unifg,ungic,mwpeu,mpaaq,mifgj,mosir,mwyth,mjlrz,ynlsf,ucrtw
4,mmiqz,mmipl,mibnd,mmnet,mktqn,mkdov,mnkeh,mnyzb,mkdol,mmobw


In [None]:
def read_lowmem(path):
  """Load a JSON-lines file (one JSON document per line) into a DataFrame.

  Args:
    path: path of the file to read.

  Returns:
    DataFrame built from the list of decoded lines.
  """
  # The data contains Persian text: decode as UTF-8 explicitly instead of
  # relying on the platform's default encoding.
  with open(path, encoding="utf-8") as f:
    records = [json.loads(line) for line in tqdm(f)]
  return pd.DataFrame(records)
queries_test_offline = read_lowmem("/content/data/queries_test_offline.json")
queries_test_offline.head()

38724it [00:00, 214363.79it/s]


Unnamed: 0,0
0,تلویزیون 40 ایتچ
1,قهوه سازDelongi
2,بند فلزی mi band 5
3,قالی زمردمشهد
4,پوکوx3 pro


In [None]:
result = pd.concat([queries_test_offline[0], result], axis=1)

In [None]:
result.head()

In [None]:
result2 = result.copy()
result2 = result2.fillna('mmmmm')

In [None]:
result2 = result2.fillna('mmmmm')

In [None]:
result2

In [None]:
result2.iloc[38691]

In [None]:

#result.fillna('mmmmm')
result2.to_csv('result.csv' ,index=False ,header =False )
!cp result.csv /content/drive/MyDrive/re

In [None]:

#result.fillna('mmmmm')
#result.to_csv('result1.csv' ,index=False , header =False)
#!cp queries.csv /content/drive/MyDrive

In [None]:
!pip install git+https://github.com/RoboEpics/roboepics-client.git@d50db61a92ab14f5e95c87679b681a0836a3f6e5
from roboepics_client.roboepics_client import RoboEpicsClient

problem_id = 4  # This value is set according to your user. Do not change it please.
problem_enter_id = 189  # This value is set according to your user. Do not change it please.
roboepics_client = RoboEpicsClient(problem_id, problem_enter_id)

Collecting git+https://github.com/RoboEpics/roboepics-client.git@d50db61a92ab14f5e95c87679b681a0836a3f6e5
  Cloning https://github.com/RoboEpics/roboepics-client.git (to revision d50db61a92ab14f5e95c87679b681a0836a3f6e5) to /tmp/pip-req-build-rm_jn7iu
  Running command git clone -q https://github.com/RoboEpics/roboepics-client.git /tmp/pip-req-build-rm_jn7iu
  Running command git rev-parse -q --verify 'sha^d50db61a92ab14f5e95c87679b681a0836a3f6e5'
  Running command git fetch -q https://github.com/RoboEpics/roboepics-client.git d50db61a92ab14f5e95c87679b681a0836a3f6e5
Open this URL and confirm your login: https://fusion.roboepics.com/oauth2/device?client_id=7126a051-baea-4fe1-bdf8-fde2fdb31f97&user_code=PV6P7K
Login successful.


In [None]:
# NOTE(review): an earlier cell copies result.csv to /content/drive/MyDrive/re, not to
# /content/drive/MyDrive/ itself — confirm this path points at the freshly written file.
roboepics_client.submit("/content/drive/MyDrive/result.csv")

In [None]:
!cp result.csv /content/drive/MyDrive/re/results

In [None]:
example1 = "/content/drive/MyDrive/re/results"


In [None]:
with open(example1, "r") as file1:
    print(file1.read(10000))

In [None]:
# For queries that yield only a few selected results, the remaining slots can be filled with the most-clicked products in that category.

In [None]:
# The importance of highly clicked products within a category.