In [None]:
#Search engine
def read_csv(csvsource):
    """Load a CSV source into a pandas DataFrame.

    Args:
        csvsource: Anything ``pandas.read_csv`` accepts as its first argument
            (the notebook passes a file path).

    Returns:
        The DataFrame produced by ``pandas.read_csv(csvsource)``.
    """
    # Imported inside the function, matching the other helpers in this cell.
    import pandas

    return pandas.read_csv(csvsource)

def full_cleaning(df, text_column='title'):
    """Normalise, lemmatise and stop-word-filter one text column with spaCy.

    Every value of ``df[text_column]`` is converted to ``str``, each run of
    characters other than ASCII letters and apostrophes is replaced by a single
    space, and the result is lower-cased. The strings are then run through the
    ``en_core_web_sm`` pipeline (with ``ner`` and ``parser`` disabled) and each
    resulting Doc is reduced by ``sub_cleaning``.

    Args:
        df: DataFrame holding the raw text.
        text_column: Name of the column to clean. Defaults to ``'title'``,
            which is what earlier versions of this function always used.

    Returns:
        A new DataFrame with a single ``'clean'`` column, one row per input
        row and in input order. Rows for which ``sub_cleaning`` returns
        nothing (fewer than three tokens left) hold ``None``.

    Side effects:
        Calls ``logging.basicConfig`` at INFO level and prints the elapsed
        cleaning time plus a completion message.
    """
    import logging  # Setting up the loggings to monitor gensim
    import re  # For preprocessing
    from time import time  # To time our operations

    import pandas as pd  # For data handling
    import spacy  # For preprocessing

    logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

    nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser'])
    # Lazy generator: rows are normalised only as nlp.pipe consumes them.
    brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df[text_column])
    t = time()
    # `n_threads` is deliberately not passed: it has been deprecated since
    # spaCy 2.1 (where -1 was already the default) and Language.pipe no longer
    # accepts it in spaCy 3, so omitting it works on both major versions.
    txt = [sub_cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]
    print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))
    df_clean = pd.DataFrame({'clean': txt})
    print('Finish cleaning')
    return df_clean

def sub_cleaning(doc):
    """Lemmatise a document and drop its stop words.

    Args:
        doc: Iterable of tokens exposing ``is_stop`` and ``lemma_``
            (a spaCy ``Doc`` in this notebook).

    Returns:
        The remaining lemmas joined by single spaces when more than two of
        them are left, otherwise ``None``. Very short sentences are discarded
        because Word2Vec learns from context words, so a one- or two-word
        sentence adds very little to training.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

def learn_relation_keyword(df_clean, save_paths=('keyword2vec_model.bin', '/content/keyword2vec_model.bin')):
    """Train a Word2Vec model on the cleaned text and save it to disk.

    The non-null values of ``df_clean['clean']`` are split on whitespace,
    passed through a gensim ``Phrases``/``Phraser`` bigram transform, and used
    to build the vocabulary of and train a ``Word2Vec`` model for 30 epochs.

    Args:
        df_clean: DataFrame with a ``'clean'`` column of space-separated
            tokens, as returned by ``full_cleaning``. Null rows are skipped.
        save_paths: A path or an iterable of paths the trained model is saved
            to. The default keeps the two locations earlier versions wrote to
            (the working directory and ``/content``).

    Returns:
        None. The model is only persisted to ``save_paths``.

    Raises:
        ValueError: If ``df_clean['clean']`` contains no non-null rows.

    Side effects:
        Calls ``logging.basicConfig`` at INFO level, prints timing lines and
        writes the model file(s).
    """
    import logging  # Setting up the loggings to monitor gensim
    import multiprocessing
    import os
    from time import time  # To time our operations

    from gensim.models import Word2Vec
    from gensim.models.phrases import Phrases, Phraser

    logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

    # Rows that cleaning rejected are stored as None. Without dropna(),
    # str(None).split() would inject the literal token 'None' into the corpus.
    tokenized = [str(row).split() for row in df_clean['clean'].dropna()]
    if not tokenized:
        raise ValueError("df_clean['clean'] has no non-null rows to train on")

    phrases = Phrases(tokenized, min_count=30, progress_per=10000)
    bigram = Phraser(phrases)
    sentences = bigram[tokenized]

    #train
    cores = multiprocessing.cpu_count()
    w2v_model = Word2Vec(min_count=20,
                         window=2,
                         vector_size=300,
                         sample=6e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=20,
                         # Keep one core free, but never ask for zero workers
                         # on a single-core machine.
                         workers=max(1, cores - 1))
    t = time()
    w2v_model.build_vocab(sentences, progress_per=10000)
    print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

    t = time()
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
    # Saving happens after this line so disk I/O is not counted as training time.
    print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

    if isinstance(save_paths, str):
        save_paths = (save_paths,)
    written = set()
    for path in save_paths:
        # Both default paths resolve to the same file when the working
        # directory is /content; write each distinct file only once.
        resolved = os.path.abspath(path)
        if resolved in written:
            continue
        written.add(resolved)
        w2v_model.save(path)
    print('Finish learning, save model as keyword2vec_model.bin')

def learn_relation_title(modelpath = './keyword2vec_model.bin', source_df = None):
    """Load the saved Word2Vec model and build a coordle search index.

    Args:
        modelpath: Path of the model file written by
            ``learn_relation_keyword``. Earlier versions ignored this argument
            and always loaded ``/content/keyword2vec_model.bin``; the argument
            is now honoured.
        source_df: DataFrame to index. It must contain the ``'id'`` and
            ``'title'`` columns passed to ``build_from_df``. When omitted, the
            notebook-level ``df`` is used, which is what earlier versions
            always did.

    Returns:
        The ``QueryAppenderIndex`` built from ``source_df``.
    """
    import multiprocessing

    from coordle.backend import QueryAppenderIndex
    from gensim.models import Word2Vec

    if source_df is None:
        # Backward compatibility: fall back to the global `df` defined by the
        # notebook (raises NameError if that cell has not been run).
        source_df = df

    cores = multiprocessing.cpu_count()
    w2v_model = Word2Vec.load(modelpath)
    # init_sims(replace=True) is intentionally not called: gensim 4 (the API
    # this notebook trains with, see `vector_size`) deprecates it and computes
    # vector norms on demand inside most_similar.

    # NOTE(review): n_similars=1 presumably means one similar word is appended
    # per query term - confirm against the coordle documentation.
    ai_index = QueryAppenderIndex(w2v_model.wv.most_similar, n_similars=1)

    ai_index.build_from_df(
        source_df,
        'id',
        'title',
        'title',
        verbose=True,
        use_multiprocessing=True,
        # Keep one core free, but never request zero workers.
        workers=max(1, cores - 1)
    )
    return ai_index

def search_and_show(ai_index, query, max_results = 48, max_body_length = 500):
    """Run a query against the index, print the hits and return their ids.

    Args:
        ai_index: Object whose ``search(query)`` returns a
            ``(docs, scores, errmsgs)`` triple; each doc exposes ``uid`` and
            ``title``.
        query: Search query handed to ``ai_index.search``.
        max_results: Maximum number of hits to print and return.
        max_body_length: Accepted for compatibility; not used by this function.

    Returns:
        * ``None`` when the search reported errors (they are printed).
        * The string ``'Sorry, no results found.'`` when there are no hits.
        * Otherwise a list with the ``uid`` of each printed hit, in rank order.
    """
    docs, scores, errmsgs = ai_index.search(query)

    if errmsgs:
        print('The following errors occurred:', errmsgs)
        return None

    if len(docs) == 0:
        return 'Sorry, no results found.'

    related_id_products = []
    top_hits = zip(docs[:max_results], scores[:max_results])
    for hit, relevance in top_hits:
        # One fixed-width line per hit: id, title cut/padded to 70 chars, score.
        print(f'{hit.uid}  {str(hit.title)[:70]:<70}  {relevance:.4f}')
        print('---')
        related_id_products.append(hit.uid)
    return related_id_products

In [None]:
# Load the product CSV. `df` is also read as a global by learn_relation_title,
# so this cell must run before the index is built.
df = read_csv('/content/cshop_ml_public_product.csv')

In [None]:
# Check that the installed spaCy models are compatible with the installed spaCy.
# NOTE(review): this cell sits before the install cell below - confirm the
# intended order so a fresh "Run All" validates the environment actually used.
!python -m spacy validate

⠙ Loading compatibility table...[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation: /usr/local/lib/python3.7/dist-packages/spacy[0m

TYPE      NAME             MODEL            VERSION                            
package   en-core-web-sm   en_core_web_sm   [38;5;2m2.3.1[0m   [38;5;2m✔[0m



In [None]:
# Install the NLP dependencies. spaCy is pinned to 2.3.5 together with the
# matching en_core_web_sm 2.3.1 model; nltk and pyresparser are not pinned.
# NOTE(review): consider pinning nltk/pyresparser too and moving this cell to
# the top of the notebook so a fresh run installs before anything is imported.
!pip install nltk

!pip install spacy==2.3.5

!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz

!pip install pyresparser

Collecting spacy==2.3.5
  Using cached spacy-2.3.5-cp37-cp37m-manylinux2014_x86_64.whl (10.4 MB)
Collecting catalogue<1.1.0,>=0.0.7
  Using cached catalogue-1.0.0-py2.py3-none-any.whl (7.7 kB)
Collecting thinc<7.5.0,>=7.4.1
  Using cached thinc-7.4.5-cp37-cp37m-manylinux2014_x86_64.whl (1.0 MB)
Collecting srsly<1.1.0,>=1.0.2
  Using cached srsly-1.0.5-cp37-cp37m-manylinux2014_x86_64.whl (184 kB)
Installing collected packages: srsly, catalogue, thinc, spacy
  Attempting uninstall: srsly
    Found existing installation: srsly 2.4.2
    Uninstalling srsly-2.4.2:
      Successfully uninstalled srsly-2.4.2
  Attempting uninstall: catalogue
    Found existing installation: catalogue 2.0.6
    Uninstalling catalogue-2.0.6:
      Successfully uninstalled catalogue-2.0.6
  Attempting uninstall: thinc
    Found existing installation: thinc 8.0.13
    Uninstalling thinc-8.0.13:
      Successfully uninstalled thinc-8.0.13
  Attempting uninstall: spacy
    Found existing installation: spacy 3.0.7
 

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 7.9 MB/s 


In [None]:
# Clean the `title` column of `df`, then train the Word2Vec model on the result.
# learn_relation_keyword returns nothing; it only saves the model to disk.
df_clean = full_cleaning(df)
learn_relation_keyword(df_clean)

Time to clean up everything: 0.22 mins
Finish cleaning


INFO - 02:35:42: collecting all words and their counts
INFO - 02:35:42: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 02:35:42: PROGRESS: at sentence #10000, processed 115218 words and 18431 word types
INFO - 02:35:42: PROGRESS: at sentence #20000, processed 212653 words and 50300 word types
INFO - 02:35:42: collected 62583 token types (unigram + bigrams) from a corpus of 265431 words and 25638 sentences
INFO - 02:35:42: merged Phrases<62583 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 02:35:42: Phrases lifecycle event {'msg': 'built Phrases<62583 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 0.37s', 'datetime': '2021-12-18T02:35:42.451878', 'gensim': '4.1.2', 'python': '3.7.12 (default, Sep 10 2021, 00:21:48) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'created'}
INFO - 02:35:42: exporting phrases from Phrases<62583 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
I

Time to build vocab: 0.01 mins


INFO - 02:35:44: worker thread finished; awaiting finish of 0 more threads
INFO - 02:35:44: EPOCH - 1 : training on 242969 raw words (89641 effective words) took 0.9s, 100584 effective words/s
INFO - 02:35:45: worker thread finished; awaiting finish of 0 more threads
INFO - 02:35:45: EPOCH - 2 : training on 242969 raw words (90069 effective words) took 0.9s, 103840 effective words/s
INFO - 02:35:46: worker thread finished; awaiting finish of 0 more threads
INFO - 02:35:46: EPOCH - 3 : training on 242969 raw words (90094 effective words) took 0.9s, 95072 effective words/s
INFO - 02:35:47: worker thread finished; awaiting finish of 0 more threads
INFO - 02:35:47: EPOCH - 4 : training on 242969 raw words (90072 effective words) took 0.9s, 105511 effective words/s
INFO - 02:35:47: worker thread finished; awaiting finish of 0 more threads
INFO - 02:35:47: EPOCH - 5 : training on 242969 raw words (89913 effective words) took 0.9s, 103785 effective words/s
INFO - 02:35:48: worker thread finis

Time to train the model: 0.43 mins
Finish learning, save model as keyword2vec_model.bin


In [None]:
# Install the coordle search backend (PyPI package and the GitHub source) plus
# the scispaCy en_core_sci_lg 0.2.4 model.
# NOTE(review): the saved output of this cell shows spaCy 3.0.x and its
# dependencies replacing the 2.3.5 stack pinned in the earlier install cell -
# confirm which spaCy version the pipeline is meant to run on.
!pip install coordle
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz
!pip install git+https://github.com/JonasTriki/inf368-exercise-3-coordle.git

Collecting spacy<3.1.0,>=3.0.0
  Using cached spacy-3.0.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.3 MB)
Collecting catalogue<2.1.0,>=2.0.4
  Using cached catalogue-2.0.6-py3-none-any.whl (17 kB)
Collecting thinc<8.1.0,>=8.0.3
  Using cached thinc-8.0.13-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (628 kB)
Collecting srsly<3.0.0,>=2.4.1
  Using cached srsly-2.4.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (451 kB)
Installing collected packages: catalogue, srsly, thinc, spacy
  Attempting uninstall: catalogue
    Found existing installation: catalogue 1.0.0
    Uninstalling catalogue-1.0.0:
      Successfully uninstalled catalogue-1.0.0
  Attempting uninstall: srsly
    Found existing installation: srsly 1.0.5
    Uninstalling srsly-1.0.5:
      Successfully uninstalled srsly-1.0.5
  Attempting uninstall: thinc
    Found existing installation: thinc 7.4.5
    Uninstalling thinc-7.4.5:
      Successfully uninstalled thinc-7.4.5
  Attemptin

Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz
  Using cached https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz (500.6 MB)
Collecting git+https://github.com/JonasTriki/inf368-exercise-3-coordle.git
  Cloning https://github.com/JonasTriki/inf368-exercise-3-coordle.git to /tmp/pip-req-build-oq8dhs0f
  Running command git clone -q https://github.com/JonasTriki/inf368-exercise-3-coordle.git /tmp/pip-req-build-oq8dhs0f


In [None]:
# Build the search index from the saved model (the function also reads the
# global `df`), then run a sample query. On success `products` is the list of
# matching ids returned by search_and_show.
ai_index = learn_relation_title('/content/keyword2vec_model.bin')
products = search_and_show(ai_index, 'apple')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  """Load JSON language data using the given path as a base. If the provided
INFO - 02:37:17: loading Word2Vec object from /content/keyword2vec_model.bin
INFO - 02:37:17: loading wv recursively from /content/keyword2vec_model.bin.wv.* with mmap=None
INFO - 02:37:17: setting ignored attribute cum_table to None
INFO - 02:37:17: Word2Vec lifecycle event {'fname': '/content/keyword2vec_model.bin', 'datetime': '2021-12-18T02:37:17.749418', 'gensim': '4.1.2', 'python': '3.7.12 (default, Sep 10 2021, 00:21:48) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'loaded'}


Text cleaning initilized on 1 workers


Cleaning texts: 100%|██████████| 25638/25638 [00:09<00:00, 2716.59it/s]
Adding to index: 100%|██████████| 25638/25638 [00:19<00:00, 1323.20it/s]


15417  Philodendron Apple Green                                                1.5048
---
13843  HOUZE - 6 Tier 'Apple' Knock Down Cabinet                               0.9029
---
20027  AVALON Apple Cider Vinegar Gummies 60s                                  0.9029
---
13462  HOUZE - 6 Tier 'Apple' Knock Down Cabinet                               0.9029
---
13792  HOUZE - 5 Tier 'Apple' Knock Down Cabinet                               0.9029
---
13385  HOUZE - 5 Tier 'Apple' Knock Down Cabinet                               0.9029
---
20078  21st Century Apple Cider Vinegar 90 Gummies                             0.7524
---
25590  SKINARMA Tekubi Apple Watch Strap 42/44mm                               0.7524
---
25181  SKINARMA Tekubi Apple Watch Strap 42/44mm                               0.7524
---
25216  NOMAD Rugged Strap(FKM) for Apple Watch 45mm/44mm/42mm                  0.7524
---
25631  NOMAD Rugged Strap(FKM) for Apple Watch 45mm/44mm/42mm                  0.7524
---
25543  UAG

In [None]:
# Second sample query; this rebinds `products` to the result for 'Naruto'.
products = search_and_show(ai_index, 'Naruto')

16512  Ossayi Remote Control 3D LED Light  Naruto Table Lamps 16 Color USB Ba  100.0000
---


In [None]:
# Display the value returned by the most recent search_and_show call.
# NOTE(review): the saved output below lists 48 ids starting with 15417, which
# matches the 'apple' query rather than the single 'Naruto' hit - the cells
# were probably run out of order; re-run top to bottom to refresh it.
products

[15417,
 13843,
 20027,
 13462,
 13792,
 13385,
 20078,
 25590,
 25181,
 25216,
 25631,
 25543,
 19654,
 2141,
 18845,
 23651,
 2391,
 19774,
 19365,
 643,
 1978,
 812,
 1728,
 1311,
 210,
 1061,
 116,
 18969,
 361,
 1206,
 519,
 19247,
 24448,
 1571,
 25126,
 943,
 24977,
 25392,
 354,
 24949,
 23576,
 25447,
 1929,
 2347,
 21009,
 25027,
 25592,
 1520]