In [1]:
from gensim import corpora, models, similarities
import jieba
import re

# A. find similarity

### Read data

In [2]:
# all documents
docs= ['Dove Body Wash with Pump with skin natural nourishers Deep Moisture shower gel for instantly soft skin and lasting nourishment 1 L',
      'Aveeno Daily Moisturizing Body Wash with Soothing Oat Creamy Shower Gel (Soap Free and Dye Free/Light Fragrance), 33 Fl Oz',
      'Aveeno Skin Relief Body Wash gently cleanses while leaving itchy, dry skin feeling soothed and moisturized',
      'jason natural body wash & shower gel calming lavender 30 oz',
      'nivea nourishing care body wash with nourishing serum 20 fl oz',
      'aveeno protect + hydrate moisturizing sunscreen lotion with broad spectrum spf 30']

In [3]:
# selected document (to find similarity vs. all docs above)
doc_sim= "NIVEA MEN Active Clean Shower Gel (500mL)"

### Data preparation

**1. Remove Punctuation & Nomalization**

In [4]:
# remove special characters & number
docs= [re.sub("[^a-z ]","",doc.lower()) for doc in docs]

In [5]:
docs

['dove body wash with pump with skin natural nourishers deep moisture shower gel for instantly soft skin and lasting nourishment  l',
 'aveeno daily moisturizing body wash with soothing oat creamy shower gel soap free and dye freelight fragrance  fl oz',
 'aveeno skin relief body wash gently cleanses while leaving itchy dry skin feeling soothed and moisturized',
 'jason natural body wash  shower gel calming lavender  oz',
 'nivea nourishing care body wash with nourishing serum  fl oz',
 'aveeno protect  hydrate moisturizing sunscreen lotion with broad spectrum spf ']

In [6]:
# remove special characters & number
doc_sim= re.sub("[^a-z ]","",doc_sim.lower())

**2.Tokenization**

In [7]:
# Split into words
texts= [jieba.lcut(text) for text in docs]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\tangh\AppData\Local\Temp\jieba.cache
Loading model cost 0.805 seconds.
Prefix dict has been built successfully.


In [8]:
print(texts[:3])

[['dove', ' ', 'body', ' ', 'wash', ' ', 'with', ' ', 'pump', ' ', 'with', ' ', 'skin', ' ', 'natural', ' ', 'nourishers', ' ', 'deep', ' ', 'moisture', ' ', 'shower', ' ', 'gel', ' ', 'for', ' ', 'instantly', ' ', 'soft', ' ', 'skin', ' ', 'and', ' ', 'lasting', ' ', 'nourishment', ' ', ' ', 'l'], ['aveeno', ' ', 'daily', ' ', 'moisturizing', ' ', 'body', ' ', 'wash', ' ', 'with', ' ', 'soothing', ' ', 'oat', ' ', 'creamy', ' ', 'shower', ' ', 'gel', ' ', 'soap', ' ', 'free', ' ', 'and', ' ', 'dye', ' ', 'freelight', ' ', 'fragrance', ' ', ' ', 'fl', ' ', 'oz'], ['aveeno', ' ', 'skin', ' ', 'relief', ' ', 'body', ' ', 'wash', ' ', 'gently', ' ', 'cleanses', ' ', 'while', ' ', 'leaving', ' ', 'itchy', ' ', 'dry', ' ', 'skin', ' ', 'feeling', ' ', 'soothed', ' ', 'and', ' ', 'moisturized']]


In [9]:
# remove " "
texts= [[t for t in text if not t in [" "]] for text in texts]

In [10]:
print(texts[:3])

[['dove', 'body', 'wash', 'with', 'pump', 'with', 'skin', 'natural', 'nourishers', 'deep', 'moisture', 'shower', 'gel', 'for', 'instantly', 'soft', 'skin', 'and', 'lasting', 'nourishment', 'l'], ['aveeno', 'daily', 'moisturizing', 'body', 'wash', 'with', 'soothing', 'oat', 'creamy', 'shower', 'gel', 'soap', 'free', 'and', 'dye', 'freelight', 'fragrance', 'fl', 'oz'], ['aveeno', 'skin', 'relief', 'body', 'wash', 'gently', 'cleanses', 'while', 'leaving', 'itchy', 'dry', 'skin', 'feeling', 'soothed', 'and', 'moisturized']]


**3. Optain a dictionary of unique words**

In [11]:
# obtain number of features based on dictionary
dictionary= corpora.Dictionary(texts)
dictionary

<gensim.corpora.dictionary.Dictionary at 0x23e3c5385e0>

In [12]:
print(dictionary)
print(len(dictionary.token2id))
# 56 unique words in the documents

Dictionary(56 unique tokens: ['and', 'body', 'deep', 'dove', 'for']...)
56


In [13]:
# list of features: key is words
print(dictionary.token2id)

{'and': 0, 'body': 1, 'deep': 2, 'dove': 3, 'for': 4, 'gel': 5, 'instantly': 6, 'l': 7, 'lasting': 8, 'moisture': 9, 'natural': 10, 'nourishers': 11, 'nourishment': 12, 'pump': 13, 'shower': 14, 'skin': 15, 'soft': 16, 'wash': 17, 'with': 18, 'aveeno': 19, 'creamy': 20, 'daily': 21, 'dye': 22, 'fl': 23, 'fragrance': 24, 'free': 25, 'freelight': 26, 'moisturizing': 27, 'oat': 28, 'oz': 29, 'soap': 30, 'soothing': 31, 'cleanses': 32, 'dry': 33, 'feeling': 34, 'gently': 35, 'itchy': 36, 'leaving': 37, 'moisturized': 38, 'relief': 39, 'soothed': 40, 'while': 41, 'calming': 42, 'jason': 43, 'lavender': 44, 'care': 45, 'nivea': 46, 'nourishing': 47, 'serum': 48, 'broad': 49, 'hydrate': 50, 'lotion': 51, 'protect': 52, 'spectrum': 53, 'spf': 54, 'sunscreen': 55}


**4. Obtain corpus**

In [14]:
corpus= [dictionary.doc2bow(text) for text in texts]

In [15]:
print(corpus) # feature: number of occurence

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 2)], [(0, 1), (1, 1), (5, 1), (14, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)], [(0, 1), (1, 1), (15, 2), (17, 1), (19, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1)], [(1, 1), (5, 1), (10, 1), (14, 1), (17, 1), (29, 1), (42, 1), (43, 1), (44, 1)], [(1, 1), (17, 1), (18, 1), (23, 1), (29, 1), (45, 1), (46, 1), (47, 2), (48, 1)], [(18, 1), (19, 1), (27, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1)]]


**5. Find similarity**

In [16]:
# Use TF-IDF model to process corpus
tfidf= models.TfidfModel(corpus)

In [17]:
index= similarities.SparseMatrixSimilarity(tfidf[corpus],num_features= 56)
index

<gensim.similarities.docsim.SparseMatrixSimilarity at 0x23e409f2c40>

In [18]:
# convert search words into Spare Vectors
doc_sim_vector= dictionary.doc2bow(jieba.lcut(doc_sim))

In [19]:
# similarity calculation
sim= index[tfidf[doc_sim_vector]]

In [20]:
sim

array([0.07132261, 0.08075783, 0.        , 0.13390036, 0.3182787 ,
       0.        ], dtype=float32)

In [21]:
# print result
for i in range(len(sim)):
    print(f'doc_sim is similar to docs {[i+1]} : {sim[i]}')

doc_sim is similar to docs [1] : 0.07132260501384735
doc_sim is similar to docs [2] : 0.08075782656669617
doc_sim is similar to docs [3] : 0.0
doc_sim is similar to docs [4] : 0.13390035927295685
doc_sim is similar to docs [5] : 0.3182787001132965
doc_sim is similar to docs [6] : 0.0


# B.Implement Word2Vec model

- In real-life applications, Word2Vec models are created using billions of documents. For instance Google's Word2Vec model is trained using 3 million words and phrases.
- for the sake of simplicity, we will create a Word2Vec model using a Single Wikipedia article

### Read data

In [22]:
import bs4 as bs
import urllib.request
import re
import nltk

In [23]:
scrapped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence')
article = scrapped_data .read()

parsed_article = bs.BeautifulSoup(article,'lxml')

paragraphs = parsed_article.find_all('p')

article_text = ""

for p in paragraphs:
    article_text += p.text

In [24]:
print(article_text)



Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to natural intelligence displayed by animals including humans.
Leading AI textbooks define the field as the study of "intelligent agents": any system that perceives its environment and takes actions that maximize its chance of achieving its goals.[a]
Some popular accounts use the term "artificial intelligence" to describe machines that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving", however, this definition is rejected by major AI researchers.[b]
AI applications include advanced web search engines (e.g., Google), recommendation systems (used by YouTube, Amazon and Netflix), understanding human speech (such as Siri and Alexa), self-driving cars (e.g., Tesla), automated decision-making and competing at the highest level in strategic game systems (such as chess and Go).[2][citation needed]
As machines become increasingly capable, tasks consid

### Data Preparation

**1. Remove Punctuation & Nomalization**

In [25]:
# Normalization
processed_article = article_text.lower()

In [26]:
processed_article[:1000]

'\n\nartificial intelligence (ai) is intelligence demonstrated by machines, as opposed to natural intelligence displayed by animals including humans.\nleading ai textbooks define the field as the study of "intelligent agents": any system that perceives its environment and takes actions that maximize its chance of achieving its goals.[a]\nsome popular accounts use the term "artificial intelligence" to describe machines that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving", however, this definition is rejected by major ai researchers.[b]\nai applications include advanced web search engines (e.g., google), recommendation systems (used by youtube, amazon and netflix), understanding human speech (such as siri and alexa), self-driving cars (e.g., tesla), automated decision-making and competing at the highest level in strategic game systems (such as chess and go).[2][citation needed]\nas machines become increasingly capable, tasks

In [27]:
# Remove punctuation
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article )

In [28]:
processed_article[:1000]

'  artificial intelligence  ai  is intelligence demonstrated by machines  as opposed to natural intelligence displayed by animals including humans  leading ai textbooks define the field as the study of  intelligent agents   any system that perceives its environment and takes actions that maximize its chance of achieving its goals  a  some popular accounts use the term  artificial intelligence  to describe machines that mimic  cognitive  functions that humans associate with the human mind  such as  learning  and  problem solving   however  this definition is rejected by major ai researchers  b  ai applications include advanced web search engines  e g   google   recommendation systems  used by youtube  amazon and netflix   understanding human speech  such as siri and alexa   self driving cars  e g   tesla   automated decision making and competing at the highest level in strategic game systems  such as chess and go      citation needed  as machines become increasingly capable  tasks consi

In [29]:
# remove space
processed_article = re.sub(r'\s+', ' ', processed_article)
processed_article[:1000]

' artificial intelligence ai is intelligence demonstrated by machines as opposed to natural intelligence displayed by animals including humans leading ai textbooks define the field as the study of intelligent agents any system that perceives its environment and takes actions that maximize its chance of achieving its goals a some popular accounts use the term artificial intelligence to describe machines that mimic cognitive functions that humans associate with the human mind such as learning and problem solving however this definition is rejected by major ai researchers b ai applications include advanced web search engines e g google recommendation systems used by youtube amazon and netflix understanding human speech such as siri and alexa self driving cars e g tesla automated decision making and competing at the highest level in strategic game systems such as chess and go citation needed as machines become increasingly capable tasks considered to require intelligence are often removed 

**2.Tokenize & remove stopword**

In [30]:
# break into sentences
all_sentences = nltk.sent_tokenize(processed_article)

In [31]:
all_sentences

[' artificial intelligence ai is intelligence demonstrated by machines as opposed to natural intelligence displayed by animals including humans leading ai textbooks define the field as the study of intelligent agents any system that perceives its environment and takes actions that maximize its chance of achieving its goals a some popular accounts use the term artificial intelligence to describe machines that mimic cognitive functions that humans associate with the human mind such as learning and problem solving however this definition is rejected by major ai researchers b ai applications include advanced web search engines e g google recommendation systems used by youtube amazon and netflix understanding human speech such as siri and alexa self driving cars e g tesla automated decision making and competing at the highest level in strategic game systems such as chess and go citation needed as machines become increasingly capable tasks considered to require intelligence are often removed

In [32]:
# break into words
all_words = [nltk.word_tokenize(sent) for sent in all_sentences]
type(all_words)

list

In [33]:
len(all_words)

1

In [34]:
print(all_words[:100])

[['artificial', 'intelligence', 'ai', 'is', 'intelligence', 'demonstrated', 'by', 'machines', 'as', 'opposed', 'to', 'natural', 'intelligence', 'displayed', 'by', 'animals', 'including', 'humans', 'leading', 'ai', 'textbooks', 'define', 'the', 'field', 'as', 'the', 'study', 'of', 'intelligent', 'agents', 'any', 'system', 'that', 'perceives', 'its', 'environment', 'and', 'takes', 'actions', 'that', 'maximize', 'its', 'chance', 'of', 'achieving', 'its', 'goals', 'a', 'some', 'popular', 'accounts', 'use', 'the', 'term', 'artificial', 'intelligence', 'to', 'describe', 'machines', 'that', 'mimic', 'cognitive', 'functions', 'that', 'humans', 'associate', 'with', 'the', 'human', 'mind', 'such', 'as', 'learning', 'and', 'problem', 'solving', 'however', 'this', 'definition', 'is', 'rejected', 'by', 'major', 'ai', 'researchers', 'b', 'ai', 'applications', 'include', 'advanced', 'web', 'search', 'engines', 'e', 'g', 'google', 'recommendation', 'systems', 'used', 'by', 'youtube', 'amazon', 'and', 

In [35]:
# Removing Stop Words
from nltk.corpus import stopwords

In [36]:
stop_words = stopwords.words('english')

In [37]:
for i in range(len(all_words)):
    all_words[i] = [w for w in all_words[i] if w not in stop_words]

In [38]:
len(all_words)
#contains the list of all the words in the article

1

In [39]:
print(all_words[:100])

[['artificial', 'intelligence', 'ai', 'intelligence', 'demonstrated', 'machines', 'opposed', 'natural', 'intelligence', 'displayed', 'animals', 'including', 'humans', 'leading', 'ai', 'textbooks', 'define', 'field', 'study', 'intelligent', 'agents', 'system', 'perceives', 'environment', 'takes', 'actions', 'maximize', 'chance', 'achieving', 'goals', 'popular', 'accounts', 'use', 'term', 'artificial', 'intelligence', 'describe', 'machines', 'mimic', 'cognitive', 'functions', 'humans', 'associate', 'human', 'mind', 'learning', 'problem', 'solving', 'however', 'definition', 'rejected', 'major', 'ai', 'researchers', 'b', 'ai', 'applications', 'include', 'advanced', 'web', 'search', 'engines', 'e', 'g', 'google', 'recommendation', 'systems', 'used', 'youtube', 'amazon', 'netflix', 'understanding', 'human', 'speech', 'siri', 'alexa', 'self', 'driving', 'cars', 'e', 'g', 'tesla', 'automated', 'decision', 'making', 'competing', 'highest', 'level', 'strategic', 'game', 'systems', 'chess', 'go',

**3. Optain a dictionary of unique words**

In [40]:
from gensim.models import Word2Vec

In [41]:
word2vec = Word2Vec(all_words, min_count=2)
#A value of 2 for min_count specifies to include only those words in the Word2Vec model 
#that appear at least twice in the corpus

In [42]:
word2vec

<gensim.models.word2vec.Word2Vec at 0x23e439548b0>

In [43]:
len(word2vec.wv)
# 750 unique words that exist at least twice in the corpus

750

In [44]:
#see the dictionary of unique words that exist at least twice in the corpus
word2vec.wv.key_to_index

{'ai': 0,
 'intelligence': 1,
 'artificial': 2,
 'human': 3,
 'learning': 4,
 'problems': 5,
 'machine': 6,
 'many': 7,
 'used': 8,
 'research': 9,
 'networks': 10,
 'knowledge': 11,
 'agent': 12,
 'intelligent': 13,
 'researchers': 14,
 'also': 15,
 'neural': 16,
 'search': 17,
 'general': 18,
 'logic': 19,
 'machines': 20,
 'use': 21,
 'system': 22,
 'field': 23,
 'symbolic': 24,
 'algorithms': 25,
 'would': 26,
 'may': 27,
 'problem': 28,
 'mind': 29,
 'systems': 30,
 'humans': 31,
 'reasoning': 32,
 'data': 33,
 'could': 34,
 'mathematical': 35,
 'solve': 36,
 'computer': 37,
 'optimization': 38,
 'however': 39,
 'specific': 40,
 'goals': 41,
 'goal': 42,
 'example': 43,
 'decision': 44,
 'developed': 45,
 'number': 46,
 'based': 47,
 'behavior': 48,
 'theory': 49,
 'computing': 50,
 'even': 51,
 'algorithm': 52,
 'information': 53,
 'world': 54,
 'fiction': 55,
 'one': 56,
 'program': 57,
 'level': 58,
 'deep': 59,
 'approaches': 60,
 'recognition': 61,
 'term': 62,
 'difficult': 

**Finding Vectors for a Word**

In [45]:
v1 = word2vec.wv['artificial']

In [46]:
len(v1)

100

In [47]:
v1
#The vector v1 contains the vector representation for the word "artificial". 
#By default, a hundred dimensional vector is created by Gensim Word2Vec. 

array([-1.42290466e-03,  5.43665839e-03, -6.30422495e-03, -4.49720974e-04,
        7.86987413e-03,  3.30994185e-03, -2.59119598e-03,  9.24328342e-03,
       -1.01678325e-02,  4.57855454e-03, -6.20572083e-03, -6.68462971e-03,
        9.10018198e-03,  6.25992136e-04,  7.67299067e-03, -7.07538659e-03,
        5.52959088e-03,  7.22187571e-03, -8.77466239e-03, -1.10086938e-02,
       -5.43525489e-03, -3.96942673e-03, -3.03092739e-03, -9.17318091e-03,
        6.57282304e-03, -4.56776749e-03,  6.88859122e-03,  2.29814369e-03,
       -7.92252552e-03,  4.80014551e-03,  8.98143649e-03, -6.43738825e-03,
       -5.63387107e-03, -3.73657886e-03, -1.02189388e-02,  1.87355385e-03,
       -2.54990387e-04,  2.52499478e-04, -7.73155480e-04, -7.27296807e-03,
       -5.77847520e-03, -9.18835867e-04, -1.39808515e-03,  6.02497300e-03,
        6.08967151e-03,  2.91371578e-03,  5.33135899e-05, -2.32357741e-03,
       -3.50048952e-03, -1.44378093e-04,  2.62725679e-03, -5.62598929e-03,
       -7.28549203e-03, -

**Finding Similar Words**

In [48]:
sim_words = word2vec.wv.most_similar('intelligence')

In [49]:
sim_words
#From the output, you can see the words similar to "intelligence" along with their similarity index

[('conscious', 0.3783529996871948),
 ('may', 0.30903714895248413),
 ('rather', 0.305675208568573),
 ('system', 0.3019578158855438),
 ('like', 0.2993031442165375),
 ('general', 0.2974075675010681),
 ('intelligent', 0.2964823544025421),
 ('u', 0.2839393615722656),
 ('even', 0.2829388380050659),
 ('search', 0.2755674719810486)]