<a href="https://colab.research.google.com/github/nschantz21/semantic_search/blob/develop/notebooks/SemanticSimilarityTraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Fetch the Punkt tokenizer data that sent_tokenize() needs.
# This was commented out, so a fresh kernel had no tokenizer data to work with.
# nltk.download() reports the package as up to date when it is already present,
# so re-running this cell is harmless.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Python program to generate word vectors using Word2Vec
  
# importing all necessary modules
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
# Installs an "ignore" filter with no category restriction, so every warning
# raised after this point (including library deprecation notices) is silenced.
warnings.filterwarnings(action = 'ignore')

In [3]:
import gensim
from gensim.models import Word2Vec

In [6]:
# Read 'alice.txt'. The context manager closes the file handle as soon as the
# text has been loaded (previously the handle was left open for the session).
with open("alice.txt", "r", encoding="utf-8") as sample:
    s = sample.read()

# Replace newlines with spaces so sentences wrapped across lines stay in one piece
f = s.replace("\n", " ")

# Corpus layout passed to Word2Vec: one list of lower-cased word tokens per sentence
data = [
    [token.lower() for token in word_tokenize(sentence)]
    for sentence in sent_tokenize(f)
]

# Create CBOW model (no `sg` argument, so the library default architecture is used)
model1 = gensim.models.Word2Vec(data, min_count = 1,
                              size = 100, window = 5)

# Print results
# Similarity look-ups go through the model's KeyedVectors (`.wv`); calling
# `similarity` directly on the model object is a deprecated entry point.
print("Cosine similarity between 'alice' " +
               "and 'wonderland' - CBOW : ",
    model1.wv.similarity('alice', 'wonderland'))

print("Cosine similarity between 'alice' " +
                 "and 'machines' - CBOW : ",
      model1.wv.similarity('alice', 'machines'))

# Create Skip Gram model (sg = 1)
model2 = gensim.models.Word2Vec(data, min_count = 1, size = 100,
                                             window = 5, sg = 1)

# Print results
print("Cosine similarity between 'alice' " +
          "and 'wonderland' - Skip Gram : ",
    model2.wv.similarity('alice', 'wonderland'))

print("Cosine similarity between 'alice' " +
            "and 'machines' - Skip Gram : ",
      model2.wv.similarity('alice', 'machines'))

Cosine similarity between 'alice' and 'wonderland' - CBOW :  0.9992941
Cosine similarity between 'alice' and 'machines' - CBOW :  0.96279436
Cosine similarity between 'alice' and 'wonderland' - Skip Gram :  0.86994
Cosine similarity between 'alice' and 'machines' - Skip Gram :  0.8468063


In [8]:
# Ten nearest neighbours of "alice" according to the skip-gram vectors
model2.wv.similar_by_word("alice", topn=10)

[('thought', 0.9960631132125854),
 ('hatter', 0.990753173828125),
 ('”', 0.9900180697441101),
 ('said', 0.9899085760116577),
 ('it', 0.9868391752243042),
 ('well', 0.98663330078125),
 (';', 0.9845244884490967),
 ('but', 0.9843130111694336),
 (',', 0.9836928844451904),
 ('?', 0.9836686253547668)]

In [None]:
# imports
import os
from google.cloud import bigquery
from google.oauth2 import service_account
import json

# NOTE(review): absolute path to a service-account key on a mounted Drive; it only
# resolves in a session where that Drive is mounted — consider making it configurable.
key_file_path = "/content/drive/MyDrive/NWO Project/nwo-sample-5f8915fdc5ec.json"

# constants
# Parse the key file inside a context manager so the handle is closed right away
# (a bare `open(...).read()` leaves closing to the garbage collector).
with open(key_file_path) as key_file:
    AUTH_KEY = json.load(key_file)
# Point Google's default-credential lookup at the key file for bigquery.Client() below
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_file_path


TWITTER_DB_NAME = "nwo-sample.graph.tweets"
REDDIT_DB_NAME = "nwo-sample.graph.reddit"

client = bigquery.Client()

# Fetch table metadata for both sources (fails early if a table id is wrong)
twitter_table = client.get_table(TWITTER_DB_NAME)
reddit_table = client.get_table(REDDIT_DB_NAME)

In [73]:
# Values substituted into the template below:
#   {0} -> reddit table, {1} -> twitter table, {2} -> LIMIT on the combined result
query_args = (REDDIT_DB_NAME, TWITTER_DB_NAME, 10000)

# Both sub-selects expose the same two columns (unix_time, content), each reading
# from a TABLESAMPLE SYSTEM (1 PERCENT) of its table and dropping NULL rows.
query_template = """
(
SELECT
    reddit.created_utc as unix_time,
    reddit.body as content
FROM
    {0}  as reddit TABLESAMPLE SYSTEM (1 PERCENT)

WHERE
    reddit.created_utc is not NULL and
    reddit.body is not NULL
)
UNION ALL
(
SELECT
    -- make the twitter time stamp unix time
    UNIX_SECONDS(PARSE_TIMESTAMP("%F %X", twitter.created_at)) as unix_time,
    twitter.tweet as content
FROM
    {1} as twitter TABLESAMPLE SYSTEM (1 PERCENT) 
    
WHERE
    twitter.created_at is not NULL and
    twitter.tweet is not NULL
)
LIMIT {2}
"""

union_query = query_template.format(*query_args)

In [74]:
# Submit the query exactly once and wait for its rows.
# A bare `client.query(union_query)` ahead of this would start a second, identical
# job whose results are never read, so only the assigned call is kept.
union_query_job = client.query(union_query)
union_rows = union_query_job.result()

In [75]:
# check number of rows in the query result (the query itself ends with a LIMIT clause)
union_rows.total_rows

10000

In [76]:
# Build the training corpus from the query result.
# Each row unpacks into (unix_time, content); only the text is used. The text is
# split into sentences and every sentence becomes one list of lower-cased word tokens.
data = [
    [word.lower() for word in word_tokenize(sentence)]
    for _unix_time, content in union_rows
    for sentence in sent_tokenize(content)
]

In [77]:
# CBOW model over the tokenized corpus: 100-dimensional vectors,
# context window of 5, and no minimum-frequency cut-off (min_count=1).
model1 = gensim.models.Word2Vec(sentences=data, size=100, window=5, min_count=1)

In [78]:
# Take the KeyedVectors from the CBOW model and write them to disk
model1_vectors_path = "/content/drive/MyDrive/NWO Project/model1_vectors.kv"
word_vectors = model1.wv
word_vectors.save(model1_vectors_path)

In [79]:
# Nearest neighbours of "apple" under the CBOW vectors; show the first five
similarity_result = word_vectors.most_similar(positive=["apple"])
similarity_result[:5]

[('housing', 0.998363196849823),
 ('”', 0.9983266592025757),
 ('inside', 0.9980441331863403),
 ('firm', 0.9977648854255676),
 ('national', 0.9976845979690552)]

In [80]:
# Skip-gram model (sg=1) over the tokenized corpus: 100-dimensional vectors,
# context window of 5, and no minimum-frequency cut-off (min_count=1).
model2 = gensim.models.Word2Vec(sentences=data, sg=1, size=100, window=5, min_count=1)

In [81]:
# Take the KeyedVectors from the skip-gram model and write them to disk
model2_vectors_path = "/content/drive/MyDrive/NWO Project/model2_vectors.kv"
word_vectors_m2 = model2.wv
word_vectors_m2.save(model2_vectors_path)

In [85]:
# Show which class the vectors object (taken from model1.wv) is an instance of
type(word_vectors)

gensim.models.keyedvectors.Word2VecKeyedVectors

In [86]:
# Ten nearest neighbours of "apple" under the skip-gram vectors
similarity_result_m2 = word_vectors_m2.most_similar(positive=["apple"], topn=10)
similarity_result_m2

[('master', 0.9975461959838867),
 ('county', 0.9973716735839844),
 ('star', 0.9960933923721313),
 ('festival', 0.9957079291343689),
 ('central', 0.9952035546302795),
 ('law', 0.9950966835021973),
 ('giveaway', 0.9950230121612549),
 ('album', 0.9947937726974487),
 ('forum', 0.9946258068084717),
 ('jersey', 0.9943627119064331)]

In [93]:
# Look up the nearest neighbours of "poo" in the skip-gram vectors.
# NOTE(review): the saved output for this cell is an AttributeError. The cause is
# not visible in the notebook — confirm that word_vectors_m2 still refers to the
# KeyedVectors taken from model2, and that the cell runs on a fresh kernel.
word_vectors_m2.similar_by_word("poo")

AttributeError: ignored