# Data Processing 

In [1]:
import pandas as pd
import numpy as np
import re

In [29]:
# csv_file = "../Data/phrases.csv"
csv_file = "../Data/all_categories_and_phrases.csv"
# df = pd.read_csv(csv_file,encoding="utf-8")
df = pd.read_csv(csv_file)

In [31]:
final_data = df["Phrase"]
final_data.head()

0                  A Pile Of Coats
1       A Pile Of Coats On The Bed
2      A Shelf Full Of Knickknacks
3          A Sock With Holes In It
4    A Vase Filled With Sunflowers
Name: Phrase, dtype: object

# Embedding - Word2Vec

In [None]:
# Overview 
# --------------------------------------------------
# convert classes into vectors and  embed in a space to identify relationships between data points 
# Model is a basic 2 layer Neural Network 
# Input: parsed words from game's phrases 
# Output / Target: words that are "similar" to the input word 
# Input word will change per round 

In [None]:
# Functionality Steps 
# -----------------------------------------
# Parse phrases into words 
# Tokenize words 
# Similiarity scores: random selection
# Create & fit Word2Vec Model
# Predict words: outcome will be associted with similarity score 
# Extract letters from predicted word


In [32]:
# Dependencies 
from nltk.tokenize import sent_tokenize, word_tokenize 
import nltk
import warnings 
import gensim 
from gensim.models import Word2Vec 
import multiprocessing 

In [34]:
# Override warning notifications 
warnings.filterwarnings(action = 'ignore')

# Parse phrases into words
# words_clean = final_data["words"]
words_clean = final_data
words_clean = words_clean.replace("/n"," ")
# convert to acceptable format for tokenization
words_clean = str(words_clean)
words_clean

'0                                A Pile Of Coats\n1                     A Pile Of Coats On The Bed\n2                    A Shelf Full Of Knickknacks\n3                        A Sock With Holes In It\n4                  A Vase Filled With Sunflowers\n5                              A Vase Of Flowers\n6             A Vase Of Fresh Cut Garden Flowers\n7                                 Accent Cabinet\n8                               Accent Furniture\n9                                  Accent Window\n10                                  Address Book\n11                                Address Labels\n12                          Adjustable Shoe Rack\n13                       Adjustable Swivel Stool\n14                                    Afghan Rug\n15                            After-Shave Lotion\n16                              Air Conditioning\n17                                 Air Freshener\n18                                  Air Mattress\n19                                   Alarm Clock\

In [35]:
# Tokenize

words_tokenized = [] 
for i in sent_tokenize(words_clean): 
    temp = [] 
    
    #tokenize phrases into words 
    for j in word_tokenize(i): 
        temp.append(j.lower()) 
  
    words_tokenized.append(temp)
words_tokenized

[['0',
  'a',
  'pile',
  'of',
  'coats',
  '1',
  'a',
  'pile',
  'of',
  'coats',
  'on',
  'the',
  'bed',
  '2',
  'a',
  'shelf',
  'full',
  'of',
  'knickknacks',
  '3',
  'a',
  'sock',
  'with',
  'holes',
  'in',
  'it',
  '4',
  'a',
  'vase',
  'filled',
  'with',
  'sunflowers',
  '5',
  'a',
  'vase',
  'of',
  'flowers',
  '6',
  'a',
  'vase',
  'of',
  'fresh',
  'cut',
  'garden',
  'flowers',
  '7',
  'accent',
  'cabinet',
  '8',
  'accent',
  'furniture',
  '9',
  'accent',
  'window',
  '10',
  'address',
  'book',
  '11',
  'address',
  'labels',
  '12',
  'adjustable',
  'shoe',
  'rack',
  '13',
  'adjustable',
  'swivel',
  'stool',
  '14',
  'afghan',
  'rug',
  '15',
  'after-shave',
  'lotion',
  '16',
  'air',
  'conditioning',
  '17',
  'air',
  'freshener',
  '18',
  'air',
  'mattress',
  '19',
  'alarm',
  'clock',
  '20',
  'alarm',
  'clock',
  'with',
  'nature',
  'sounds',
  '21',
  'all-in-one',
  'home',
  'gym',
  '22',
  'all-purpose',
  'cl

In [43]:
# Evaluation 
# Skip-Gram calculation 

model_gram = gensim.models.Word2Vec(words_tokenized, min_count = 1, size = 100, window = 50, sg = 1)

# Print similarity score 
model_gram.similarity('hand','soap')

0.9995105

In [54]:
#Cosine calculation 
model_cosine = gensim.models.Word2Vec(words_tokenized, min_count = 1,  size = 150, window = 50)

#Print similarity score 
model_cosine.similarity("hand","soap")

0.1614612

In [69]:
# Create & fit Word2Vec Model (Simple 2 layer Neural Network)
# window: number of words to consider to target from, n words from right and n words from left
# iter: epochs
# min_count: how many times a word needs to appear to be a component of the library
model_w2v = Word2Vec(words_tokenized, size=10000, window=100, min_count=1, iter=5, workers=multiprocessing.cpu_count(), sg=1)

In [70]:
# Predictions - similar words 
# Predict similar of word that is passed from the front end (first word of the round's phrase)

# receive word from front end (per round)
input_word = "coats"
word_vectors = model_w2v.wv
similar_words = word_vectors.similar_by_word(input_word)
similar_words


[('75451', 0.9999926090240479),
 ('cold', 0.9999925494194031),
 ('car', 0.9999924898147583),
 ('antique', 0.9999924898147583),
 ('object', 0.9999924302101135),
 ('working', 0.9999924302101135),
 ('hot', 0.9999923706054688),
 ('75462', 0.9999923706054688),
 ('spring', 0.9999923706054688),
 ('sipping', 0.9999923706054688)]

In [71]:
# Letters selection - parse words 
similar_words_clean = np.asarray(similar_words)

def Extract(similar_words_clean):
    return [item[0] for item in similar_words_clean]
print(Extract(similar_words_clean))
similar_words_clean = Extract(similar_words_clean)

['75451', 'cold', 'car', 'antique', 'object', 'working', 'hot', '75462', 'spring', 'sipping']


In [72]:
df = pd.DataFrame(similar_words_clean)
test = df.iloc[0]

chars = sorted(list(set("fireplace")))
chars

['a', 'c', 'e', 'f', 'i', 'l', 'p', 'r']

# Issues / Pending 

In [None]:
# Array of letters (after parsing phrases) holds special characters and numbers, can we remove them? 
# Ultimate goal is to create a bucket of letters, these come from parsing the predicted (similar words), do we need to create a for loop? 
