In [31]:
from gensim.models.keyedvectors import KeyedVectors
import textract
import warnings
warnings.filterwarnings('ignore')

In [18]:
# word_vectors = KeyedVectors.load_word2vec_format('~/data/GoogleNews-vectors-negative300.bin.gz', binary=True, limit=200000)
word_vectors = KeyedVectors.load_word2vec_format('~/data/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [19]:
word_vectors.most_similar(positive=['cooking', 'potatoes'], topn=5)

[('cook', 0.6973531246185303),
 ('oven_roasting', 0.6754531860351562),
 ('Slow_cooker', 0.6742031574249268),
 ('sweet_potatoes', 0.6600280404090881),
 ('stir_fry_vegetables', 0.6548759341239929)]

In [20]:
word_vectors.doesnt_match("potatoes milk cake computer".split())

'computer'

In [23]:
word_vectors.most_similar(positive=['germany', 'france'], topn=1)

[('europe', 0.7222039699554443)]

In [25]:
word_vectors.most_similar(positive=['king', 'woman'], negative=['man'], topn=5)

[('queen', 0.7118192911148071),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321243286133)]

In [26]:
word_vectors.similarity('princess', 'queen')

0.7070532

In [28]:
word_vectors['phone'][:10]

array([-0.01446533, -0.12792969, -0.11572266, -0.22167969, -0.07373047,
       -0.05981445, -0.10009766, -0.06884766,  0.14941406,  0.10107422],
      dtype=float32)

In [36]:
f = open("../data/mavis-batey-sentences.txt", "r")
text = f.read()

In [64]:
text

'It was a strange little outfit in the cottage.\nOrganisation is not a word you would associate with Dilly Knox.\nWhen I arrived, he said: "Oh, hello, we\'re breaking machines, have you got a pencil?"\nThat was it.\nI was never really told what to do.\nI think, looking back on it, that was a great precedent in my life, because he taught me to think that you could do things yourself without always checking up to see what the book said.\nThat was the way the cottage worked.\nWe were looking at new traffic all the time or where the wheels or the wiring had been changed, or at other new techniques.\nSo you had to work it all out yourself from scratch.\nWhy they had to say that ("Today\'s the day minus three.") I can\'t imagine.\nIt seems rather daft, but they did.\nSo we worked for three days.\nIt was all the nail-biting stuff of keeping up all night working.\nOne kept thinking: "Well, would one be better at it if one had a little sleep or shall we just go on?"--and it did take nearly all 

In [38]:
sentences = text.split('\n')

In [49]:
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
tokenizer = TreebankWordTokenizer()
stop_words = stopwords.words('english')
punctuation = ['(',')',';',':','[',']',',', '-', '/','.']

In [51]:
sentence_vector = []
for sentence in sentences:
    tokens = tokenizer.tokenize(sentence)
    normalized_tokens = [x.lower() for x in tokens if x not in punctuation and x not in stop_words]
    sentence_vector.append(normalized_tokens)

In [83]:
sentence_vector

[['it', 'strange', 'little', 'outfit', 'cottage'],
 ['organisation', 'word', 'would', 'associate', 'dilly', 'knox'],
 ['when',
  'i',
  'arrived',
  'said',
  '``',
  'oh',
  'hello',
  "'re",
  'breaking',
  'machines',
  'got',
  'pencil',
  '?',
  "''"],
 ['that'],
 ['i', 'never', 'really', 'told'],
 ['i',
  'think',
  'looking',
  'back',
  'great',
  'precedent',
  'life',
  'taught',
  'think',
  'could',
  'things',
  'without',
  'always',
  'checking',
  'see',
  'book',
  'said'],
 ['that', 'way', 'cottage', 'worked'],
 ['we',
  'looking',
  'new',
  'traffic',
  'time',
  'wheels',
  'wiring',
  'changed',
  'new',
  'techniques'],
 ['so', 'work', 'scratch'],
 ['why',
  'say',
  '``',
  'today',
  "'s",
  'day',
  'minus',
  'three.',
  "''",
  'i',
  'ca',
  "n't",
  'imagine'],
 ['it', 'seems', 'rather', 'daft'],
 ['so', 'worked', 'three', 'days'],
 ['it', 'nail-biting', 'stuff', 'keeping', 'night', 'working'],
 ['one',
  'kept',
  'thinking',
  '``',
  'well',
  'would',


In [56]:
from gensim.models.word2vec import Word2Vec
num_features = 300
min_word_count = 3
num_workers = 2
window_size = 6
subsampling = 1e-3

In [84]:
model = Word2Vec(sentence_vector, workers=num_workers, size=num_features, min_count=min_word_count, window=window_size, sample=subsampling)



### discard the unneeded output weights

In [80]:
model.init_sims(replace=True)

In [81]:
model_name = 'mavis-batey-sentences'
model.save('mavis-batey-sentences')

In [85]:
print(model)
words = list(model.wv.vocab)
print(words)

Word2Vec(vocab=9, size=300, alpha=0.025)
['it', 'cottage', 'would', 'i', '``', "''", 'new', 'one', 'message']


In [95]:
test_sentences = [ "the quick brown fox jumps over the lazy dogs",
"Then a cop quizzed Mick Jagger's ex-wives briefly." ]

In [98]:
inputs = [s.split() for s in test_sentences] 

In [99]:
inputs

[['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dogs'],
 ['Then', 'a', 'cop', 'quizzed', 'Mick', "Jagger's", 'ex-wives', 'briefly.']]

In [106]:
model = Word2Vec(sentence_vector, min_count=1)
print(model)

Word2Vec(vocab=150, size=100, alpha=0.025)


In [107]:
words = list(model.wv.vocab)
print(words)

['it', 'strange', 'little', 'outfit', 'cottage', 'organisation', 'word', 'would', 'associate', 'dilly', 'knox', 'when', 'i', 'arrived', 'said', '``', 'oh', 'hello', "'re", 'breaking', 'machines', 'got', 'pencil', '?', "''", 'that', 'never', 'really', 'told', 'think', 'looking', 'back', 'great', 'precedent', 'life', 'taught', 'could', 'things', 'without', 'always', 'checking', 'see', 'book', 'way', 'worked', 'we', 'new', 'traffic', 'time', 'wheels', 'wiring', 'changed', 'techniques', 'so', 'work', 'scratch', 'why', 'say', 'today', "'s", 'day', 'minus', 'three.', 'ca', "n't", 'imagine', 'seems', 'rather', 'daft', 'three', 'days', 'nail-biting', 'stuff', 'keeping', 'night', 'working', 'one', 'kept', 'thinking', 'well', 'better', 'sleep', 'shall', 'go', '--', 'take', 'nearly', 'then', 'large', 'message', 'came', 'how', 'many', 'cruisers', 'submarines', 'absolutely', 'incredible', 'spell', 'he', 'pretended', 'going', 'weekend', 'made', 'sure', 'japanese', 'spy', 'pass', 'cover', 'went', 'co

In [109]:
len(word_vectors.vocab)

3000000

In [110]:
import pandas as pd
vocab = pd.Series(word_vectors.vocab)

Series([], dtype: object)

In [115]:
df = vocab.to_frame()

In [116]:
df.head()

Unnamed: 0,0
</s>,"Vocab(count:3000000, index:0)"
in,"Vocab(count:2999999, index:1)"
for,"Vocab(count:2999998, index:2)"
that,"Vocab(count:2999997, index:3)"
is,"Vocab(count:2999996, index:4)"


In [122]:
vocab.iloc[1000000:1000020]

Starwood_Hotels_HOT       Vocab(count:2000000, index:1000000)
Tammy_Kilborn             Vocab(count:1999999, index:1000001)
aortic_aneurism           Vocab(count:1999998, index:1000002)
Spragins_Hall             Vocab(count:1999997, index:1000003)
Ed_Iacobucci              Vocab(count:1999996, index:1000004)
Seilheimer                Vocab(count:1999995, index:1000005)
Frank_Della_Femina        Vocab(count:1999994, index:1000006)
egoli                     Vocab(count:1999993, index:1000007)
Brivik                    Vocab(count:1999992, index:1000008)
actress_Hema_Malini       Vocab(count:1999991, index:1000009)
singer_Angelique_Kidjo    Vocab(count:1999990, index:1000010)
Authority_PDMA            Vocab(count:1999989, index:1000011)
Shapp                     Vocab(count:1999988, index:1000012)
Joris_den_Blanken         Vocab(count:1999987, index:1000013)
Pascal_Berenguer          Vocab(count:1999986, index:1000014)
Dick_LaHaie               Vocab(count:1999985, index:1000015)
M._FELIP

In [138]:
word_vectors['Illini'][:10]

array([ 0.15625   ,  0.18652344,  0.33203125,  0.55859375,  0.03637695,
       -0.09375   , -0.05029297,  0.16796875, -0.0625    ,  0.09912109],
      dtype=float32)

In [124]:
import numpy as np

In [127]:
wv = word_vectors

In [132]:
euclidean_distance = np.linalg.norm(word_vectors['Illinois'] - word_vectors['Illini'])
euclidean_distance

3.36538

In [128]:
cos_similarity = np.dot(wv['Illinois'], wv['Illini']) / (np.linalg.norm(wv['Illinois']) * np.linalg.norm(wv['Illini']))

In [129]:
cos_similarity

0.5501352

In [130]:
1 - cos_similarity

0.44986480474472046

In [134]:
from nlpia.data.loaders import get_data

In [135]:
cities = get_data('cities')

100%|██████████| 2051/2051 [00:01<00:00, 1118.37it/s]


In [137]:
cities.head(1).T

geonameid,3039154
name,El Tarter
asciiname,El Tarter
alternatenames,"Ehl Tarter,Эл Тартер"
latitude,42.5795
longitude,1.65362
feature_class,P
feature_code,PPL
country_code,AD
cc2,
admin1_code,02


In [139]:
us = cities[(cities.country_code == 'US') & (cities.admin1_code.notnull())].copy()

In [141]:
states = pd.read_csv('http://www.fonz.net/blog/wp-content/uploads/2008/04/states.csv')

In [142]:
states = dict(zip(states.Abbreviation, states.State))

In [143]:
us['city'] = us.name.copy()
us['st'] = us.admin1_code.copy()
us['state'] = us.st.map(states)
us[us.columns[-3:]].head()

Unnamed: 0_level_0,city,st,state
geonameid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4046255,Bay Minette,AL,Alabama
4046274,Edna,TX,Texas
4046319,Bayou La Batre,AL,Alabama
4046332,Henderson,TX,Texas
4046430,Natalia,TX,Texas


In [144]:
vocab = pd.np.concatenate([us.city, us.st, us.state])
vocab = np.array([word for word in vocab if word in wv.wv])
vocab[:10]

array(['Edna', 'Henderson', 'Natalia', 'Yorktown', 'Brighton', 'Berry',
       'Trinity', 'Villas', 'Bessemer', 'Aurora'], dtype='<U15')

In [147]:
city_plus_state = []
for c, state, st in zip(us.city, us.state, us.st):
    if c not in vocab:
        continue
    row = []
    if state in vocab:
        row.extend(wv[c] + wv[state])
    else:
        row.extend(wv[c] + wv[st])
    city_plus_state.append(row)
us_300D = pd.DataFrame(city_plus_state)

In [149]:
us_300D.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.34668,-0.254517,-0.158203,0.163086,0.10791,-0.28418,-0.552734,-0.059082,-0.484375,0.087402,...,0.02832,0.082275,-0.32251,-0.006836,0.25293,-0.262207,-0.132812,0.226562,0.146484,0.079834
1,0.145996,0.07605,-0.058594,0.411133,0.231934,-0.116699,-0.387939,-0.011963,-0.200195,-0.05127,...,0.111572,0.146484,-0.019043,0.274414,0.301758,-0.34375,-0.033691,-0.061523,-0.088379,0.088135
2,0.007324,-0.397095,0.134277,0.263672,-0.00708,-0.129395,-0.299316,-0.253174,-0.47168,0.120117,...,-0.161621,-0.131836,-0.075439,-0.019531,0.459961,-0.212646,0.127686,0.30957,0.024414,0.206055
3,0.248535,-0.042969,-0.239746,0.006104,-0.211426,-0.305908,-0.826172,0.292053,-0.260254,0.311035,...,-0.125427,-0.01709,-0.448242,0.145752,0.350586,-0.59375,0.257812,0.110352,0.010742,0.297852
4,0.210693,-0.336426,-0.231445,0.532715,-0.035583,0.104736,-0.518555,0.189392,-0.192383,0.263184,...,-0.160034,0.21344,-0.556641,-0.168213,0.027344,-0.090576,0.016602,-0.204102,-0.116455,-0.086182


In [150]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
us_300D = get_data('cities_us_wordvectors')
us_2D = pca.fit_transform(us_300D.iloc[:, :300])

100%|██████████| 2064/2064 [00:01<00:00, 1122.82it/s]


In [152]:
%matplotlib inline
import seaborn
from matplotlib import pyplot as plt
from nlpia.plots import offline_plotly_scatter_bubble

In [153]:
df = get_data('cities_us_wordvectors_pca2_meta')

In [154]:
df.head()

Unnamed: 0,x,y,name,population,timezone,state,state_abbreviation,elevation,latitude,longitude,elevation_m,country_code
"Los Angeles, CA",4.93081,-0.936239,"Los Angeles, CA",3971883,America/Los_Angeles,California,CA,89.0,34.05223,-118.24368,96,US
"Chicago, IL",-2.021456,-2.615556,"Chicago, IL",2720546,America/Chicago,Illinois,IL,179.0,41.85003,-87.65005,180,US
"Houston, TX",1.339572,2.57639,"Houston, TX",2296224,America/Chicago,Texas,TX,12.0,29.76328,-95.36327,30,US
"Philadelphia, PA",-1.217258,-1.380793,"Philadelphia, PA",1567442,America/New_York,Pennsylvania,PA,12.0,39.95233,-75.16379,40,US
"Phoenix, AZ",2.04693,-0.480845,"Phoenix, AZ",1563025,America/Phoenix,Arizona,AZ,331.0,33.44838,-112.07404,366,US


In [155]:
html = offline_plotly_scatter_bubble(
    df.sort_values('population', ascending=False)[:350].copy().sort_values('population'),
    filename='plotly_scatter_bubble.html',
    x='x', y='y',
    size_col='population', text_col='name', category_col='timezone',
    xscale=None, yscale=None, # 'log' or None
    layout={}, marker={'sizeref': 3000}
)

In [157]:
import multiprocessing
num_cores = multiprocessing.cpu_count()
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.utils import simple_preprocess
corpus = ['This is the first document ...', 'another document ...']
training_corpus = []
for i, text in enumerate(corpus):
    tagged_doc = TaggedDocument(simple_preprocess(text), [i])
    training_corpus.append(tagged_doc)
model = Doc2Vec(size=100, min_count=2, workers=num_cores, iter=10)
model.build_vocab(training_corpus)
model.train(training_corpus, total_examples=model.corpus_count,epochs=model.iter)



In [158]:
num_cores

8

In [166]:
model.vocabulary.min_count

2

In [167]:
model.infer_vector(simple_preprocess('This is a completely unseen document'), steps=10)

array([-4.3305167e-04, -4.1156127e-03,  4.3367036e-03,  2.9815983e-03,
        4.8823748e-03, -4.4917064e-03,  9.4660651e-04,  1.4293566e-03,
        2.2575750e-03,  1.0674573e-03,  3.0639439e-03, -3.2414796e-03,
       -2.1288441e-03,  1.5201317e-03,  1.7416221e-03,  8.8257121e-04,
       -2.6882859e-04,  3.0692115e-03, -6.5166439e-04,  1.9648245e-03,
        3.3264223e-03, -4.1387873e-03, -7.4989686e-04, -1.5501132e-03,
        3.8710143e-05, -7.8900350e-04, -1.2922015e-03,  3.3742979e-03,
       -1.6506874e-03, -1.6289792e-04, -9.3871640e-05, -4.6166452e-03,
       -3.3299488e-03, -1.5817411e-03, -2.8174839e-03,  4.0226053e-03,
       -4.9695540e-03, -2.3478176e-03,  1.1161816e-03,  4.6355836e-03,
       -1.6166740e-04,  2.9576819e-03,  2.0952635e-03, -3.3852556e-03,
       -1.9980472e-04,  3.4908408e-05, -1.8179971e-03,  2.7284708e-03,
       -3.6137220e-03,  3.2053569e-03, -3.1862042e-03,  8.2726497e-04,
        4.4547595e-04,  2.2719365e-03, -2.6379526e-03,  1.1983778e-03,
      