## Google Colab

In [0]:
!pip install -U -q PyDrive

In [0]:
!pip install --upgrade gensim

In [0]:
!pip install -U spacy

In [0]:
!python -m spacy download en

In [0]:
import codecs
import spacy

nlp = spacy.load('en')
nlp.max_length = 2000000

def tokenize_file(file_name):  
  sents = []
  with codecs.open(file_name, "r", "utf-8") as file:
    doc = nlp(file.read())
    sentence = []
    for t in doc:
      if t.is_sent_start:
        if len(sentence)>0:
          sents.append(sentence)
        sentence = []        
      if not t.is_space and not t.is_stop and not t.is_punct:
        sentence.append(t.text)
        
  return sents

In [14]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

folder_list = drive.ListFile({'q': "'root' in parents and trashed=false and mimeType='application/vnd.google-apps.folder'"}).GetList()
for folder in folder_list:
  if folder['title'] == 'resume-corpus':
    req_folder_id = folder['id']
    break

sentences = []
file_list = drive.ListFile({'q': "'%s' in parents and trashed=false" % req_folder_id}).GetList()
for file in file_list:
  print('file: %s, id: %s' % (file['title'], file['id']))
  download = drive.CreateFile({'id': file['id']})
  download.GetContentFile(file['title'])
  sentences.extend(tokenize_file(file['title']))
  print('sentences:', len(sentences))

file: corpus-58.txt, id: 1Kn7Of08cwRmPjFCiRRZAQRmCdWUNuMhl
sentences: 25
file: corpus-59.txt, id: 1pfFwObTaT1pWp7AW2jC5o137kHOWL5jH
sentences: 62
file: corpus-63.txt, id: 1UnoScKMzOdCB2qNjmPUd1Tgl_0DCPoaK
sentences: 83
file: corpus-60.txt, id: 1Hz2XOLt9t33Yqfu1XGmJBci0paMzg1ce
sentences: 83
file: corpus-61.txt, id: 1Ts0g1fl8Ppq4HEGwL2f5EAye6IWfDgTJ
sentences: 125
file: corpus-57.txt, id: 1bQUX71UfHZd8K4O3RTOSb7BQ1RPmnr35
sentences: 152
file: corpus-65.txt, id: 1hNihXNbHeG1GmDl_IxYRTIkszchRKxFp
sentences: 164
file: corpus-64.txt, id: 1Xaz1-1OI23VnmEs58ICMG34kyXdW4lgp
sentences: 209
file: corpus-56.txt, id: 1xRGjOUHMH-zQHLzPi-YwexFZn_pWW7CG
sentences: 242
file: corpus-49.txt, id: 16vrlH11w8hP5XApYO1CKv9xX7v_BOozU
sentences: 242
file: corpus-48.txt, id: 199fYMT1wpYxNlAsaDpKynKgM6sZ2s2Mu
sentences: 287
file: corpus-50.txt, id: 1s2Izj_JOEpz938w0L1ZrxRh0psmupgi5
sentences: 313
file: corpus-45.txt, id: 1lZTKKoxb1sa_YCrW1mmCBc2W4YKeXobI
sentences: 343
file: corpus-52.txt, id: 1cIR-hmOLK2-2npES

sentences: 1731
file: corpus-3.txt, id: 1yu46RDFry1sWRaEr5VibRP4V433Mmgg0
sentences: 1794
file: corpus-4.txt, id: 1BKjCOcnfFwGHe4zR3mhOkMLqXT9oqVCK
sentences: 1809
file: corpus-6.txt, id: 1p_81Bd3OnvZ27dRmhIO2YV4eGkt2xAjb
sentences: 1824
file: corpus-8.txt, id: 1MIA6aHYGJVlKpZX0S62WxbDAs97F3TT5
sentences: 1911
file: corpus-9.txt, id: 14XBnUqV0mLxM_YWXL6CJL1VmsrG78u2w
sentences: 1945
file: corpus-10.txt, id: 1RUxcIUiA198T8IVCtqjFkOBdaRqYnF7f
sentences: 1971
file: corpus-5.txt, id: 1Xh649SduLK1KKdkc9mei8bS7CMIR8tkD
sentences: 1983
file: corpus-1.txt, id: 1QTjEZNRp688wpoVNzPslaRkUkddmGxa4
sentences: 2005


In [15]:
sentences[0:5]

[['Jamie', 'Domingo', 'San', 'Francisco', 'CA', '650', '255', '5351'],
 ['contact@jamiedomingo.com'],
 ['https://github.com/jamiemd',
  'PROJECTS',
  'Decision',
  'Jam',
  'Decision',
  'making',
  'voting',
  'app'],
 ['React', 'Express', 'Node', 'MongoDB'],
 ['A',
  'month',
  'project',
  'I',
  'collaborated',
  'members',
  'come',
  'design',
  'implementation']]

In [0]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [0]:
import gensim.models.word2vec as w2v

model = w2v.Word2Vec(min_count=3, 
                     window=7, 
                     size=100,
                     seed=42)

In [22]:
model.build_vocab(sentences)

2018-07-31 09:18:01,138 : INFO : collecting all words and their counts
2018-07-31 09:18:01,139 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-07-31 09:18:01,151 : INFO : collected 6121 word types from a corpus of 17642 raw words and 2005 sentences
2018-07-31 09:18:01,153 : INFO : Loading a fresh vocabulary
2018-07-31 09:18:01,162 : INFO : effective_min_count=3 retains 1292 unique words (21% of original 6121, drops 4829)
2018-07-31 09:18:01,163 : INFO : effective_min_count=3 leaves 11794 word corpus (66% of original 17642, drops 5848)
2018-07-31 09:18:01,171 : INFO : deleting the raw counts dictionary of 6121 items
2018-07-31 09:18:01,172 : INFO : sample=0.001 downsamples 66 most-common words
2018-07-31 09:18:01,173 : INFO : downsampling leaves estimated 10669 word corpus (90.5% of prior 11794)
2018-07-31 09:18:01,181 : INFO : estimated required memory for 1292 words and 100 dimensions: 1679600 bytes
2018-07-31 09:18:01,183 : INFO : resetting layer weigh

In [23]:
print(model)

Word2Vec(vocab=1292, size=100, alpha=0.025)


In [24]:
model.train(sentences, 
            total_examples=len(sentences), 
            epochs=100)

2018-07-31 09:18:08,373 : INFO : training model with 3 workers on 1292 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=7
2018-07-31 09:18:08,395 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-31 09:18:08,408 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-31 09:18:08,413 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-31 09:18:08,414 : INFO : EPOCH - 1 : training on 17642 raw words (10664 effective words) took 0.0s, 399328 effective words/s
2018-07-31 09:18:08,428 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-31 09:18:08,444 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-31 09:18:08,446 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-31 09:18:08,450 : INFO : EPOCH - 2 : training on 17642 raw words (10664 effective words) took 0.0s, 416904 effective words/s
2018-07-31 09:18:08,462 : INFO : worker

2018-07-31 09:18:08,745 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-31 09:18:08,748 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-31 09:18:08,749 : INFO : EPOCH - 11 : training on 17642 raw words (10664 effective words) took 0.0s, 461151 effective words/s
2018-07-31 09:18:08,760 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-31 09:18:08,779 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-31 09:18:08,787 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-31 09:18:08,788 : INFO : EPOCH - 12 : training on 17642 raw words (10692 effective words) took 0.0s, 346470 effective words/s
2018-07-31 09:18:08,798 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-31 09:18:08,814 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-31 09:18:08,817 : INFO : worker thread finished; awaiting finish of 0 more threads
2018

2018-07-31 09:18:09,100 : INFO : EPOCH - 21 : training on 17642 raw words (10690 effective words) took 0.0s, 369960 effective words/s
2018-07-31 09:18:09,114 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-31 09:18:09,131 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-31 09:18:09,136 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-31 09:18:09,138 : INFO : EPOCH - 22 : training on 17642 raw words (10688 effective words) took 0.0s, 389994 effective words/s
2018-07-31 09:18:09,149 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-31 09:18:09,167 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-31 09:18:09,169 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-31 09:18:09,171 : INFO : EPOCH - 23 : training on 17642 raw words (10684 effective words) took 0.0s, 414017 effective words/s
2018-07-31 09:18:09,188 : INFO : worker thread finis

2018-07-31 09:18:09,465 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-31 09:18:09,480 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-31 09:18:09,483 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-31 09:18:09,486 : INFO : EPOCH - 32 : training on 17642 raw words (10718 effective words) took 0.0s, 432662 effective words/s
2018-07-31 09:18:09,507 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-31 09:18:09,509 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-31 09:18:09,510 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-31 09:18:09,511 : INFO : EPOCH - 33 : training on 17642 raw words (10636 effective words) took 0.0s, 605725 effective words/s
2018-07-31 09:18:09,521 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-31 09:18:09,535 : INFO : worker thread finished; awaiting finish of 1 more threads
2018

2018-07-31 09:18:09,788 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-31 09:18:09,790 : INFO : EPOCH - 42 : training on 17642 raw words (10691 effective words) took 0.0s, 573033 effective words/s
2018-07-31 09:18:09,801 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-31 09:18:09,814 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-31 09:18:09,816 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-31 09:18:09,818 : INFO : EPOCH - 43 : training on 17642 raw words (10670 effective words) took 0.0s, 526501 effective words/s
2018-07-31 09:18:09,829 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-31 09:18:09,844 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-31 09:18:09,846 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-31 09:18:09,849 : INFO : EPOCH - 44 : training on 17642 raw words (10673 effective word

2018-07-31 09:18:10,101 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-31 09:18:10,104 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-31 09:18:10,105 : INFO : EPOCH - 53 : training on 17642 raw words (10673 effective words) took 0.0s, 580981 effective words/s
2018-07-31 09:18:10,114 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-31 09:18:10,129 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-31 09:18:10,131 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-31 09:18:10,132 : INFO : EPOCH - 54 : training on 17642 raw words (10689 effective words) took 0.0s, 500009 effective words/s
2018-07-31 09:18:10,142 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-31 09:18:10,154 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-31 09:18:10,156 : INFO : worker thread finished; awaiting finish of 0 more threads
2018

2018-07-31 09:18:10,428 : INFO : EPOCH - 63 : training on 17642 raw words (10682 effective words) took 0.0s, 441882 effective words/s
2018-07-31 09:18:10,440 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-31 09:18:10,456 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-31 09:18:10,458 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-31 09:18:10,459 : INFO : EPOCH - 64 : training on 17642 raw words (10676 effective words) took 0.0s, 505066 effective words/s
2018-07-31 09:18:10,470 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-31 09:18:10,487 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-31 09:18:10,489 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-31 09:18:10,492 : INFO : EPOCH - 65 : training on 17642 raw words (10661 effective words) took 0.0s, 431738 effective words/s
2018-07-31 09:18:10,509 : INFO : worker thread finis

2018-07-31 09:18:10,799 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-31 09:18:10,818 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-31 09:18:10,822 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-31 09:18:10,826 : INFO : EPOCH - 74 : training on 17642 raw words (10666 effective words) took 0.0s, 360507 effective words/s
2018-07-31 09:18:10,842 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-31 09:18:10,854 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-31 09:18:10,856 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-31 09:18:10,860 : INFO : EPOCH - 75 : training on 17642 raw words (10604 effective words) took 0.0s, 410579 effective words/s
2018-07-31 09:18:10,872 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-31 09:18:10,891 : INFO : worker thread finished; awaiting finish of 1 more threads
2018


2018-07-31 09:18:11,184 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-31 09:18:11,195 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-31 09:18:11,196 : INFO : EPOCH - 84 : training on 17642 raw words (10683 effective words) took 0.0s, 382860 effective words/s
2018-07-31 09:18:11,211 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-31 09:18:11,224 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-31 09:18:11,226 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-31 09:18:11,228 : INFO : EPOCH - 85 : training on 17642 raw words (10680 effective words) took 0.0s, 435063 effective words/s
2018-07-31 09:18:11,241 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-31 09:18:11,256 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-31 09:18:11,261 : INFO : worker thread finished; awaiting finish of 0 more threads
201

2018-07-31 09:18:11,546 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-31 09:18:11,547 : INFO : EPOCH - 94 : training on 17642 raw words (10637 effective words) took 0.0s, 384050 effective words/s
2018-07-31 09:18:11,558 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-31 09:18:11,573 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-31 09:18:11,577 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-31 09:18:11,579 : INFO : EPOCH - 95 : training on 17642 raw words (10630 effective words) took 0.0s, 464667 effective words/s
2018-07-31 09:18:11,594 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-31 09:18:11,608 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-31 09:18:11,609 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-31 09:18:11,613 : INFO : EPOCH - 96 : training on 17642 raw words (10682 effective word

(1066982, 1764200)

In [29]:
model.wv.similar_by_word('computer')

  if np.issubdtype(vec.dtype, np.int):


[('science', 0.8295746445655823),
 ('fundamentals', 0.7870364785194397),
 ('individual', 0.7484782934188843),
 ('provides', 0.703237771987915),
 ('cohort', 0.683079183101654),
 ('solving', 0.6586520671844482),
 ('CS', 0.6579200625419617),
 ('certificate', 0.6513158082962036),
 ('program', 0.6393243074417114),
 ('fundamental', 0.6211526393890381)]

In [26]:
model.wv.similar_by_word('node')

  if np.issubdtype(vec.dtype, np.int):


[('express', 0.9912917613983154),
 ('redux', 0.9811919331550598),
 ('native', 0.9477207660675049),
 ('mocha', 0.9438780546188354),
 ('mongoDB', 0.9083253741264343),
 ('chai', 0.8700175285339355),
 ('react', 0.805788516998291),
 ('python', 0.7759115099906921),
 ('authentication', 0.7734363079071045),
 ('course', 0.7592689990997314)]