In [2]:
import json
import collections
import re
import os 
import wget

In [1]:
! pip install wget

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp37-none-any.whl size=9681 sha256=cb6fab97e1cfb4ac95579836ae870d97aea9824a0725c949dd14e8332b0e7ebb
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [3]:
# Remote text corpus that is later passed to Word2VecDataset as `txt_path`.
url_data = 'https://raw.githubusercontent.com/caterinaLacerra/word2vec_data/master/wiki_10k.txt'

# Reuse a previously downloaded copy so that re-running this cell is
# idempotent: calling wget.download() again would fetch the file a second time
# and store it under a new, renamed path.
local_corpus_name = os.path.basename(url_data)
if os.path.exists(local_corpus_name):
    train_data_path = local_corpus_name
else:
    train_data_path = wget.download(url_data, out=local_corpus_name)

In [15]:
class Word2VecDataset():
    """Iterable dataset of (input word, context word) id pairs read from a text file.

    Every non-empty line of the file is tokenized into one sentence. A
    vocabulary of at most ``vocab_size`` entries (the most frequent words plus
    the unknown token) is built, and iterating over the dataset yields one
    dictionary per (centre word, context word) pair found inside the window.

    Attributes set by the constructor:
        window_size: number of context words considered on each side.
        data_words: list of sentences, each a list of lower-cased tokens.
        word2id / id2word: vocabulary mappings (word -> id and id -> word).
        frequency: word -> count, for in-vocabulary words (unknown token excluded).
        tot_occurrences: sum of all values in ``frequency``.
        data_idx: ``data_words`` converted to vocabulary ids.
    """

    def __init__(self, txt_path, vocab_size, unk_token, window_size):
        """
        Args:
            txt_path(str): Path to the raw-txt file.
            vocab_size(int): Maximum amount of words that we want to embed.
            unk_token(str): How unknown words will be represented (e.g. 'UNK').
            window_size(int): Number of words to consider as context.
        """
        self.window_size = window_size
        self.data_words = self.read_data(txt_path)
        self.build_vocabulary(vocab_size, unk_token)

    def __iter__(self):
        """Yield ``{'predict': context_id, 'inputs': centre_id}`` for every pair.

        A pair is produced only when both the centre word and the context word
        are keys of ``self.word2id``; other positions are skipped.
        """
        for sentence in self.data_words:
            sentence_len = len(sentence)

            for index, c_word in enumerate(sentence):
                # The centre word must be in the vocabulary.
                if c_word not in self.word2id:
                    continue
                c_word_id = self.word2id[c_word]

                # range() excludes its upper bound, hence the +1: the window
                # covers `window_size` words on the left AND on the right.
                min_index = max(0, index - self.window_size)
                max_index = min(sentence_len, index + self.window_size + 1)

                for k in range(min_index, max_index):
                    if k == index:
                        continue
                    context_word = sentence[k]
                    if context_word in self.word2id:
                        yield {'predict': self.word2id[context_word],
                               'inputs': c_word_id}

    def read_data(self, txt_path):
        """Convert each line of the input file into a list of tokenized words.

        Lines that produce no tokens (e.g. blank lines) are dropped.

        Returns:
            list[list[str]]: one token list per non-empty line.
        """
        data = []
        # Explicit encoding so the result does not depend on the platform default.
        with open(txt_path, encoding='utf-8') as f:
            for line in f:
                tokens = self.tokenize(line)
                if tokens:
                    data.append(tokens)
        return data

    # the pen is on the table--->['the','pen','is','on','the','table']
    def tokenize(self, i, pattern=r'\w+'):
        """Tokenize a single line: lower-case it and return all regex matches.

        Args:
            i(str): the line to tokenize.
            pattern(str): regular expression describing one token.
        """
        # The line is lower-cased before matching; empty matches are discarded.
        return [word for word in re.findall(pattern, i.lower()) if word]

    def build_vocabulary(self, vocab_size, unk_token):
        """Define the vocabulary and the derived lookup tables.

        Builds a (word, index) mapping for the ``vocab_size - 1`` most common
        words, reserves index ``vocab_size - 1`` for ``unk_token``, and stores
        ``word2id``, ``id2word``, ``frequency``, ``tot_occurrences`` and
        ``data_idx`` on the instance. Summary statistics are printed.

        Args:
            vocab_size(int): size of the vocabulary
            unk_token(str): token to associate with unknown words
        """
        # Count every token of every sentence.
        counter = collections.Counter()
        for context in self.data_words:
            counter.update(context)
        print('Number of distinct words: {}'.format(len(counter)))

        # Consider only the (vocab_size - 1) most common words to build the
        # vocabulary; ids follow the frequency ranking.
        dictionary = {k: i for i, (k, _) in enumerate(counter.most_common(vocab_size - 1))}
        assert unk_token not in dictionary
        # All other words are mapped to UNK.
        unk_id = vocab_size - 1
        dictionary[unk_token] = unk_id
        self.word2id = dictionary

        # (word, frequency) pairs, including only real words of the dictionary.
        # Compared with != because string identity is not a reliable equality test.
        dict_counts = {k: counter[k] for k in dictionary if k != unk_token}
        self.frequency = dict_counts
        self.tot_occurrences = sum(dict_counts.values())

        print('Total occurrences of words in dictionary:{}'.format(self.tot_occurrences))
        # min() raises on an empty mapping (empty corpus or vocab_size == 1).
        if dict_counts:
            less_freq_word = min(dict_counts, key=counter.get)
            print('Less frequent word in dictionary appears {} times ({})'.format(dict_counts[less_freq_word], less_freq_word))

        # index to word
        self.id2word = {v: k for k, v in dictionary.items()}

        # The text converted to vocabulary ids, as a list of lists; words
        # outside the vocabulary receive the id of the unknown token.
        self.data_idx = [
            [dictionary.get(word, unk_id) for word in sentence]
            for sentence in self.data_words
        ]
      



        
       
        
        

In [16]:
# Build the dataset from the downloaded file: vocabulary capped at 10000
# entries, 'UNK' as the unknown-word token, and a context window of 5.
dataset = Word2VecDataset(
    txt_path=train_data_path,
    vocab_size=10000,
    unk_token='UNK',
    window_size=5,
)

Number of distinct words: 30501
Total occurrences of words in dictionary:228841
Less frequent word in dictionary appears 2 times (tasked)
