In [1]:
require 'torch'
require 'nn'
require 'json'

In [2]:
-- read from json file
function readJSON(file)
  local f = assert(io.open(file, 'r'))
  local tJSON = f:read('*all')
  f:close()
  return json.decode(tJSON)
end

In [3]:
content = readJSON("data.json")
-- the sample data has 3 sentences
data = content['data']
print(data)

{
  1 : this is sentence one
  2 : sentence two
  3 : final sentence, sentence three
}


In [4]:
-- creating look-up table for indexing
function createLookupTable(data)
    word_to_index = {}
    count = 0
    for i,sentence in ipairs(data) do
        for word in sentence:gmatch("%w+") do
            if word_to_index[word] == nil then
               word_to_index[word] = count
               count = count + 1
            end
        end
    end
    return word_to_index
end

In [5]:
word_to_index = createLookupTable(data)
print(word_to_index)

{
  this : 0
  final : 5
  is : 1


  two : 4
  three : 6
  one : 3
  sentence : 2
}


In [6]:
function index(sentence,lookupTable)
    vector = {}
    for word in sentence:gmatch("%w+") do
        table.insert(vector,lookupTable[word])
    end
    return vector
end

In [7]:
index(data[3],word_to_index)  --creating index for 3rd sentence

{
  1 : 5


  2 : 2
  3 : 2
  4 : 6
}


In [8]:
-- bag of words - one hot encoding
function onehot(sentence,lookupTable)
    table_size = 0
    for _ in pairs(lookupTable) do table_size = table_size + 1 end
    x = torch.LongTensor(1,table_size):zero()
    s = x:storage()
    for word in sentence:gmatch("%w+") do
        s[lookupTable[word]] = 1
    end
    return x
end

In [9]:
onehot(data[3],word_to_index)    --creating one hot for 3rd sentence

 0  1  0  0  1  1  0
[torch.LongTensor of size 1x7]



In [10]:
-- bag of words - count vectorization
function count(sentence,lookupTable)
    table_size = 0
    for _ in pairs(lookupTable) do table_size = table_size + 1 end
    x = torch.LongTensor(1,table_size):zero()
    s = x:storage()
    for word in sentence:gmatch("%w+") do
        s[lookupTable[word]] =  s[lookupTable[word]] + 1
    end
    return x
end

In [11]:
count(data[3],word_to_index)     --creating count vector for 3rd sentence

 0  2  0  0  1  1  0
[torch.LongTensor of size 1x7]

