first

llSourcell · Jun 5, 2016 · 06b18b4 · 06b18b4
commit 06b18b4
Show file tree

Hide file tree

Showing 10 changed files with 1,006 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,44 @@
+data/*
+log/*
+*.log
+
+# Compiled Lua sources
+luac.out
+
+# luarocks build files
+*.src.rock
+*.zip
+*.tar.gz
+
+# Object files
+*.o
+*.os
+*.ko
+*.obj
+*.elf
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Libraries
+*.lib
+*.a
+*.la
+*.lo
+*.def
+*.exp
+
+# Shared objects (inc. Windows DLLs)
+*.dll
+*.so
+*.so.*
+*.dylib
+
+# Executables
+*.exe
+*.out
+*.app
+*.i*86
+*.x86_64
+*.hex
diff --git a/README.md b/README.md
@@ -0,0 +1,140 @@
+# Neural Conversational Model in Torch
+
+This is an attempt at implementing [Sequence to Sequence Learning with Neural Networks (seq2seq)](http://arxiv.org/abs/1409.3215) and reproducing the results in [A Neural Conversational Model](http://arxiv.org/abs/1506.05869) (aka the Google chatbot).
+
+The Google chatbot paper [became famous](http://www.sciencealert.com/google-s-ai-bot-thinks-the-purpose-of-life-is-to-live-forever) after cleverly answering a few philosophical questions, such as:
+
+> **Human:** What is the purpose of living?  
+> **Machine:** To live forever.
+
+## How it works
+
+The model is based on two [LSTM](https://en.wikipedia.org/wiki/Long_short-term_memory) layers. One for encoding the input sentence into a "thought vector", and another for decoding that vector into a response. This model is called Sequence-to-sequence or seq2seq.
+
+![seq2seq](https://4.bp.blogspot.com/-aArS0l1pjHQ/Vjj71pKAaEI/AAAAAAAAAxE/Nvy1FSbD_Vs/s640/2TFstaticgraphic_alt-01.png)  
+_Source: http://googleresearch.blogspot.ca/2015/11/computer-respond-to-this-email.html_
+
+In this experiment, we train the seq2seq model with movie dialogs from the [Cornell Movie-Dialogs Corpus](http://www.mpi-sws.org/~cristian/Cornell_Movie-Dialogs_Corpus.html). The lines are shortened to the first sentence.
+
+## Sample conversation
+
+Here's a sample conversation after training for 20 epoch with 50000 examples, using the following command:
+
+```sh
+th train.lua --cuda --dataset 50000 --hiddenSize 1000
+```
+
+(Took 3 days to train on my GeForce GTX 780M.)
+
+For OpenCL, use `--opencl` instead of `--cuda`. To train on CPU, don't provide any of those two.
+
+> **me:** Hello?
+> **bot:** Hi.
+> 
+> **me:** How are you?
+> **bot:** I'm fine.
+> 
+> **me:** What's your name?
+> **bot:** It's hard to describe.
+> 
+> **me:** How so?
+> **bot:** I'm not sure.
+> 
+> **me:** What color is the sky?
+> **bot:** It's blue.
+> 
+> **me:** What is your job?
+> **bot:** It's not that i'm a fucking werewolf!
+> 
+> **me:** What is the purpose of life?
+> **bot:** A gift.
+> 
+> **me:** Are you intelligent?
+> **bot:** Yes, well...
+> 
+> **me:** Are you a machine?
+> **bot:** That's a lie.
+> 
+> **me:** Are you human?
+> **bot:** No, i'm not.
+> 
+> **me:** What are you?
+> **bot:** I'm not sure.
+>
+> **me:** Do you plan on taking over the world?
+> **bot:** No, i don't.
+
+Phew! That was close. Good thing I didn't train it on the full dataset. Please experiment responsibly.
+
+_(Disclaimer: nonsensical responses have been removed.)_
+
+## Installing
+
+1. [Install Torch](http://torch.ch/docs/getting-started.html).
+2. Install the following additional Lua libs:
+
+   ```sh
+   luarocks install nn
+   luarocks install rnn
+   luarocks install penlight
+   ```
+
+   To train with CUDA install the latest CUDA drivers, toolkit and run:
+
+   ```sh
+   luarocks install cutorch
+   luarocks install cunn
+   ```
+
+   To train with opencl install the lastest Opencl torch lib:
+
+   ```sh
+   luarocks install cltorch
+   luarocks install clnn
+   ```
+
+3. Download the [Cornell Movie-Dialogs Corpus](http://www.mpi-sws.org/~cristian/Cornell_Movie-Dialogs_Corpus.html) and extract all the files into data/cornell_movie_dialogs.
+
+## Training
+
+```sh
+th train.lua [-h / options]
+```
+
+Use the `--dataset NUMBER` option to control the size of the dataset. Training on the full dataset takes about 5h for a single epoch.
+
+The model will be saved to `data/model.t7` after each epoch if it has improved (error decreased).
+
+## Testing
+
+To load the model and have a conversation:
+
+```sh
+th -i eval.lua --cuda # Skip --cuda if you didn't train with it
+# ...
+th> say "Hello."
+```
+
+## License
+
+MIT License
+
+Copyright (c) 2016 Marc-Andre Cournoyer
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/cornell_movie_dialogs.lua b/cornell_movie_dialogs.lua
@@ -0,0 +1,72 @@
+local CornellMovieDialogs = torch.class("neuralconvo.CornellMovieDialogs")
+local stringx = require "pl.stringx"
+local xlua = require "xlua"
+
+local function parsedLines(file, fields)
+  local f = assert(io.open(file, 'r'))
+
+  return function()
+    local line = f:read("*line")
+
+    if line == nil then
+      f:close()
+      return
+    end
+
+    local values = stringx.split(line, " +++$+++ ")
+    local t = {}
+
+    for i,field in ipairs(fields) do
+      t[field] = values[i]
+    end
+
+    return t
+  end
+end
+
+function CornellMovieDialogs:__init(dir)
+  self.dir = dir
+end
+
+local MOVIE_LINES_FIELDS = {"lineID","characterID","movieID","character","text"}
+local MOVIE_CONVERSATIONS_FIELDS = {"character1ID","character2ID","movieID","utteranceIDs"}
+local TOTAL_LINES = 387810
+
+local function progress(c)
+  if c % 10000 == 0 then
+    xlua.progress(c, TOTAL_LINES)
+  end
+end
+
+function CornellMovieDialogs:load()
+  local lines = {}
+  local conversations = {}
+  local count = 0
+
+  print("-- Parsing Cornell movie dialogs data set ...")
+
+  for line in parsedLines(self.dir .. "/movie_lines.txt", MOVIE_LINES_FIELDS) do
+    lines[line.lineID] = line
+    line.lineID = nil
+    -- Remove unused fields
+    line.characterID = nil
+    line.movieID = nil
+    count = count + 1
+    progress(count)
+  end
+
+  for conv in parsedLines(self.dir .. "/movie_conversations.txt", MOVIE_CONVERSATIONS_FIELDS) do
+    local conversation = {}
+    local lineIDs = stringx.split(conv.utteranceIDs:sub(3, -3), "', '")
+    for i,lineID in ipairs(lineIDs) do
+      table.insert(conversation, lines[lineID])
+    end
+    table.insert(conversations, conversation)
+    count = count + 1
+    progress(count)
+  end
+
+  xlua.progress(TOTAL_LINES, TOTAL_LINES)
+
+  return conversations
+end