# Generative-Based Chatbot Using a Sequence-to-Sequence Algorithm

In [1]:
import re
import os
import csv
import math
import random
import codecs
import itertools
import unicodedata

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch import optim
from torch.jit import script, trace

## Load Datasets

In [2]:
# Name of the dataset directory that holds the raw corpus files.
corpus_name = 'cornell_movie_dialogs'
# Relative path of the corpus directory: the 'data' folder joined with the dataset name.
corpus = os.path.join('data', corpus_name)

In [3]:
def print_lines(file, n=10):
    """Print the first ``n`` raw lines of a file.

    The file is opened in binary mode, so each line is printed as a
    ``bytes`` object, including its trailing newline byte.

    Args:
        file: Path of the file to preview.
        n: Number of leading lines to print. ``None`` prints every line and
            a negative value follows slice semantics (everything except the
            last ``-n`` lines).
    """
    with open(file, 'rb') as data:
        if n is not None and n < 0:
            # A negative count depends on the total number of lines, so the
            # whole file has to be read before slicing.
            head = data.readlines()[:n]
        else:
            # Stop reading after n lines instead of loading the whole file
            # into memory just to show a short preview.
            head = list(itertools.islice(data, n))
    for line in head:
        print(line)

In [None]:
# Preview the first lines of the raw 'movie_lines.txt' file inside the corpus directory.
print_lines(os.path.join(corpus, 'movie_lines.txt'))

## Preprocess Datasets

In [6]:
def load_lines(file_name, fields):
    """Parse a ``' +++$+++ '``-separated file into a dict keyed by line ID.

    Every non-blank line is split on the ``' +++$+++ '`` separator and the
    resulting values are assigned to ``fields`` by position.

    Args:
        file_name: Path of the file to read (decoded as ISO-8859-1).
        fields: Field names in column order; must contain ``'lineID'``.

    Returns:
        A dict mapping each record's ``'lineID'`` value to a
        ``{field: value}`` dict. Values are stored exactly as split, so the
        final value on a line still carries its trailing newline.

    Raises:
        IndexError: If a non-blank line has fewer columns than ``fields``.
        KeyError: If ``fields`` does not contain ``'lineID'``.
    """
    lines = {}
    with open(file_name, 'r', encoding='iso-8859-1') as f:
        for line in f:
            # A blank line (for example a trailing empty line) holds no
            # record; skip it instead of failing on the positional lookup.
            if not line.strip():
                continue

            values = line.split(' +++$+++ ')
            line_obj = {field: values[i] for i, field in enumerate(fields)}
            lines[line_obj['lineID']] = line_obj
    return lines

In [7]:
def load_conversations(file_name, lines, fields):
    """Parse the conversations file and attach the referenced line records.

    Every non-blank line is split on the ``" +++$+++ "`` separator and the
    values are assigned to ``fields`` by position. All ``L<digits>`` tokens
    found in the ``'utteranceIDs'`` value are then looked up in ``lines``.

    Args:
        file_name: Path of the file to read (decoded as ISO-8859-1).
        lines: Mapping from line ID to line record, e.g. the result of
            ``load_lines``.
        fields: Field names in column order; must contain
            ``'utteranceIDs'``.

    Returns:
        A list with one dict per conversation. Each dict holds the parsed
        fields plus a ``'lines'`` key containing the looked-up line records
        in the order their IDs appear.

    Raises:
        IndexError: If a non-blank line has fewer columns than ``fields``.
        KeyError: If ``'utteranceIDs'`` is not in ``fields`` or a referenced
            line ID is missing from ``lines``.
    """
    # Compiled once up front; the pattern does not change between lines.
    utterance_id_pattern = re.compile(r'L[0-9]+')

    conversations = []
    with open(file_name, 'r', encoding='iso-8859-1') as f:
        for line in f:
            # A blank line (for example a trailing empty line) holds no
            # record; skip it instead of failing on the positional lookup.
            if not line.strip():
                continue

            values = line.split(" +++$+++ ")
            conv_obj = {field: values[i] for i, field in enumerate(fields)}

            # The ID list is stored as text, so pull the IDs out with a regex.
            line_ids = utterance_id_pattern.findall(conv_obj['utteranceIDs'])
            conv_obj['lines'] = [lines[line_id] for line_id in line_ids]

            conversations.append(conv_obj)
    return conversations

In [8]:
def extract_sentence_pairs(conversations):
    """Build [input, target] pairs from consecutive lines of each conversation.

    Args:
        conversations: Iterable of dicts, each with a ``'lines'`` list whose
            items are dicts holding a ``'text'`` string.

    Returns:
        A list of two-element lists ``[input_text, target_text]`` with both
        texts stripped of surrounding whitespace. A pair is left out when
        either side is empty after stripping.
    """
    pairs = []
    for conv in conversations:
        utterances = conv['lines']
        # Walk the conversation as (line, following line) couples.
        for current, following in zip(utterances, utterances[1:]):
            question = current['text'].strip()
            answer = following['text'].strip()

            if not question or not answer:
                continue
            pairs.append([question, answer])

    return pairs

In [9]:
# Path of the reformatted output file, stored inside the corpus directory.
data_file = os.path.join(corpus, 'formatted_movie_lines.txt')

# Column separator for the output file. In a regular (non-raw) string literal
# '\t' is already a tab character, so no unicode_escape decoding is needed.
delimiter = '\t'

In [None]:
# Field names for the two raw corpus files; the loaders assign them to the
# split values of each line by position.
MOVIE_LINES_FIELDS = ['lineID', 'characterID', 'movieID', 'character', 'text']
MOVIE_CONVERSATIONS_FIELDS = ['character1ID', 'character2ID', 'movieID', 'utteranceIDs']

# Start from empty containers; both are replaced by the loaders below.
lines = {}
conversations = []

print('\nProcessing corpus...')
lines = load_lines(
    os.path.join(corpus, 'movie_lines.txt'),
    MOVIE_LINES_FIELDS,
)

print('\nLoading conversations...')
conversations = load_conversations(
    os.path.join(corpus, 'movie_conversations.txt'),
    lines,
    MOVIE_CONVERSATIONS_FIELDS,
)

In [None]:
print('\nWriting newly formatted file...')
# newline='' hands line-ending control to the csv writer, so the
# lineterminator below is written unchanged on every platform instead of
# being translated by the text layer.
with open(data_file, 'w', encoding='utf-8', newline='') as output_file:
    writer = csv.writer(output_file, delimiter=delimiter, lineterminator='\n')
    # One row per [input, target] sentence pair.
    writer.writerows(extract_sentence_pairs(conversations))

In [None]:
# Preview the first lines of the formatted output file.
print('\nSample lines from file:')
print_lines(data_file)

---