In [9]:
import torch
import os

# Name of the corpus directory; it is joined under ./data to build the path
# that later cells use to locate the data file.
corpus_name = 'cornell movie-dialogs corpus'
corpus = os.path.join('./data', corpus_name)

In [2]:
# Sanity check: show the assembled corpus path.
print(corpus)

./data/cornell movie-dialogs corpus


In [3]:
def printLines(file, n=10):
    """Print the first ``n`` lines of ``file`` as raw bytes.

    The file is opened in binary mode, so every line is printed as a
    ``bytes`` repr, including its trailing newline byte.

    Args:
        file: Path of the file to preview.
        n: Number of leading lines to print. ``None`` prints every line; a
            negative value keeps list-slice semantics (all but the last
            ``-n`` lines).
    """
    from itertools import islice  # local import keeps this cell self-contained

    with open(file, 'rb') as datafile:
        if n is not None and n < 0:
            # Dropping lines from the end needs the total line count, so this
            # is the one case where the whole file still has to be read.
            head = datafile.readlines()[:n]
        else:
            # Stop reading after n lines instead of loading the whole corpus.
            head = list(islice(datafile, n))
    for line in head:
        print(line)

In [4]:
# Define path to new file
# (the file is only read in this cell: printLines opens it in 'rb' mode, so it
# must already exist inside the corpus directory).
datafile = os.path.join(corpus, "formatted_movie_lines.txt")
# Print a sample of lines
print("\nSample lines from file:")
printLines(datafile)


Sample lines from file:
b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\n"
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n"
b"You're asking me out.  That's so cute. What's your name again?\tForget it.\n"
b"No, no, it's my fault -- we didn't have a proper introduction ---\tCameron.\n"
b"Cameron.\tThe thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\n"
b"The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\tSeems like she could get a date easy enough...\n"
b'Why?\tUnso

In [5]:
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token


class Voc:
    """Vocabulary that maps words to integer indexes and back, with counts.

    Indexes 0-2 are reserved for the PAD, SOS and EOS tokens; real words are
    numbered from 3 upwards in the order they are first added.

    Attributes are plain dictionaries so callers can read them directly:
    ``word2index``, ``word2count``, ``index2word`` and the running total
    ``num_words`` (which includes the three default tokens).
    """

    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self._reset()

    def _reset(self):
        """(Re)initialize the lookup tables so they hold only the default tokens."""
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count SOS, EOS, PAD

    def addSentence(self, sentence):
        """Add every single-space-separated word of ``sentence`` to the vocabulary."""
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        """Register ``word`` under the next free index, or bump its count if known."""
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # Remove words below a certain count threshold
    def trim(self, min_count):
        """Keep only words seen at least ``min_count`` times and re-index them.

        Only the first call has an effect; later calls return immediately.
        Kept words are re-added through ``addWord``, so their indexes are
        reassigned from 3 upwards and their counts restart at 1.
        """
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = [word for word, count in self.word2count.items() if count >= min_count]

        total = len(self.word2index)
        # An empty vocabulary has no meaningful ratio; report 0 rather than
        # dividing by zero.
        ratio = len(keep_words) / total if total else 0.0
        print('keep_words {} / {} = {:.4f}'.format(len(keep_words), total, ratio))

        # Reinitialize dictionaries, then rebuild them from the kept words
        self._reset()
        for word in keep_words:
            self.addWord(word)

In [6]:
import unicodedata
import re 

# Word-count cutoff used by filterPair: a pair is kept only when both of its
# sentences have fewer than MAX_LENGTH space-separated words.
MAX_LENGTH = 10  # Maximum sentence length to consider

# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return ``s`` NFD-normalized with every nonspacing mark (category 'Mn') dropped.

    Characters that have no decomposition (e.g. CJK) pass through unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    """Normalize one sentence for vocabulary building.

    Steps, in order: lowercase and strip, drop accents via unicodeToAscii,
    put a space before each of ``.``, ``!`` and ``?``, replace every run of
    other non-letter characters with one space, then collapse whitespace.
    """
    text = s.lower().strip()
    text = unicodeToAscii(text)
    # Detach sentence punctuation so it becomes its own token.
    text = re.sub(r"([.!?])", r" \1", text)
    # Anything that is not a letter or kept punctuation turns into a space.
    text = re.sub(r"[^a-zA-Z.!?]+", r" ", text)
    collapsed = re.sub(r"\s+", r" ", text)
    return collapsed.strip()

# Read query/response pairs and return a voc object
def readVocs(datafile, corpus_name):
    """Read tab-separated sentence pairs and create an empty vocabulary.

    Args:
        datafile: Path of a UTF-8 text file with one pair per line, the
            fields separated by a tab.
        corpus_name: Name given to the new ``Voc`` instance.

    Returns:
        ``(voc, pairs)`` where ``voc`` is a fresh, still empty ``Voc`` and
        ``pairs`` is a list with one list of normalized fields per line.
    """
    print("Reading lines...")
    # Read the file and split into lines; the context manager closes the
    # handle deterministically instead of leaving it to garbage collection.
    with open(datafile, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    voc = Voc(corpus_name)
    return voc, pairs

# Returns True iff both sentences in a pair 'p' are under the MAX_LENGTH threshold
def filterPair(p):
    """Tell whether both sentences of ``p`` have fewer than MAX_LENGTH words.

    Words are counted by splitting on single spaces. The second sentence is
    only inspected when the first one passes.
    """
    # Input sequences need to preserve the last word for EOS token
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    return len(p[1].split(' ')) < MAX_LENGTH

# Filter pairs using filterPair condition
def filterPairs(pairs):
    """Return a new list holding only the pairs accepted by filterPair, in order."""
    return list(filter(filterPair, pairs))

# Using the functions defined above, return a populated voc object and pairs list
def loadPrepareData(corpus, corpus_name, datafile, save_dir):
    """Read, filter and index the sentence pairs stored in ``datafile``.

    Args:
        corpus: Not used by this function; kept in the signature for callers.
        corpus_name: Name handed to readVocs for the new vocabulary.
        datafile: Path of the tab-separated pairs file.
        save_dir: Not used by this function; kept in the signature for callers.

    Returns:
        ``(voc, pairs)``: the vocabulary populated from the length-filtered
        pairs, and those pairs.
    """
    print("Start preparing training data ...")
    voc, all_pairs = readVocs(datafile, corpus_name)
    print("Read {!s} sentence pairs".format(len(all_pairs)))
    short_pairs = filterPairs(all_pairs)
    print("Trimmed to {!s} sentence pairs".format(len(short_pairs)))
    print("Counting words...")
    for pair in short_pairs:
        # Query first, then response, so word indexes are assigned in file order.
        for sentence in (pair[0], pair[1]):
            voc.addSentence(sentence)
    print("Counted words:", voc.num_words)
    return voc, short_pairs


# Load/Assemble voc and pairs
# save_dir is passed through to loadPrepareData, which does not read it.
save_dir = os.path.join("data", "save")
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
# Print some pairs to validate
print("\npairs:")
for pair in pairs[:10]:
    print(pair)

Start preparing training data ...
Reading lines...
Read 221282 sentence pairs
Trimmed to 64271 sentence pairs
Counting words...
Counted words: 18008

pairs:
['there .', 'where ?']
['you have my word . as a gentleman', 'you re sweet .']
['hi .', 'looks like things worked out tonight huh ?']
['you know chastity ?', 'i believe we share an art instructor']
['have fun tonight ?', 'tons']
['well no . . .', 'then that s all you had to say .']
['then that s all you had to say .', 'but']
['but', 'you always been this selfish ?']
['do you listen to this crap ?', 'what crap ?']
['what good stuff ?', 'the real you .']


In [7]:
MIN_COUNT = 3    # Minimum word count threshold for trimming

def trimRareWords(voc, pairs, MIN_COUNT):
    """Trim rare words from ``voc`` and drop every pair that uses one.

    Args:
        voc: Vocabulary object providing ``trim(min_count)`` and a
            ``word2index`` mapping.
        pairs: Sequence of ``[input_sentence, output_sentence]`` items.
        MIN_COUNT: Minimum count a word needs to stay in the vocabulary
            (this parameter shadows the module-level constant of the same name).

    Returns:
        A new list with the pairs whose two sentences consist only of words
        still present in ``voc.word2index``. ``pairs`` itself is not modified.
    """
    # Trim words used under the MIN_COUNT from the voc
    voc.trim(MIN_COUNT)

    def _all_known(sentence):
        # True when every space-separated word survived the trim.
        return all(word in voc.word2index for word in sentence.split(' '))

    # Only keep pairs that do not contain trimmed word(s) in their input or output sentence
    keep_pairs = [pair for pair in pairs if _all_known(pair[0]) and _all_known(pair[1])]

    total = len(pairs)
    # With no input pairs there is no ratio to report; avoid dividing by zero.
    kept_ratio = len(keep_pairs) / total if total else 0.0
    print("Trimmed from {} pairs to {}, {:.4f} of total".format(total, len(keep_pairs), kept_ratio))
    return keep_pairs


# Trim voc and pairs
# NOTE(review): `pairs` is rebound to its own filtered result, so re-running
# this cell filters an already filtered list (voc.trim itself only acts once).
pairs = trimRareWords(voc, pairs, MIN_COUNT)

keep_words 7823 / 18005 = 0.4345
Trimmed from 64271 pairs to 53165, 0.8272 of total


In [10]:
import random
import itertools

def indexesFromSentence(voc, sentence):
    """Map each space-separated word of ``sentence`` to its index, then append EOS_token.

    Raises:
        KeyError: If a word is missing from ``voc.word2index``.
    """
    indexes = [voc.word2index[token] for token in sentence.split(' ')]
    indexes.append(EOS_token)
    return indexes


def zeroPadding(l, fillvalue=PAD_token):
    """Transpose a batch of sequences, padding the shorter ones with ``fillvalue``.

    Returns a list of tuples, one per position, each holding that position's
    item from every sequence.

    Example: zip_longest(*[[a, b, c, d, e], [k, l, m, n, o, p]]) yields
    (a, k), (b, l), (c, m), (d, n), (e, o), (pad, p).
    """
    columns = itertools.zip_longest(*l, fillvalue=fillvalue)
    return list(columns)


def binaryMatrix(l, value=PAD_token):
    """Build a 0/1 mask with the same nested shape as ``l``.

    Args:
        l: Iterable of token sequences.
        value: Token treated as padding. Defaults to PAD_token.

    Returns:
        A list of lists containing 0 where the token equals ``value`` and 1
        everywhere else.
    """
    # Compare against the `value` argument; it used to be ignored in favour of
    # the global PAD_token, so a custom padding value had no effect.
    return [[0 if token == value else 1 for token in seq] for seq in l]

# Returns padded input sequence tensor and lengths
def inputVar(l, voc):
    """Convert a batch of sentences into a padded index tensor and their lengths.

    Returns:
        ``(padVar, lengths)``: a LongTensor built from the zeroPadding output
        and a tensor holding each sentence's index count (EOS included).
    """
    indexes_batch = []
    seq_lengths = []
    for sentence in l:
        ids = indexesFromSentence(voc, sentence)
        indexes_batch.append(ids)
        seq_lengths.append(len(ids))
    lengths = torch.tensor(seq_lengths)
    padVar = torch.LongTensor(zeroPadding(indexes_batch))
    return padVar, lengths

# Returns padded target sequence tensor, padding mask, and max target length
def outputVar(l, voc):
    """Convert a batch of target sentences into padded indexes plus a padding mask.

    Returns:
        ``(padVar, mask, max_target_len)``: a LongTensor of padded indexes, a
        BoolTensor that is False at padding positions, and the longest index
        sequence length in the batch (EOS included).
    """
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max(map(len, indexes_batch))
    padded = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padded)
    mask = torch.BoolTensor(binaryMatrix(padded))
    return padVar, mask, max_target_len

# Returns all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
    """Turn a batch of sentence pairs into the tensors used for training.

    Pairs are ordered by input word count, longest first, before conversion.
    The caller's ``pair_batch`` is left untouched: the ordering is done on a
    sorted copy rather than in place.

    Args:
        voc: Vocabulary used to look up word indexes.
        pair_batch: List of ``[input_sentence, output_sentence]`` pairs.

    Returns:
        ``(inp, lengths, output, mask, max_target_len)`` as produced by
        inputVar and outputVar.
    """
    ordered = sorted(pair_batch, key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch = [pair[0] for pair in ordered]
    output_batch = [pair[1] for pair in ordered]
    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len


# Example for validation
# Draws small_batch_size pairs with random.choice (so with replacement, and no
# seed is set in this cell) and converts them into training tensors.
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

# Uncomment to inspect the tensors produced for the sample batch.
# print("input_variable:", input_variable)
# print("lengths:", lengths)
# print("target_variable:", target_variable)
# print("mask:", mask)
# print("max_target_len:", max_target_len)

In [11]:
# Rebuild voc and pairs from the data file and trim rare words in one cell, so
# the result does not depend on how often the earlier load/trim cells were run.
save_dir = os.path.join("data", "save")
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
pairs = trimRareWords(voc, pairs, MIN_COUNT)
# Print some pairs to validate
print("\npairs:")
for pair in pairs[:10]:
    print(pair)

Start preparing training data ...
Reading lines...
Read 221282 sentence pairs
Trimmed to 64271 sentence pairs
Counting words...
Counted words: 18008
keep_words 7823 / 18005 = 0.4345
Trimmed from 64271 pairs to 53165, 0.8272 of total

pairs:
['there .', 'where ?']
['you have my word . as a gentleman', 'you re sweet .']
['hi .', 'looks like things worked out tonight huh ?']
['have fun tonight ?', 'tons']
['well no . . .', 'then that s all you had to say .']
['then that s all you had to say .', 'but']
['but', 'you always been this selfish ?']
['do you listen to this crap ?', 'what crap ?']
['what good stuff ?', 'the real you .']
['wow', 'let s go .']


In [12]:
# Check the container type of `pairs` before handing it to train_test_split.
type(pairs)

list

In [13]:
from sklearn.model_selection import train_test_split
import numpy as np

In [24]:
# Hold out 20% of the pairs for testing. A fixed random_state makes the shuffled
# split reproducible across kernel restarts.
train_pair, test_pair = train_test_split(pairs, test_size=0.2, random_state=42)

In [26]:
# Show the sizes of the train and test splits.
print(len(train_pair),len(test_pair))

42532 10633


In [None]:
import torch.nn as nn


class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        """Store the embedding and build a bidirectional GRU.

        Args:
            hidden_size: Used as both the GRU input size and its hidden size.
            embedding: Embedding module stored on the instance; presumably its
                output dimension equals hidden_size -- TODO confirm.
            n_layers: Number of stacked GRU layers.
            dropout: Dropout value passed to the GRU; replaced by 0 when
                n_layers == 1.
        """
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding
        # Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size'
        #   because our input size is a word embedding with number of features == hidden_size
        self.gru = nn.GRU(
            hidden_size,
            hidden_size,
            n_layers,
            dropout=(0 if n_layers == 1 else dropout),
            bidirectional=True,
        )

    def forward(self, input_seq, input_lengths, hidden=None):

In [153]:
import torch.nn as nn
import torch.nn.functional as F

class net(nn.Module):
    """Tiny 8 -> 4 -> 1 fully connected network used for the gradient experiment.

    ``forward`` prints the hidden activations before and after dropout so the
    effect of the dropout layer can be inspected in the cell output.
    """

    def __init__(self):
        super().__init__()
        # A wider first layer (16 -> 8) is currently disabled.
        #self.fc1 = nn.Linear(16,8)
        self.fc2 = nn.Linear(in_features=8, out_features=4)
        self.dropout = nn.Dropout(p=0.0)
        self.fc3 = nn.Linear(in_features=4, out_features=1)

    def forward(self, x):
        """Apply fc2 + leaky ReLU, dropout, then fc3; prints both intermediates."""
        #x = F.relu(self.fc1(x))
        hidden = F.leaky_relu(self.fc2(x))
        print(hidden)
        dropped = self.dropout(hidden)
        print(dropped)
        return self.fc3(dropped)
        

In [154]:
import copy
# model2 is a deep copy of the freshly initialized model1, so both start from
# the same weights while owning separate parameter tensors.
model1 = net()
model2 = copy.deepcopy(model1)

In [155]:
# Print both parameter lists to confirm the copy starts from identical values.
print(list(model1.parameters()) )
print()
print(list(model2.parameters()) )

[Parameter containing:
tensor([[ 0.0970, -0.1958, -0.2963,  0.2747, -0.1513,  0.0121, -0.0785, -0.0550],
        [ 0.1266, -0.2362,  0.2394,  0.2484,  0.1562,  0.3189,  0.2477,  0.2006],
        [ 0.2438,  0.2632, -0.2070, -0.0879, -0.1918, -0.3018,  0.1680, -0.2655],
        [-0.2098, -0.1241,  0.0870,  0.1772, -0.2751,  0.1668,  0.1980, -0.1823]],
       requires_grad=True), Parameter containing:
tensor([-0.1614,  0.2714,  0.1185, -0.1376], requires_grad=True), Parameter containing:
tensor([[-0.4403,  0.4773, -0.1715,  0.4171]], requires_grad=True), Parameter containing:
tensor([-0.2298], requires_grad=True)]

[Parameter containing:
tensor([[ 0.0970, -0.1958, -0.2963,  0.2747, -0.1513,  0.0121, -0.0785, -0.0550],
        [ 0.1266, -0.2362,  0.2394,  0.2484,  0.1562,  0.3189,  0.2477,  0.2006],
        [ 0.2438,  0.2632, -0.2070, -0.0879, -0.1918, -0.3018,  0.1680, -0.2655],
        [-0.2098, -0.1241,  0.0870,  0.1772, -0.2751,  0.1668,  0.1980, -0.1823]],
       requires_grad=True), 

In [156]:
import torch

# Toy data for the experiment: one sample with 8 features (matching fc2's input
# size) and a single-value regression target.
x = torch.FloatTensor([[1,2,3,4,5,6,7,9]])
target = torch.FloatTensor([[0.5]])

In [157]:
import torch.optim as optim
# Forward pass for model1; `output` is reused by the update cell that follows.
output = model1(x)
print(output)
criterion = nn.MSELoss()
opt = optim.SGD(model1.parameters(), lr=0.01)
# Called after the forward pass above; as the cell's last expression its return
# value (the module) is what the notebook displays.
model1.train()

tensor([[-1.9746e-02,  7.8712e+00, -4.0677e-02, -2.5516e-03]],
       grad_fn=<LeakyReluBackward0>)
tensor([[-1.9746e-02,  7.8712e+00, -4.0677e-02, -2.5516e-03]],
       grad_fn=<LeakyReluBackward0>)
tensor([[3.5420]], grad_fn=<AddmmBackward>)


net(
  (fc2): Linear(in_features=8, out_features=4, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
  (fc3): Linear(in_features=4, out_features=1, bias=True)
)

In [158]:
# One SGD step on model1: print the parameters, compute the MSE loss on the
# `output` from the previous cell, backpropagate, update, print them again.
model1.zero_grad() 
print(list(model1.parameters()) )

loss = criterion(output, target)
print()
print(loss)
print()
loss.backward()
opt.step()
# Gradients are cleared after the update so they do not accumulate.
opt.zero_grad()
print(list(model1.parameters()) )

[Parameter containing:
tensor([[ 0.0970, -0.1958, -0.2963,  0.2747, -0.1513,  0.0121, -0.0785, -0.0550],
        [ 0.1266, -0.2362,  0.2394,  0.2484,  0.1562,  0.3189,  0.2477,  0.2006],
        [ 0.2438,  0.2632, -0.2070, -0.0879, -0.1918, -0.3018,  0.1680, -0.2655],
        [-0.2098, -0.1241,  0.0870,  0.1772, -0.2751,  0.1668,  0.1980, -0.1823]],
       requires_grad=True), Parameter containing:
tensor([-0.1614,  0.2714,  0.1185, -0.1376], requires_grad=True), Parameter containing:
tensor([[-0.4403,  0.4773, -0.1715,  0.4171]], requires_grad=True), Parameter containing:
tensor([-0.2298], requires_grad=True)]

tensor(9.2537, grad_fn=<MseLossBackward>)

[Parameter containing:
tensor([[ 0.0973, -0.1953, -0.2955,  0.2758, -0.1500,  0.0137, -0.0766, -0.0526],
        [ 0.0975, -0.2942,  0.1523,  0.1322,  0.0110,  0.1447,  0.0444, -0.0608],
        [ 0.2439,  0.2635, -0.2067, -0.0875, -0.1913, -0.3012,  0.1687, -0.2646],
        [-0.2100, -0.1246,  0.0862,  0.1762, -0.2764,  0.1653,  0.19

In [160]:
# Forward pass in train mode; this is the output that carries gradients.
model2.train()
output = model2(x)
print(output)
criterion = nn.MSELoss()
opt = optim.SGD(model2.parameters(), lr=0.01)

# Second forward pass without gradient tracking, in eval mode, to obtain a
# constant offset that is added to both the prediction and the target.
with torch.no_grad():
    model2.eval()
    output1 = model2(x)
    print(output1)
# Put the model back in train mode so the eval switch does not leak into
# whatever runs after this cell.
model2.train()
up = output1.item()
output2 = output1+output
print(output2)
# Rebuild the shifted target from its base value (the 0.5 defined together with
# x) instead of `target + up`: that form shifted the target again on every
# re-run and broke once `target` was rebound to another tensor further down.
target = torch.FloatTensor([[0.5]]) + up
print(target)

tensor([[-1.9746e-02,  7.8712e+00, -4.0677e-02, -2.5516e-03]],
       grad_fn=<LeakyReluBackward0>)
tensor([[-1.9746e-02,  7.8712e+00, -4.0677e-02, -2.5516e-03]],
       grad_fn=<LeakyReluBackward0>)
tensor([[3.5420]], grad_fn=<AddmmBackward>)
tensor([[-1.9746e-02,  7.8712e+00, -4.0677e-02, -2.5516e-03]])
tensor([[-1.9746e-02,  7.8712e+00, -4.0677e-02, -2.5516e-03]])
tensor([[3.5420]])
tensor([[7.0840]], grad_fn=<AddBackward0>)
tensor([[4.0420]])


In [161]:
# One SGD step on model2 using the shifted prediction (output2) and the shifted
# target from the previous cell; parameters are printed before and after.
model2.zero_grad() 
print(list(model2.parameters()) )

loss = criterion(output2, target)
print()
print(loss)
print()
loss.backward()
opt.step()
opt.zero_grad()
print(list(model2.parameters()) )

[Parameter containing:
tensor([[ 0.0970, -0.1958, -0.2963,  0.2747, -0.1513,  0.0121, -0.0785, -0.0550],
        [ 0.1266, -0.2362,  0.2394,  0.2484,  0.1562,  0.3189,  0.2477,  0.2006],
        [ 0.2438,  0.2632, -0.2070, -0.0879, -0.1918, -0.3018,  0.1680, -0.2655],
        [-0.2098, -0.1241,  0.0870,  0.1772, -0.2751,  0.1668,  0.1980, -0.1823]],
       requires_grad=True), Parameter containing:
tensor([-0.1614,  0.2714,  0.1185, -0.1376], requires_grad=True), Parameter containing:
tensor([[-0.4403,  0.4773, -0.1715,  0.4171]], requires_grad=True), Parameter containing:
tensor([-0.2298], requires_grad=True)]

tensor(9.2537, grad_fn=<MseLossBackward>)

[Parameter containing:
tensor([[ 0.0973, -0.1953, -0.2955,  0.2758, -0.1500,  0.0137, -0.0766, -0.0526],
        [ 0.0975, -0.2942,  0.1523,  0.1322,  0.0110,  0.1447,  0.0444, -0.0608],
        [ 0.2439,  0.2635, -0.2067, -0.0875, -0.1913, -0.3012,  0.1687, -0.2646],
        [-0.2100, -0.1246,  0.0862,  0.1762, -0.2764,  0.1653,  0.19

In [164]:
#print(list(model2.parameters()) )

In [165]:
#print(list(model1.parameters()) )

In [172]:
# Check how F.mse_loss reduces a 3-D input: with reduction='mean' the squared
# differences are averaged over all 3*3*4 elements.
# NOTE(review): this rebinds the global `target` that the earlier update cells
# read, so those cells no longer see the original target when re-run afterwards.
xx = torch.FloatTensor([[[1,1,1,1],[2,2,2,2],[3,3,3,3]],
                        [[1,1,1,1],[2,2,2,2],[3,3,3,3]],
                        [[1,1,1,1],[2,2,2,2],[3,3,3,3]]])
target = torch.FloatTensor([[[3,3,3,3],[4,4,4,4],[6,6,6,6]],
                            [[2,2,2,2],[3,3,3,3],[4,4,4,4]],
                            [[1,1,1,1],[2,2,2,2],[3,3,3,3]]])
mse = F.mse_loss(xx, target,reduction='mean')
print(mse)

tensor(2.2222)


In [None]:
# Scratch cell (never executed): builds an MSELoss module without binding it to a name.
nn.MSELoss()