In [1]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [0]:
from google.colab import auth
auth.authenticate_user()

In [0]:
from googleapiclient.discovery import build
drive_service = build('drive', 'v3')

In [4]:
# Download the file we just uploaded.
#
# Replace the assignment below with your file ID
# to download a different file.
#
# A file ID looks like: 1uBtlaggVyWshwcyP6kEI-y_W3P8D26sz
file_id = '1j08fBkO4VVq5uz33L2KwZf9xgtLAKt3b'

import io
from googleapiclient.http import MediaIoBaseDownload

request = drive_service.files().get_media(fileId=file_id)
downloaded = io.BytesIO()
downloader = MediaIoBaseDownload(downloaded, request)
done = False
while done is False:
  # _ is a placeholder for a progress object that we ignore.
  # (Our file is small, so we skip reporting progress.)
  _, done = downloader.next_chunk()

downloaded.seek(0)
#print('Downloaded file contents are: {}'.format(downloaded.read()))

0

In [0]:
import zipfile
import io
zf = zipfile.ZipFile(downloaded, "r")
zf.extractall()

In [39]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving selected_movies.txt to selected_movies.txt
User uploaded file "selected_movies.txt" with length 2495 bytes


In [0]:
import os, sys
import re

# hardcoded paths - change if necessary
root = ''

# this one you need to download from the dataset
full_dataset = 'cornell movie-dialogs corpus/movie_lines.txt'

output_destination = 'selected_conversations.txt'
movie_selection = 'selected_movies.txt'

# separator used in the original dataset
separator = ' +++$+++ '

# movie ID file
MOVIE_ID = 0

# full conversation dataset file
MOVIE_ID_FULL = 2
# reverse indexing
CHARACTER_NAME = -2
CHARACTER_LINE = -1

# keep just these characters for simplicity (and utf8 breaking)
repl = r'[^A-Za-z0-9()\,!\?\'\`\. ]'


# regex replace
def filter(string):
    return re.sub(repl, '', string)


# from a movie ID string (e.g. M134), output the number (134)
def number_from_id(id):
    return int(id[1:])


# read just movie ID's, rest is for readability
def read_selected(path_to_selected_movies):
    selected_movies = set()

    with open(path_to_selected_movies, 'r') as infile:
        for line in infile:
            parts = line.strip().split(separator)
            selected_movies.add(parts[MOVIE_ID].strip())
    return selected_movies


# select and write to output file
def select_and_write(path_to_full_dataset, path_to_output, selected_movies):
    movies = {}

    with open(path_to_full_dataset, 'r', encoding="ISO-8859-1") as infile:

        for line in infile:

            parts = line.strip().split(separator)

            if parts[MOVIE_ID_FULL].strip() not in selected_movies:
                continue

            # take data and transform to tuple
            ID = parts[MOVIE_ID_FULL]
            char_name = parts[CHARACTER_NAME]
            char_line = parts[CHARACTER_LINE]

            tup = (number_from_id(ID), char_name, char_line)

            # add to map
            if ID not in movies:
                movies[ID] = []
            movies[ID].append(tup)

    with open(path_to_output, 'w') as out:
        for movie in movies:
            # sort by line number
            dialogue = sorted(movies[movie], key=lambda t: t[0])
            for n, name, text in dialogue:
                out.write(filter(name) + ':\n' + filter(text) + '\n\n')


def run():
    # uses hardcoded paths
    selection = os.path.join(root, movie_selection)
    selected_movies = read_selected(selection)

    dataset = os.path.join(root, full_dataset)
    output = os.path.join(root, output_destination)
    select_and_write(dataset, output, selected_movies)

run()

In [46]:
!ls

cornell movie-dialogs corpus  __MACOSX			  selected_movies.txt
datalab			      selected_conversations.txt


**Zadatak 1: učitavanje podataka i batching**

In [0]:
import numpy as np
from collections import Counter

class DatasetPreprocessing():
  
  def __init__(self, batch_size, sequence_length):
    self.batch_size = batch_size
    self.sequence_length = sequence_length
    self.sorted_chars = []
    
    self.batch_pointer = 0
    self.x_batches = []
    self.y_batches = []
    
  # ...
  # Code is nested in class definition, indentation is not representative.
  # "np" stands for numpy.

  def preprocess(self, input_file):
      with open(input_file, "r") as f:
          data = f.read()#.decode("utf-8") # python 2
        
      # count and sort most frequent characters
      char_counter = Counter(data)
      self.sorted_chars = [item[0] for item in char_counter.most_common()]
      print("Sorted chars: ", self.sorted_chars)
      
      # self.sorted chars contains just the characters ordered descending by frequency
      self.char2id = dict(zip(self.sorted_chars, range(len(self.sorted_chars)))) 
      
      # reverse the mapping
      self.id2char = {k:v for v,k in self.char2id.items()}
      # convert the data to ids
      self.x = np.array(list(map(self.char2id.get, data)))
        
      return self.x

  def encode(self, sequence):
      # returns the sequence encoded as integers
      return np.array(list(map(self.char2id.get, sequence)))

  def decode(self, encoded_sequence):
      # returns the sequence decoded as letters
      return np.array(list(map(self.id2char.get, encoded_sequence)))

  # ...
  # ...
  # Code is nested in class definition, indentation is not representative.

  def create_minibatches(self):
     
    self.num_batches = int(len(self.x) / (self.batch_size * self.sequence_length)) # calculate the number of batches
    self.x_batches = []
    self.y_batches = []

    # Is all the data going to be present in the batches? Why?
    # What happens if we select a batch size and sequence length larger than the length of the data?

    #######################################
    #       Convert data to batches       #
    #######################################

    for i in range(self.num_batches):
      x_batch = self.x[
          i * self.batch_size * self.sequence_length : 
          (i+1) * self.batch_size * self.sequence_length
      ]
      y_batch = self.x[(i+1) * self.batch_size * self.sequence_length]

      self.x_batches.append([x_batch])
      self.y_batches.append([y_batch])
      
  # ...
  # Code is nested in class definition, indentation is not representative.
  def next_minibatch(self):
    # ...

    batch_x, batch_y = None, None
    # handling batch pointer & reset
    # new_epoch is a boolean indicating if the batch pointer was reset
    # in this function call
    return new_epoch, batch_x, batch_y

In [115]:
# Module test
data_module = DatasetPreprocessing(3, 3)
x = data_module.preprocess('selected_conversations.txt')
data_module.create_minibatches()

Sorted chars:  [' ', 'e', 't', 'o', '\n', 'a', 'n', 'i', 'h', 's', 'r', '.', 'l', 'u', 'd', 'y', 'm', ':', 'g', 'I', 'w', 'E', 'c', "'", 'f', 'R', 'A', 'k', 'b', 'p', 'N', ',', 'T', 'D', 'O', 'Y', 'v', '?', 'H', 'L', 'M', 'S', 'W', 'C', 'F', 'B', 'K', 'U', 'G', '!', 'J', 'P', 'V', 'j', 'x', 'Z', 'z', 'q', '0', '1', '2', 'X', '4', '5', '7', 'Q', '3', '9', '8', '6', '`']


In [140]:
batch_size = 500
seq_len = 3
num_batches = len(x) // (batch_size * seq_len)
print("Number of batches: ", num_batches)
batch_x = []
batch_y = []
for i in range(num_batches):
  
  # Construct whole batch
  total_batch = x[i * batch_size * seq_len : (i + 1) * batch_size * seq_len]
  
  mini_batch_x = []
  mini_batch_y = []
  
  # print(i, ". - total batch len: ", len(total_batch))
  for batch_index in range(batch_size):
    mini_batch_x.append(total_batch[batch_index * seq_len : (batch_index + 1) * seq_len])
    mini_batch_y.append(total_batch[batch_index * seq_len + 1: (batch_index + 1) * seq_len + 50])
  
  batch_x.append(mini_batch_x)
  batch_y.append(mini_batch_y)

Number of batches:  394


In [141]:
batch_x

[[array([43, 34, 25]),
  array([30, 21, 39]),
  array([19, 47, 41]),
  array([17,  4, 19]),
  array([ 0, 22,  5]),
  array([ 6, 23,  2]),
  array([ 0, 29, 10]),
  array([1, 2, 1]),
  array([ 6, 14,  0]),
  array([2, 3, 0]),
  array([28,  1,  0]),
  array([15,  3, 13]),
  array([10,  0,  8]),
  array([13,  9, 28]),
  array([ 5,  6, 14]),
  array([11, 11, 11]),
  array([ 0, 33,  5]),
  array([36,  7, 14]),
  array([23,  9,  0]),
  array([7, 6, 0]),
  array([18, 10,  1]),
  array([5, 2, 0]),
  array([9, 8, 5]),
  array([29,  1, 11]),
  array([ 4,  4, 39]),
  array([21, 21, 39]),
  array([34, 34, 17]),
  array([ 4, 26, 27]),
  array([2, 5, 0]),
  array([14,  1, 14]),
  array([ 1, 10,  3]),
  array([0, 5, 6]),
  array([ 9,  7, 12]),
  array([ 5,  0, 14]),
  array([ 3,  0, 16]),
  array([ 1, 27,  2]),
  array([ 1,  2, 11]),
  array([ 4,  4, 43]),
  array([34, 25, 30]),
  array([21, 39, 19]),
  array([47, 41, 17]),
  array([ 4, 30,  3]),
  array([31,  0, 19]),
  array([23, 16,  0]),
  array([

In [0]:
batch_y