In [1]:
import pandas as pd
import numpy as np
import torch
import time
import string

from collections import Counter

# Задание 1

## 1.1. Подготовка данных

In [2]:
df = pd.read_csv('simpsons_script_lines.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158271 entries, 0 to 158270
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  158271 non-null  int64  
 1   episode_id          158271 non-null  int64  
 2   number              158271 non-null  int64  
 3   raw_text            158271 non-null  object 
 4   timestamp_in_ms     158271 non-null  object 
 5   speaking_line       158271 non-null  object 
 6   character_id        140750 non-null  object 
 7   location_id         157864 non-null  float64
 8   raw_character_text  140749 non-null  object 
 9   raw_location_text   157863 non-null  object 
 10  spoken_words        132112 non-null  object 
 11  normalized_text     132087 non-null  object 
 12  word_count          132112 non-null  object 
dtypes: float64(1), int64(3), object(9)
memory usage: 15.7+ MB


  df = pd.read_csv('simpsons_script_lines.csv')


In [3]:
df = df[df['normalized_text'].notna()]
df.head()

Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count
0,9549,32,209,"Miss Hoover: No, actually, it was a little of ...",848000,True,464.0,3.0,Miss Hoover,Springfield Elementary School,"No, actually, it was a little of both. Sometim...",no actually it was a little of both sometimes ...,31
1,9550,32,210,Lisa Simpson: (NEAR TEARS) Where's Mr. Bergstrom?,856000,True,9.0,3.0,Lisa Simpson,Springfield Elementary School,Where's Mr. Bergstrom?,wheres mr bergstrom,3
2,9551,32,211,Miss Hoover: I don't know. Although I'd sure l...,856000,True,464.0,3.0,Miss Hoover,Springfield Elementary School,I don't know. Although I'd sure like to talk t...,i dont know although id sure like to talk to h...,22
3,9552,32,212,Lisa Simpson: That life is worth living.,864000,True,9.0,3.0,Lisa Simpson,Springfield Elementary School,That life is worth living.,that life is worth living,5
4,9553,32,213,Edna Krabappel-Flanders: The polls will be ope...,864000,True,40.0,3.0,Edna Krabappel-Flanders,Springfield Elementary School,The polls will be open from now until the end ...,the polls will be open from now until the end ...,33


In [4]:
letters = string.ascii_lowercase
letters_count = len(letters)
first_letter = ord('a')

def get_cipher_text(text, shift):
  shift = shift % letters_count
  res = ''
  for char in text:
    if char.isalpha():
      res += chr(first_letter + (ord(char) - first_letter + shift) % letters_count)
    else:
      res += char

  return res

In [35]:
shift = 3 # значение сдвига

In [36]:
df['cipher_text'] = df['normalized_text'].apply(lambda x: get_cipher_text(x, shift))
df['cipher_text'][:5]

Unnamed: 0,cipher_text
0,qr dfwxdoob lw zdv d olwwoh ri erwk vrphwlphv ...
1,zkhuhv pu ehujvwurp
2,l grqw nqrz dowkrxjk lg vxuh olnh wr wdon wr k...
3,wkdw olih lv zruwk olylqj
4,wkh sroov zloo eh rshq iurp qrz xqwlo wkh hqg ...


In [37]:
CHARS = set('abcdefghijklmnopqrstuvwxyz ')
INDEX_TO_CHAR = ['none'] + [w for w in CHARS]
CHAR_TO_INDEX = {w: i for i, w in enumerate(INDEX_TO_CHAR)}
MAX_LEN = 50

def get_tensor(phrases):
  text = [[c for c in ph] for ph in phrases if type(ph) is str]

  X = torch.zeros((len(text), MAX_LEN), dtype=int)
  for i in range(len(text)):
    for j, w in enumerate(text[i]):
        if j >= MAX_LEN: break
        X[i, j] = CHAR_TO_INDEX.get(w, CHAR_TO_INDEX['none'])

  return X

In [38]:
X = get_tensor(df['cipher_text'].tolist()) # зашифрованный текст (символы сдвинуты на 3)
y = get_tensor(df['normalized_text'].tolist()) # исходный текст

## 1.2. Обучение модели

In [39]:
class Network(torch.nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.embedding = torch.nn.Embedding(28, 30)
        self.rnn = torch.nn.RNN(30, 128)
        self.out = torch.nn.Linear(128, 28)

    def forward(self, sentences, state=None):
        x = self.embedding(sentences)
        x, s = self.rnn(x)
        return self.out(x)

In [40]:
model = Network()

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=.05)

In [41]:
for ep in range(3):
    start = time.time()
    train_loss = 0.
    train_passed = 0

    for i in range(int(len(X) / 100)):
        X_batch = X[i * 100:(i + 1) * 100]
        Y_batch = y[i * 100:(i + 1) * 100].flatten()

        optimizer.zero_grad()
        answers = model.forward(X_batch)
        answers = answers.view(-1, len(INDEX_TO_CHAR))
        loss = criterion(answers, Y_batch)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        train_passed += 1

    print("Epoch {}. Time: {:.3f}, Train loss: {:.3f}".format(ep, time.time() - start, train_loss / train_passed))

Epoch 0. Time: 53.250, Train loss: 0.203
Epoch 1. Time: 53.523, Train loss: 0.015
Epoch 2. Time: 53.413, Train loss: 0.009


In [51]:
def generate_sentence(word):
    sentence = list(word.lower())
    sentence = [CHAR_TO_INDEX.get(s, 0) for s in sentence]

    answers = model.forward(torch.tensor(sentence))
    probas, indices = answers.topk(1)
    return ''.join([INDEX_TO_CHAR[ind.item()] for ind in indices.flatten()])

Проверка на уже знакомых моделе данных:

In [52]:
i = 1

print(f"Исходный текст: {df['normalized_text'][i]}")
print(f"Зашифрованный текст: {df['cipher_text'][i]}")
print(f"Результат работы модели: {generate_sentence(df['cipher_text'][i])}")

Исходный текст: wheres mr bergstrom
Зашифрованный текст: zkhuhv pu ehujvwurp
Результат работы модели: wheres mr bergstrom


И новая фраза для примера:

In [57]:
text = 'everyone has the right to be like themselves'

print(f"Исходный текст: {text}")
print(f"Зашифрованный текст: {get_cipher_text(text, shift)}")
print(f"Результат работы модели: {generate_sentence(get_cipher_text(text, shift))}")

Исходный текст: everyone has the right to be like themselves
Зашифрованный текст: hyhubrqh kdv wkh uljkw wr eh olnh wkhpvhoyhv
Результат работы модели: everyone has the right to be like themselves


# Задание 2

In [6]:
all_words = []
for text in df['normalized_text']:
  all_words += text.split()

vocabulary = Counter(all_words)

dict_word_to_index = {}
dict_index_to_word = {}
for index, key in enumerate(vocabulary):
  dict_word_to_index[key] = index
  dict_index_to_word[index] = key

all_words_index = [dict_word_to_index[i] for i in all_words]

In [7]:
class My_network(torch.nn.Module):
    def __init__(self, num_embeddings, embedding_size):
        super(My_network, self).__init__()
        self.embedding = torch.nn.Embedding(num_embeddings, embedding_size)
        self.rnn = torch.nn.RNN(embedding_size, 128, batch_first=True)
        self.out = torch.nn.Linear(128, num_embeddings)

    def forward(self, sentences, state):
        x = self.embedding(sentences)
        x, s = self.rnn(x, state)
        return self.out(x), s

In [9]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [13]:
batch_size = 100
sequential_size = 10
embedding_size = 64
num_embeddings = len(vocabulary)

model = My_network(num_embeddings, embedding_size)
model.to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=.001)

In [14]:
for ep in range(10):
    start = time.time()
    train_loss = 0.
    train_passed = 0

    model.train()
    state = torch.zeros(1, batch_size, 128).to(device)

    batches_counts = int(len(all_words_index) / (batch_size*sequential_size))
    X = all_words_index[:batches_counts*batch_size*sequential_size]
    y = np.zeros_like(X)
    y[:-1] = X[1:]
    y[-1] = X[0]

    X = np.reshape(X, (batches_counts*batch_size, sequential_size))
    y = np.reshape(y, (batches_counts*batch_size, sequential_size))

    for i in range(int(len(all_words_index) / batch_size)):
        X_batch = torch.tensor(X[i:i+batch_size, :]).to(device)
        Y_batch = torch.tensor(y[i:i+batch_size, :]).to(device)

        optimizer.zero_grad()
        answers, state = model.forward(X_batch, state)

        loss = criterion(answers.transpose(1, 2), Y_batch)
        state = state.detach()
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_passed += 1

    print("Epoch {}. Time: {:.3f}, Train loss: {:.3f}".format(ep, time.time() - start, train_loss / train_passed))

Epoch 0. Time: 369.499, Train loss: 2.245
Epoch 1. Time: 368.137, Train loss: 2.224
Epoch 2. Time: 367.854, Train loss: 2.206
Epoch 3. Time: 367.379, Train loss: 2.126
Epoch 4. Time: 367.181, Train loss: 2.049
Epoch 5. Time: 366.864, Train loss: 1.988
Epoch 6. Time: 366.790, Train loss: 1.911
Epoch 7. Time: 366.714, Train loss: 1.860
Epoch 8. Time: 366.659, Train loss: 1.818
Epoch 9. Time: 366.627, Train loss: 1.763


In [64]:
def generate_sentence(model, list_of_words, dict_word_to_index, dict_index_to_word, seq_size=10):
  model.eval()
  state = torch.zeros(1, 1, 128).to(device)

  for word in list_of_words:
    w = torch.tensor([[dict_word_to_index[word]]]).to(device)
    out, state = model(w, state)

  next_word = out[0].topk(1)[1]
  list_of_words.append(dict_index_to_word[next_word.item()])

  for i in range(seq_size):
    w = torch.tensor([[next_word]]).to(device)
    out, state = model(w, state)

    next_word = out[0].topk(1)[1]
    list_of_words.append(dict_index_to_word[next_word.item()])

  return ' '.join(list_of_words)

In [60]:
list_of_words = ['that', 'life']
generate_sentence(model, list_of_words, dict_word_to_index, dict_index_to_word)

'that life i feel like if it were a cartoon come on marge'

In [63]:
list_of_words = ['i', 'dont', 'know']
generate_sentence(model, list_of_words, dict_word_to_index, dict_index_to_word)

'i dont know if this is just the beginning ooh our less gifted employees'