In [0]:
!pip install -q textgenrnn
from google.colab import files
from textgenrnn import textgenrnn
from datetime import datetime
import os
import re

Using TensorFlow backend.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [0]:
%tensorflow_version 1.x

In [0]:
model_cfg = {
    'word_level': False,   # set to True if want to train a word-level model (requires more data and smaller max_length)
    'rnn_size': 128,   # number of LSTM cells of each layer (128/256 recommended)
    'rnn_layers': 3,   # number of LSTM layers (>=2 recommended)
    'rnn_bidirectional': False,   # consider text both forwards and backward, can give a training boost
    'max_length': 30,   # number of tokens to consider before predicting the next (20-40 for characters, 5-10 for words recommended)
    'max_words': 10000,   # maximum number of words to model; the rest will be ignored (word-level model only)
}

train_cfg = {
    'line_delimited': True,   # set to True if each text has its own line in the source file
    'num_epochs': 15,   # set higher to train the model for longer
    'gen_epochs': 5,   # generates sample text from model after given number of epochs
    'train_size': 0.8,   # proportion of input data to train on: setting < 1.0 limits model from learning perfectly
    'dropout': 0.0,   # ignore a random proportion of source tokens each epoch, allowing model to generalize better
    'validation': False,   # If train__size < 1.0, test on holdout dataset; will make overall training slower
    'is_csv': False   # set to True if file is a CSV exported from Excel/BigQuery/pandas
}

Add Text

In [0]:
import pandas as pd
import numpy as np

In [0]:
dataset = pd.read_csv("office.csv")

In [0]:
dataset.head()

Unnamed: 0,id,season,episode,scene,line_text,speaker,deleted
0,1,1,1,1,All right Jim. Your quarterlies look very good...,Michael,False
1,2,1,1,1,"Oh, I told you. I couldn't close it. So...",Jim,False
2,3,1,1,1,So you've come to the master for guidance? Is ...,Michael,False
3,4,1,1,1,"Actually, you called me in here, but yeah.",Jim,False
4,5,1,1,1,"All right. Well, let me show you how it's done.",Michael,False


In [0]:
dataset.shape

(59909, 7)

In [0]:
corpus = []
for i in range(0,59908):
  if(dataset["speaker"][i]=='Michael'):
    d = dataset["line_text"][i]
    d=re.sub(r"\W+|_", " ", d)
    d = d.lower()
    print(d)
    corpus.append(d)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
meredith i would like you to pretend that you are from abu dhabi 
i am ashamed at your naked face i must cover it with my jacket covers meredith s face with his jacket you are now sexy in your culture 
i have my passport pats jacket pocket 
i have my per diem holds up money i already know what i am going to spend this on i am going to buy a sweater 
well i ll just i ll use different money for that 
and business class air like a five star hotel in the sky nothing but the best actually better than a five star hotel cause you get a big cushy seat and you sit in a row of people and to eat whatever the mind can imagine i think i am going to have a filet with mushroom sauce 
well that is all in the past 
they have one of those 
 in a singsong voice lets do this 
two are for souvenirs 
i do it s right here indicates to waist 
no i don t want to wear a bra 
stop it stop it 
okay where is my translator 
there he is 
yeah baby that

In [0]:
print(type(corpus))

<class 'list'>


In [0]:
with open('listfile.txt', 'w') as filehandle:
    for listitem in corpus:
        filehandle.write('%s,\n' % listitem)

Text Generation

In [0]:
file_name = "listfile.txt"
model_name = 'colaboratory'   # change to set file name of resulting trained models/texts

In [0]:
textgen = textgenrnn(name=model_name)

train_function = textgen.train_from_file if train_cfg['line_delimited'] else textgen.train_from_largetext_file

train_function(
    file_path=file_name,
    new_model=True,
    num_epochs=train_cfg['num_epochs'],
    gen_epochs=train_cfg['gen_epochs'],
    batch_size=1024,
    train_size=train_cfg['train_size'],
    dropout=train_cfg['dropout'],
    validation=train_cfg['validation'],
    is_csv=train_cfg['is_csv'],
    rnn_layers=model_cfg['rnn_layers'],
    rnn_size=model_cfg['rnn_size'],
    rnn_bidirectional=model_cfg['rnn_bidirectional'],
    max_length=model_cfg['max_length'],
    dim_embeddings=100,
    word_level=model_cfg['word_level'])

12,136 texts collected.
Training new model w/ 3-layer, 128-cell LSTMs
Training on 735,983 character sequences.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
####################
Temperature: 0.2
####################
i m not gonna be a good time i want to say that i want to see a lot of the first thing and i want to say the world i want to see you this is a second ,

i m not gonna be the door ,

i don t know what i m going to start and i want you to stop the world and then i want to do this ,

####################
Temperature: 0.5
####################
what s open to the time that s uh my heal ,

 singing a dwight the anicul and i don t know i m gonna say that i actually ,

oh ok you re all thing about it to your team and then he s beautiful for some sort of my bottle of back good ,

####################
Temperature: 1.0
####################
do i thret him ,

no i said there s a ppap doines make sudea that is your backs blowone ,

ok not we sat where i thright well we ll have in

In [0]:
textgen = textgenrnn(weights_path='colaboratory_weights.hdf5',
                       vocab_path='colaboratory_vocab.json',
                       config_path='colaboratory_config.json')

generated_characters = 30

textgen.generate_samples(30)

####################
Temperature: 0.2
####################
that s what i was thinking we re going to be a lot of the bathroom ,

 in the office who s the way they didn t have to start and then i was thinking what i was thinking ,

and the same time to stay and i am going to start and then i was thinking about my office is that so i m sorry i m sorry i m sorry i m going to be a lot of pam s head ,

i don t know i know i know i know i know i know what i want you to do we re going to say that s a good time to say that s a million dollar and the same time i was thinking about the way the most important the new company ,

okay ,

yeah i m going to take that ,

okay ,

i don t know i m going to wait to the same thing is a big day i was just sorry i m not going to start and then the same time ,

you know what i m going to take that what is that ,

i don t know i know i know i know i know i know i know i know i know i know i know i know i know i know i know i know i know i know i know i know i

Whatsapp Chat

In [0]:
import pandas as pd
import numpy as np
chat_data = pd.read_csv("chat.csv")

In [0]:
chat_data.head()

Unnamed: 0,Chat
0,"[12/03/19, 8:32:37 AM] Vidhi: ‎Messages to thi..."
1,"[12/03/19, 8:32:37 AM] Neel Doshi: Heyyy"
2,"[12/03/19, 8:39:16 AM] Vidhi: Heyyy"
3,Good morning
4,"[12/03/19, 8:39:39 AM] Neel Doshi: Waddup"


In [0]:
chat_data.shape

(67928, 1)

In [0]:
chat_corpus = []
for i in range(0,67928):
  d=chat_data["Chat"][i]
  d=re.sub(r"\W+|_", " ", d)
  print(d)
  chat_corpus.append(d)

In [0]:
# Python program to Remove all  
# digits from a list of string 
import re 
  
def remove(list): 
    pattern = 'PM'
    list = [re.sub(pattern, '', i) for i in list] 
    return list
  
 
chat_corpus = remove(chat_corpus) 

In [0]:
print(type(new_chat_corpus))

In [0]:
new_chat_corpus = []
for i in chat_corpus:
  i=i.strip()
  new_chat_corpus.append(i)

Write to text file

In [0]:
with open('chatfile.txt', 'w') as filehandle:
    for listitem in new_chat_corpus:
        filehandle.write('%s\n' % listitem)

Text Gen

In [0]:
file_name = "chatfile.txt"
new_model_name = 'whatsapp'   # change to set file name of resulting trained models/texts

In [0]:
textgen = textgenrnn(name=new_model_name)

train_function = textgen.train_from_file if train_cfg['line_delimited'] else textgen.train_from_largetext_file

train_function(
    file_path=file_name,
    new_model=True,
    num_epochs=train_cfg['num_epochs'],
    gen_epochs=train_cfg['gen_epochs'],
    batch_size=1024,
    train_size=train_cfg['train_size'],
    dropout=train_cfg['dropout'],
    validation=train_cfg['validation'],
    is_csv=train_cfg['is_csv'],
    rnn_layers=model_cfg['rnn_layers'],
    rnn_size=model_cfg['rnn_size'],
    rnn_bidirectional=model_cfg['rnn_bidirectional'],
    max_length=model_cfg['max_length'],
    dim_embeddings=100,
    word_level=model_cfg['word_level'])

NameError: ignored