<a href="https://colab.research.google.com/github/nadeneAmara/CaptBot/blob/master/CaptBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Mount Google Drive at /content/drive; later cells write the model checkpoint
# there and read the saved model and tokenizer back from it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import nltk
# Punkt models back sent_tokenize / word_tokenize used in the training cell.
# NLTK 3.8.2 and later load them from the 'punkt_tab' package, older releases
# from 'punkt'; fetch both so the notebook runs on either.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from __future__ import division
import string
import nltk, re, pprint
from nltk import tokenize
from nltk import word_tokenize
from urllib import request
import json
import time
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from pickle import dump
import requests

# Retrieve comments from pushshift reddit database
def request_comments(**kwargs):
  """Query the Pushshift comment-search endpoint and return the comments.

  Args:
    **kwargs: Sent unchanged as URL query parameters (the caller in this
      notebook passes subreddit, size, before, sort and sort_type).

  Returns:
    The value stored under the 'data' key of the JSON response.

  Raises:
    requests.HTTPError: the server answered with a 4xx/5xx status.
    requests.Timeout: no response arrived within 30 seconds.
    KeyError: the JSON payload has no 'data' key.
  """
  # A timeout keeps a stalled connection from hanging the notebook forever.
  response = requests.get("https://api.pushshift.io/reddit/comment/search/",
                          params=kwargs, timeout=30)
  # Fail loudly on HTTP errors instead of trying to parse an error page.
  response.raise_for_status()
  return response.json()['data']

# Get text from comments
def get_comment_set(comment_number):
  """Download recent comment bodies from a fixed list of subreddits.

  Comments are fetched newest-first in pages of 100, so each subreddit
  contributes `comment_number` rounded up to a whole number of pages
  (fewer if the API runs out of results).

  Args:
    comment_number: Number of comments wanted per subreddit.

  Returns:
    A single string with all comment bodies, separated by newlines.
  """
  sr_names = ["aww","wholesomememes","funny", "interestingasfuck", "EarthPorn"]
  page_size = 100
  bodies = []
  for sr_name in sr_names:
    # Restart pagination for every subreddit; requests drops None-valued
    # params, so the first page starts at the newest comment.
    before = None
    comments_left = comment_number
    while comments_left > 0:
      comments = request_comments(subreddit=sr_name, size=page_size, before=before,
                                  sort='desc', sort_type='created_utc')
      if not comments:
        # Nothing older is available; further requests would be wasted.
        break
      for comment in comments:
        bodies.append(comment['body'])
      # Oldest comment of this page becomes the cursor for the next page.
      before = comments[-1]['created_utc']
      comments_left -= page_size
      # Pause between requests to stay polite to the API.
      time.sleep(2)
  # Join with a separator so the last word of one comment does not fuse
  # with the first word of the next.
  return "\n".join(bodies)

# Save comments to file, line by line
def save_doc(sequences, filename):
    """Write `sequences` to `filename`, one sequence per line.

    Args:
        sequences: Iterable of strings; joined with '\\n' (no trailing newline).
        filename: Path of the file to create or overwrite.
    """
    dataset = '\n'.join(sequences)
    # The context manager closes the handle even if write() raises.
    with open(filename, 'w') as file:
        file.write(dataset)

# Load comments from file
def load_doc(filename):
    """Return the entire contents of the text file `filename` as one string.

    Args:
        filename: Path of the file to read.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read()

def load_text(comment_number=200):
  """Download comments, lowercase them and split them into sentences.

  Args:
    comment_number: Comments to request per subreddit; forwarded to
      get_comment_set. Defaults to 200.

  Returns:
    List of lowercase sentence strings produced by NLTK's sent_tokenize.
    The number of sentences is also printed.
  """
  raw = get_comment_set(comment_number)
  sentences = tokenize.sent_tokenize(raw.lower())
  print(len(sentences))
  return sentences

# Build growing word-prefix sequences from each sentence and save them to reddit_comments.txt
def get_sequences(raw, filename='reddit_comments.txt'):
    """Turn sentences into growing word-prefix training sequences.

    Each sentence is word-tokenized, non-alphabetic tokens are dropped, and
    every prefix of two or more words is emitted as a space-joined string.
    The last word of a sequence is meant to be the prediction target and the
    words before it the context.

    Args:
        raw: Iterable of sentence strings.
        filename: File the sequences are written to, one per line.

    Returns:
        List of sequence strings (the same data that was written to disk).
    """
    sequences = []
    for sentence in raw:
        tokens = [token for token in word_tokenize(sentence) if token.isalpha()]
        # Prefix lengths run from 2 up to the full sentence: shorter ones would
        # have no context word, and stopping earlier would mean the final word
        # of a sentence is never used as a target.
        for end in range(2, len(tokens) + 1):
            sequences.append(' '.join(tokens[:end]))
    save_doc(sequences, filename)
    return sequences

# Map our words to integer values and split sequences into inputs (x) and one-hot next-word targets (y)
def prepare_sequences(sequences):
    """Encode text sequences as padded integer arrays and split into x / y.

    A Keras Tokenizer is fitted on `sequences` and pickled to 'tokenizer.pkl'
    in the current working directory so the same word-to-index mapping can be
    reused at generation time.

    Args:
        sequences: List of space-separated word sequences.

    Returns:
        Tuple (x, y, len_sequence, num_vocab):
            x: 2-D integer array, every column of the padded sequences but the last.
            y: One-hot encoding (num_vocab classes) of the last column.
            len_sequence: Number of columns in x.
            num_vocab: Size of the tokenizer vocabulary plus one.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sequences)
    encoded = tokenizer.texts_to_sequences(sequences)
    maxLen = max(len(seq) for seq in encoded)
    # +1 because word indices start at 1; index 0 is the padding value.
    num_vocab = len(tokenizer.word_index) + 1
    # NOTE(review): padding to maxLen-1 (with pad_sequences' default
    # truncating='pre') drops the first word of the longest sequences.
    # Kept as-is because changing it would change len_sequence and therefore
    # the input length of models trained with this function.
    input_sequences = np.array(pad_sequences(encoded, maxlen=maxLen-1, padding='pre'))
    x = input_sequences[:, :-1]
    y = to_categorical(input_sequences[:, -1], num_classes=num_vocab)
    len_sequence = x.shape[1]
    # Save the tokenizer; the context manager makes sure the file is flushed
    # and closed.
    with open('tokenizer.pkl', 'wb') as handle:
        dump(tokenizer, handle)
    return x, y, len_sequence, num_vocab

def create_model(maxLen, num_features):
    """Build and compile the next-word prediction network.

    The stack is: 50-dimensional embedding -> LSTM(100) -> Dense(100, relu)
    -> Dense(num_features, softmax), compiled with categorical cross-entropy
    loss and the Adam optimizer.

    Args:
        maxLen: Length of each input sequence.
        num_features: Vocabulary size; used as the embedding input dimension
            and as the number of output classes.

    Returns:
        The compiled Sequential model.
    """
    layers = [
        Embedding(num_features, 50, input_length=maxLen),
        LSTM(100),
        Dense(100, activation='relu'),
        Dense(num_features, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(optimizer='adam', loss='categorical_crossentropy')
    return network

# Set to True to re-download comments and rebuild 'reddit_comments.txt'
# (slow: it queries the Pushshift API). By default the cached file is reused.
REBUILD_DATASET = False
if REBUILD_DATASET:
    get_sequences(load_text())

filename = 'reddit_comments.txt'
sequences = load_doc(filename).split('\n')
x, y, maxLen, num_features = prepare_sequences(sequences)
model = create_model(maxLen, num_features)

path = "/content/drive/MyDrive/captbot.ckpt"
# Create a callback that saves the full model (save_weights_only=False, so not
# just the weights) to Drive during training; the generation cell restores it
# with load_model.
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=path,
                                                 save_weights_only=False,
                                                 verbose=1)
model.fit(x, y, epochs=150, callbacks=[cp_callback])


# Generate text with the trained model

In [None]:
import tensorflow as tf
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import random

def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    """Greedily generate `n_words` words that continue `seed_text`.

    At every step the running text is encoded, padded/truncated to
    `seq_length`, and the word with the highest predicted probability is
    appended.

    Args:
        model: Trained Keras model whose predict() returns one probability
            per vocabulary index.
        tokenizer: Fitted Keras Tokenizer used for training.
        seq_length: Input length the model expects.
        seed_text: Text the generation starts from.
        n_words: Number of words to generate.

    Returns:
        The generated words joined by spaces (the seed text is not included).
    """
    # Build the index -> word lookup once instead of scanning the whole
    # vocabulary for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pad, or truncate from the front, to the fixed input length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # Sequential.predict_classes was removed in TensorFlow 2.6; taking the
        # argmax of predict() is the equivalent that works on old and new versions.
        probabilities = model.predict(encoded, verbose=0)
        yhat = int(probabilities[0].argmax())
        # Index 0 (padding) has no word; fall back to an empty string.
        out_word = index_to_word.get(yhat, '')
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)
 
def generate_seq_diverse(model, tokenizer, seq_length, seed_text, n_words, min_score=0.6):
  """Generate `n_words` words after `seed_text` with randomized word choice.

  At every step a random threshold between `min_score` and 1.0 (in steps of
  0.001) is drawn. Every vocabulary word whose probability is at least
  threshold * (highest word probability) is a candidate, and one candidate
  is picked uniformly at random.

  Args:
    model: Trained Keras model whose predict() returns one probability per
      vocabulary index.
    tokenizer: Fitted Keras Tokenizer used for training.
    seq_length: Input length the model expects.
    seed_text: Text the generation starts from.
    n_words: Number of words to generate.
    min_score: Lowest allowed threshold as a fraction of the top score
      (between 0 and 1). Lower values give more varied output.

  Returns:
    The seed text followed by the generated words, space-separated.
  """
  in_text = seed_text
  vocabulary = list(tokenizer.word_index.items())
  # generate a fixed number of words
  for _ in range(n_words):
    # encode the text as integer
    encoded = tokenizer.texts_to_sequences([in_text])[0]
    # pad, or truncate from the front, to the fixed input length
    encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
    # verbose=0 keeps Keras from printing a progress bar for every word.
    probabilities = model.predict(encoded, verbose=0)[0]
    # Only real words are considered; index 0 (padding) is not in word_index.
    top_score = max(probabilities[index] for _, index in vocabulary)
    rand_value = random.randint(int(min_score * 1000), 1000)
    threshold = rand_value / 1000 * top_score
    candidates = [word for word, index in vocabulary
                  if probabilities[index] >= threshold]
    in_text = in_text + ' ' + random.choice(candidates)
  return in_text

# load model 
new_model = tf.keras.models.load_model("/content/drive/My Drive/captbot.ckpt")
# load tokenizer
# NOTE(review): prepare_sequences writes 'tokenizer.pkl' to the working
# directory, so it has to be copied to Drive before this cell can find it.
# pickle can execute arbitrary code on load; only open files you created.
with open('/content/drive/My Drive/tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)
# Input length the saved model expects; must match the len_sequence value
# returned by prepare_sequences when the model was trained.
SEQ_LENGTH = 149
generated = generate_seq(new_model, tokenizer, SEQ_LENGTH, "A dog", 10)
generated2 = generate_seq_diverse(new_model, tokenizer, SEQ_LENGTH, "dog", 10)
print(generated)
print(generated2)