In [1]:
from collections import Counter
import pandas as pd
from textblob import TextBlob
from nltk.corpus import stopwords
import nltk
import string
import re
import spacy
import numpy as np
STOP_WORDS = set(stopwords.words('english'))

# Data Preprocessing

In [2]:
with open('data/moby_dick.txt', 'r') as f:
    txt_moby_dick = f.read()

In [3]:
with open('data/jungle_book.txt', 'r') as f:
    txt_jungle_book = f.read()

In [4]:
len(txt_jungle_book)

293051

In [5]:
len(txt_moby_dick)

1238254

##### Tokenize 

In [6]:
def words(text):
    "List all the word tokens (consecutive letters) in a text. Normalize to lowercase."
    return re.findall('\w+', text) 

In [7]:
tokens_moby_dick = words(txt_moby_dick)
tokens_jungle_book = words(txt_jungle_book)

##### Word Counts

In [8]:
counter_moby_dick = Counter(tokens_moby_dick)
counter_jungle_book = Counter(tokens_jungle_book)
counter_moby_dick.most_common(5)

[('the', 13972), ('of', 6699), ('and', 6144), ('a', 4648), ('to', 4635)]

In [9]:
words_moby_dick = [word for word in tokens_moby_dick if word not in STOP_WORDS]
words_jungle_book = [word for word in tokens_jungle_book if word not in STOP_WORDS]


In [10]:
counter_moby_dick = Counter(words_moby_dick)
counter_jungle_book = Counter(words_jungle_book)


In [11]:
top_words_jungle_book = [x[0] for x in counter_jungle_book.most_common(20000)]

In [12]:
top_words_moby_dick = [x[0] for x in counter_moby_dick.most_common(20000)]

##### Filter top words by part of speech

In [13]:
nltk_moby_dick = nltk.Text(top_words_moby_dick)
tags_moby_dick = nltk.pos_tag(nltk_moby_dick)

In [14]:
nltk_jungle_book = nltk.Text(top_words_jungle_book)
tags_jungle_book = nltk.pos_tag(nltk_jungle_book)

# Modeling (GloVe)

##### Replace adjectives of moby dick with those from jungle book using vector similarity

In [15]:
common_adjectives_moby_dick = []
common_adjectives_jungle_book = []

common_nouns_moby_dick = []
common_nouns_jungle_book = []


for tag in tags_moby_dick:
    if tag[1] == 'JJ':
        common_adjectives_moby_dick.append(tag[0])
        
for tag in tags_jungle_book:
    if tag[1] == 'JJ':
        common_adjectives_jungle_book.append(tag[0])

for tag in tags_moby_dick:
    if tag[1] == 'NN':
        common_nouns_moby_dick.append(tag[0])
        
for tag in tags_jungle_book:
    if tag[1] == 'NN':
        common_nouns_jungle_book.append(tag[0])
        

In [16]:
def load_glove(filename):
    dictionary = {}

    raw = open(filename)
    lines = raw.readlines()

    # Iterate through lines getting the word and np vector
    for line in lines:
        line = line.split(" ")
        dWord = line[0]
        vector = line[1:]
        dictionary[dWord] = np.array(vector, dtype=float)

    return dictionary

In [17]:
# Computes the closest words in the dictionary to the passed word
def closest_word(gloves, word, corpus):
    distances = []
    arr1 = gloves[word]

    for word2 in corpus:
        if (word != word2):
            try:
                arr2 = gloves[word2]
            except:
                continue
            difference = arr2 - arr1
            distance = np.linalg.norm(difference)
            distanceWordTuple = (distance, word2)
            distances.append(distanceWordTuple)

    distances = sorted(distances)
    return distances[0][1]

In [18]:
gloves = load_glove('data/glove.6B.200d.txt')

##### Find the nearest words in each corpus

In [19]:
adj_replacements = []
for word in common_adjectives_jungle_book:
    if len(word) < 2:
        continue
    try:
        adj_replacements.append((word, closest_word(gloves, word, common_adjectives_moby_dick)))
    except:
        continue

In [20]:
noun_replacements = []
for word in common_nouns_jungle_book:
    if len(word) < 2:
        continue
    try:
        noun_replacements.append((word, closest_word(gloves, word, common_nouns_moby_dick)))
    except:
        continue

##### Replace similar words

##### Moby Dick

In [21]:
txt_sample = txt_moby_dick[:int(len(txt_moby_dick)/2)]

In [22]:
for rep in adj_replacements:
    txt_sample = re.sub('\\b' + rep[1]+ '\\b', rep[0], txt_sample)

In [23]:
for rep in noun_replacements:
    txt_sample = re.sub('\\b' + rep[1]+ '\\b', rep[0], txt_sample)

In [24]:
f = open('output/moby_dick_new.txt','w')
f.write(txt_sample)
f.close()

##### Jungle Book

In [25]:
txt_sample = txt_jungle_book[:int(len(txt_jungle_book)/2)]

In [26]:
for rep in adj_replacements:
    txt_sample = re.sub('\\b' + rep[0]+ '\\b', rep[1], txt_sample)

In [27]:
for rep in noun_replacements:
    txt_sample = re.sub('\\b' + rep[1]+ '\\b', rep[0], txt_sample)

In [28]:
f = open('output/jungle_book_new.txt','w')
f.write(txt_sample)
f.close()

In [41]:
noun_replacements

[('man', 'woman'),
 ('head', 'hand'),
 ('time', 'come'),
 ('jungle', 'swamp'),
 ('night', 'morning'),
 ('cub', 'grizzly'),
 ('day', 'week'),
 ('way', 'come'),
 ('kill', 'shoot'),
 ('look', 'come'),
 ('water', 'supply'),
 ('half', 'rest'),
 ('run', 'start'),
 ('place', 'time'),
 ('sea', 'coast'),
 ('horse', 'cat'),
 ('mother', 'daughter'),
 ('ground', 'way'),
 ('tail', 'tip'),
 ('fight', 'battle'),
 ('thing', 'something'),
 ('make', 'making'),
 ('wolf', 'hunter'),
 ('afraid', 'fearful'),
 ('village', 'town'),
 ('tiger', 'elephant'),
 ('nothing', 'anything'),
 ('side', 'way'),
 ('ran', 'saw'),
 ('neck', 'shoulder'),
 ('tree', 'pine'),
 ('foot', 'hand'),
 ('eat', 'meal'),
 ('anything', 'nothing'),
 ('life', 'time'),
 ('use', 'example'),
 ('hunt', 'hunter'),
 ('herd', 'bison'),
 ('grass', 'pasture'),
 ('tell', 'come'),
 ('art', 'museum'),
 ('dance', 'music'),
 ('gun', 'weapon'),
 ('set', 'break'),
 ('year', 'month'),
 ('camp', 'tent'),
 ('caught', 'thought'),
 ('thought', 'fact'),
 ('somet