# Python for Text Analysis
This code was developed with help from the following online resources:
1. [Word2Vec, by Sankalp Kolhe](https://github.com/sankyfox/word2vec)
2. [Place context analysis using Natural Language Processing, by Bo Zhao](https://github.com/jebowe3/geog595/blob/master/06_ai/pe.md)
3. [Google News and Leo Tolstoy: Visualizing Word2Vec Word Embeddings using t-SNE, by Sergey Smetanin](https://towardsdatascience.com/google-news-and-leo-tolstoy-visualizing-word2vec-word-embeddings-with-t-sne-11558d8bd4d)

First, we need to install the necessary Python libraries. This should take about 6 minutes...

In [None]:
# Install needed python libraries
import sys
!conda update --all --yes
!conda install --yes --prefix {sys.prefix} gensim
!{sys.executable} -m pip install spacy
!{sys.executable} -m spacy download xx_ent_wiki_sm
!conda install --yes --prefix {sys.prefix} nltk
!conda install --yes --prefix {sys.prefix} -c conda-forge wordcloud
!{sys.executable} -m pip install geonamescache
!{sys.executable} -m pip install geocoder
!conda install --yes --prefix {sys.prefix} -c conda-forge stop-words
!conda install --yes --prefix {sys.prefix} -c conda-forge networkx
!conda install --yes --prefix {sys.prefix} python-levenshtein
!conda install --yes --prefix {sys.prefix} -c anaconda scikit-learn

### Save The Adventures of Sherlock Holmes as a txt File
Next, we will import a few Python modules to create a subdirectory called "txt" in our project folder where we will save The Adventures of Sherlock Holmes, downloaded from [Gutenberg](https://www.gutenberg.org/files/1661/1661-0.txt). We will call this file "advsherlock.txt" and use this for later analysis.

In [None]:
# Import modules
import os
import requests
import nltk
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import glob
import codecs

# Download the punkt sentence-tokenizer data from nltk
nltk.download('punkt')

# Make a subdirectory called "txt" in your project folder
# (exist_ok lets this cell be re-run without raising FileExistsError)
os.makedirs('txt', exist_ok=True)

# First, download The Adventures of Sherlock Holmes here: https://www.gutenberg.org/files/1661/1661-0.txt
url = 'https://www.gutenberg.org/files/1661/1661-0.txt'
r = requests.get(url, allow_redirects=True)
# Stop on an HTTP error rather than saving an error page as the corpus
r.raise_for_status()

# Then, save in the subdirectory called "txt" in your project folder
# (the with-block closes the file, so the bytes are flushed before we read it back)
with open('txt/advsherlock.txt', 'wb') as book_file:
    book_file.write(r.content)

# Finally, identify the text file
raw_data_files = sorted(glob.glob("txt/*.txt"))

# Read the txt file(s) and measure the corpus by characters
raw_corpus_1 = u""
for file_name in raw_data_files:
    print("Reading '{0}' ...".format(file_name))
    # codecs.open does no newline translation, so the "\r\n" line endings
    # that the later text-cleaning step replaces are kept as they are
    with codecs.open(file_name,"r","utf-8") as f:
        raw_corpus_1 += f.read()
    print("Corpus is now {0} characters long".format(len(raw_corpus_1)))
    # blank line between files (a bare `print` without parentheses does nothing in Python 3)
    print()

# Print the txt file contents
#print(raw_corpus_1)

### Remove Unnecessary Text from the txt File
Now, we will remove the extra text at the beginning and the end of the file, so that this is not used in the analysis.

In [None]:
# Then, remove unnecessary text at the beginning and end
start_marker = 'XII.   The Adventure of the Copper Beeches'
end_marker = '*** END OF THE PROJECT GUTENBERG EBOOK THE ADVENTURES OF SHERLOCK HOLMES ***'

# Keep what follows the first occurrence of the start marker
# (stopping at a second occurrence, if there is one) ...
raw_corpus_2 = raw_corpus_1.split(start_marker, 2)[1]
# ... and drop everything from the end marker onwards
raw_corpus = raw_corpus_2.split(end_marker, 1)[0]

# Print the results
#print(raw_corpus)

### Clean, Lemmatize, and Tokenize Text
With the following code, we will build a list of stop words that we will remove from analysis. Stop words are common words that are not particularly meaningful, such as "the" or "nevertheless". Then, we will tokenize the text by sentence. This breaks the text into its component sentences for more effective processing. We will also lemmatize the text, which means that different forms of a word (conjugations, declensions, etc.) will be reduced to a single standard form, so that words like "see" and "saw" are counted as the same word.

In [None]:
# Clean, Lemmatize, and Tokenize Text

# Import modules
import re
import multiprocessing
from nltk.corpus import stopwords
from stop_words import get_stop_words

# Download the nltk data used below: the stopword list first,
# then the packages for the lemmatizer
for resource in ('stopwords', 'omw-1.4', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource)

# Define the stop words that will be removed from the results:
# the English list from the stop_words package, followed by nltk's English list
stop_words = list(get_stop_words('en'))
nltk_words = list(stopwords.words('english'))
stop_words += nltk_words

# Extend the list with a few extra tangential words
stop_words += ['upon','well','may','shall','must','might','much','quite','however','away','yet','oh','ah','mr','miss']

# Load the English punkt tokenizer and split the text into sentences
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
raw_sentences = tokenizer.tokenize(raw_corpus)

# Define a function to lemmatize the text (consolidate similar words to a standard vocabulary)
def lemmatize_all(t):
    """Yield the tokens of ``t`` in order, lemmatized by part of speech.

    The text is word-tokenized and part-of-speech tagged. Tokens whose tag
    starts with NN, VB, JJ or RB are passed through the WordNet lemmatizer as
    noun, verb, adjective or adverb respectively; every other token is
    yielded unchanged.
    """
    # First two characters of the tag -> part-of-speech code for the lemmatizer
    pos_by_tag_prefix = {'NN': 'n', 'VB': 'v', 'JJ': 'a', 'RB': 'r'}
    lemmatizer = WordNetLemmatizer()
    for token, tag in pos_tag(word_tokenize(t)):
        wordnet_pos = pos_by_tag_prefix.get(tag[:2])
        if wordnet_pos is None:
            yield token
        else:
            yield lemmatizer.lemmatize(token, pos=wordnet_pos)

# Define a function to remove non-alphabetic characters and split a sentence into lowercase words
def sentence_to_wordlist(raw):
    """Return the words of ``raw`` as a list of lowercase strings.

    Every character other than the ASCII letters a-z / A-Z is treated as a
    separator. Stop words are NOT removed here; that is done separately.
    """
    letters_only = re.sub("[^a-zA-Z]"," ",raw)
    return letters_only.lower().split()

# Add these tokens to a "sentences" list: each non-empty sentence is
# lemmatized, then stripped of non-alphabetic characters and split into words
sentences = [
    sentence_to_wordlist(' '.join(lemmatize_all(raw_sentence)))
    for raw_sentence in raw_sentences
    if raw_sentence
]

# Remove the stopwords and add to a "new_sentences" list.
# stop_words is a plain list, so testing every token against it is a linear
# scan; a set built once gives the same answers with constant-time lookups.
stop_word_lookup = set(stop_words)
new_sentences = [
    [item for item in sublist if item not in stop_word_lookup]
    for sublist in sentences
]

# Count the tokens that remain after stop-word removal
token_count = sum(len(s) for s in new_sentences)
print("The corpus has {0:,} tokens".format(token_count))

### Build, Train, and Save a Word2Vec Model
Here, we will make a Word2Vec model, which will produce word embeddings for our text. This will allow us to determine how similar certain words are to one another within our text, giving us a sense of the context within which important words exist within The Adventures of Sherlock Holmes.

In [None]:
# Build, Train, and Save a Model

import gensim.models.word2vec as w2v

# Set parameters for the Word2Vec model
num_features = 300 # The number of dimensions (N) of the N-dimensional space that gensim Word2Vec maps the words onto.
min_word_count = 5 # Ignores all words with total frequency lower than this.
num_workers = 1 # Use these many worker threads to train the model.
#num_workers = multiprocessing.cpu_count() # Use these many worker threads to train the model.
context_size = 5 # Maximum distance between the current and predicted word within a sentence - 5 words on each side.
downsampling = 1e-3 # The threshold for configuring which higher-frequency words are randomly downsampled, useful range is (0, 1e-5).
seed = 2 # Seed for the random number generator.
epochs = 15 # Number of iterations (epochs) over the corpus.

# Define the model with the parameters (sg=1 selects the skip-gram training algorithm)
model = w2v.Word2Vec(sg=1,seed=seed,workers=num_workers,vector_size=num_features,min_count=min_word_count,window=context_size,sample=downsampling,epochs=epochs)

# Build the vocab
model.build_vocab(new_sentences)

print("Word2Vec vocabulary length: ",len(model.wv.vectors))

# Train the model on the sentences list.
# total_examples must be the number of SENTENCES, not tokens: build_vocab
# recorded that count in model.corpus_count. Passing the (much larger) token
# count skews gensim's progress accounting and learning-rate schedule.
model.train(new_sentences, total_examples = model.corpus_count, epochs = model.epochs)

# save the trained file so we can load it anytime
os.makedirs('model', exist_ok=True)
model.save(os.path.join("model","model.w2v"))

# reload from disk to confirm that the saved file is usable
model = w2v.Word2Vec.load(os.path.join("model","model.w2v"))


### Model Testing with Input Vocabulary
In this step, we will test the model we just produced by feeding it a few sample words. You can play around with this and try different words based on random curiosity or something you know about the stories. 

In [None]:
# Test the model
word_vectors = model.wv

# Print the 50 words most similar to "watson", then to "sherlock"
for query in ('watson', 'sherlock'):
    print(query.capitalize(), word_vectors.most_similar(query, topn=50))

# Print the distance from 'sherlock' to each of 'watson', 'mystery', 'irene', and 'solve'
comparison_words = ('watson', 'mystery', 'irene', 'solve')
print("Sherlock", word_vectors.distances('sherlock', comparison_words))

### Remove Uninteresting Words
We want to retrieve the 50 most common words in the text. However, these include quite a few words we may not care to map in our network analysis. I have included the words that I want to exclude from analysis in the lists below, but you may want to keep some of these or exclude other words. This is one part of the process where your subjectivity comes into play. After excluding these words, we will print the new top 50.

In [None]:
# The model's vocabulary, in the order gensim stores it
model_vocab = model.wv.index_to_key

# Words to leave out of the analysis, grouped by kind
boring_words = ['one','two','back','yes','never','nothing','right','last','st','every','still','side']
boring_verbs = ['say','come','go','take','make','give','leave','get','put']
boring_adj = ['little','long','first','round','small']
boring_pronouns = ['us']

# Merge every group into the single exclusion list
boring_words += boring_verbs + boring_adj + boring_pronouns

# Keep the vocabulary words, in their original order, that are not excluded
selected_words = [word for word in model_vocab if word not in boring_words]

print(selected_words[:50])

### Make a Frequency Distribution of the Top 20 Desired Words
Now that we have a list of the most common words we want, let's do a quick analysis of each word's frequency of appearance in the text. After running the following code, you should see a graph of the frequency of the top 20 words among our list of 50.

In [None]:
from nltk import FreqDist

# Flatten the per-sentence word lists into one list of words
flat_sentences = [item for sublist in new_sentences for item in sublist]

# Keep only the occurrences of the first 50 selected words.
# The slice is taken once and held as a set, instead of being rebuilt and
# scanned for every single word in the corpus.
top_selected = set(selected_words[:50])
top_words = [word for word in flat_sentences if word in top_selected]

# Count how often each of those words occurs
fDist = FreqDist(top_words)
# Produce a frequency plot of the 20 most common
fDist.plot(20)
print(fDist.most_common(50))

### Make a Word Cloud of the Top 50 Desired Words
Different situations call for different visualizations. A word cloud is a nice, quickly digested, visualization of the relative importance of different words in a text. Larger words appear more frequently than smaller words. Here, we will produce a word cloud of the top 50 desired words, which we can also see with their counts in the list above. Can you get a better sense of the stories from this visualization?

In [None]:
# Create a word cloud of the most common (interesting) words
from wordcloud import WordCloud

print("generating wordcloud...")
# Define word cloud parameters
# NOTE(review): the wordcloud docs describe `stopwords` and `collocations` as
# ignored by generate_from_frequencies - TODO confirm; they are kept as written.
wc = WordCloud(background_color="white", max_words=50, prefer_horizontal=1, mask=None, scale=3, stopwords=stop_words, collocations=False)
# Generate the word cloud from the frequency distribution
wc.generate_from_frequencies(fDist)
# Make a subdirectory called "wordcloud" in your project folder
# (exist_ok lets this cell be re-run without raising FileExistsError)
os.makedirs('wordcloud', exist_ok=True)
# Save the word cloud to a file in the new subdirectory
wc.to_file("wordcloud/advsherlock-cloud.jpg")
print("completed!")

### Isolate, Clean, and Sort Geographical Locations
We can use a natural language processing model called ["xx_ent_wiki_sm"](https://spacy.io/models/xx) from spaCy to identify all locations in our text. However, the process is imperfect and we need to clean out errors and fictional locations that we cannot map in QGIS. Through the steps below, we will load the model, use it to identify the locations in the text, clean out any non-locational words, check the results against a dictionary of known places, and extend this dictionary with other places we know to be real (ex: "Colony of Victoria" is now a state in the independent nation of Australia). Then, we will sift our locations through these lists and create a list of fictional locations from any place name that falls through. This "hother" list is what we will use in the next step to finalize our clean list of mappable locations from The Adventures of Sherlock Holmes.

In [None]:
# Clean text and analyze to isolate locations
print("importing modules...")
import xx_ent_wiki_sm
print("importing complete")

print("loading language processing model...")
nlp_wk = xx_ent_wiki_sm.load()
print("loading language processing model complete")

# Replace every "\r\n" line break with a single space.
# Further replacements of '\r\n\r\n' and '\r\n    ' used to be chained after
# this one, but they could never match: no "\r\n" is left once this call has
# run. They are dropped; the resulting text is exactly the same as before
# (a blank line still becomes two spaces, and the curated place-name strings
# in this notebook, e.g. 'Thumb  How', appear to rely on that spacing).
new_text = raw_corpus.replace('\r\n',' ')

print("running language processing model on text...")
my_doc = nlp_wk(new_text)
print("running language processing model on text complete")

# these are words that appear in the results that are not places;
# an entity is discarded only when its text matches one of these strings exactly,
# so the typographic quotes, dashes, and doubled spaces below are intentional
geoStops = ['No', 't’', 'Study', 'Scarlet', 'Germanspeaking', 'Boots', 'House of Ormstein', 'Grand Duke of Cassel', 'stage—', 'Inner Temple', 'Temple', 'Where', 'née Adler', 'City', 'Well,’', 'Londoners', 'Architecture', 'B', 'Dissolved', 'Suburban Bank', 'Vegetarian Restaurant', 'Park', '‘Encyclopaedia,’', 'father—', '’77', 'Hall', 'Assizes', '’82', '’85', 'Colonel Openshaw', 'A', 'C’—', '’83', 'D.D.', 'd’you', 'Lascar', 'Dane', 'Museum—', 'Museum', 'birds—', '‘Which', 'Covent', 'Regency', 'Bengal Artillery', 'Exactly', 'Engineer', '16A', '” That', 'soul.’ He', 'Thumb  How', 'Esq', 'Cal', 'Serpen', 'England—', 'No,’', 'Twice', 'Jephro,’', 'Hers', 'Copper Beeches  Fowler and Miss Rucastle', 'Englishman', 'Upper Swandam Lane', ' ']

# the wikipedia natural language processing model can isolate location entities
print("reading each entity in text for geographical entities...")
# collect the text of every entity whose label marks it as a location ("LOC"),
# leaving out anything listed in geoStops
geoTxt = [
    ent.text
    for ent in my_doc.ents
    if ent.label_ == "LOC" and ent.text not in geoStops
]
print("geographical entities isolated")
    
# importing geonamescache provides us with a dictionary of global locations
print("sorting geographical entities")
import geonamescache

gc = geonamescache.GeonamesCache()

# get the nested dictionaries for countries, us states, and cities (in that order)
countriesdict, statesdict, citiesdict = (
    gc.get_countries(),
    gc.get_us_states(),
    gc.get_cities(),
)

def gen_dict_extract(var, key):
    """Yield every value stored under ``key`` anywhere inside ``var``.

    ``var`` may be a dict, a list, or any nesting of the two. The structure is
    walked depth-first in iteration order. A matching value is yielded first
    and, if it is itself a dict or list, searched as well. Anything that is
    neither a dict nor a list yields nothing.
    """
    if isinstance(var, list):
        for element in var:
            yield from gen_dict_extract(element, key)
        return
    if not isinstance(var, dict):
        return
    for name, value in var.items():
        if name == key:
            yield value
        if isinstance(value, (dict, list)):
            yield from gen_dict_extract(value, key)

# we will define lists of every 'name' value found in the geonamescache
# dictionaries for countries, states, and cities
countries = list(gen_dict_extract(countriesdict, 'name'))
states = list(gen_dict_extract(statesdict, 'name'))
cities = list(gen_dict_extract(citiesdict, 'name'))

# we will also create new lists for locations from the text that do not appear in the geonamescache lists - many of these are historical or colloquial
# NOTE: entities are compared with these strings for exact equality, so the
# typographic apostrophes (’) and stray fragments (e.g. 'Kensington I') must stay as they are
# countries (historical or colloquial names)
hcountries = ['Bohemia', 'England', 'Scotland', 'Great Britain', 'States', 'U.S.A.', 'kingdom of Bohemia']
# states, counties, and regions
hstates = ['Herefordshire', 'Colony of Victoria', 'Sussex', 'Southern states', 'Carolinas', 'Isle of Wight', 'Middlesex', 'Berkshire', 'Hampshire', 'Oxfordshire', 'Nova Scotia']
# cities, towns, and districts
hcities = ['City of London', 'St. John’s Wood', 'Aldersgate', 'Eton', 'Marseilles', 'Pondicherry', 'Bloomsbury', 'Calcutta', 'Greenwich', 'Carlsbad', '’Frisco', 'Carlsbad. ‘Remarkable']
# streets
hstreets = ['Pall Mall']
# other places to keep
hplaces = ['La Scala', 'Scotland Yard', 'America', 'Europe', 'Atlantic', 'Thames', 'Amoy River', 'St. Paul', 'St. Paul’s.’', 'St. James’s Hall I', 'St. James’s Hall', 'St. Saviour’s', 'King’s Cross', 'St. Pancras Hotel', 'Stroud Valley', 'Covent Garden', 'Covent Garden Market', 'Paddington', 'Pacific', 'Hatherley Farm', 'Balmoral', 'St. George’s', 'Westbury House', 'Serpentine', 'Holborn', 'Pentonville', 'Birchmoor', 'Petersfield', 'Charing Cross Station', 'Kensington I', 'London street', 'Saviour’s', 'Rockies', 'Montague Place', 'Black Swan Hotel', 'Charing Cross']
# places to remove; entities that match none of the lists above are appended to this list later
hother = ['Coburg Square', 'Saxe-Coburg Square', 'Fresno Street', 'Swandam Lane', 'Fordham', 'Eyford', 'Eyford Station', 'Coburg'] # Fictional locations

# Build lookup sets once: the geonamescache name lists are long and would
# otherwise be scanned from start to end for every entity
known_countries = set(countries)
known_states = set(states)
known_cities = set(cities)
# words that mark an entity as a type of street, station, or port
street_words = ('Street', 'Square', 'Road', 'Gate', 'Bridge', 'Lane', 'Wharf', 'Avenue', 'Station')

# for all the listed locations...
for txt in geoTxt:
    # if the location is in any of the geonamescache lists,
    # or is a type of street or station or port,
    # add it to the new lists above
    # (the checks are independent: one entity may land in several lists)
    if txt in known_countries:
        hcountries.append(txt)
    if txt in known_states:
        hstates.append(txt)
    if txt in known_cities:
        hcities.append(txt)
    if any(word in txt for word in street_words):
        hstreets.append(txt)
    # if it ended up in none of the lists, add it to the "hother" list of fictional locations
    if not (txt in hcountries or txt in hstates or txt in hcities or txt in hstreets or txt in hplaces):
        hother.append(txt)

print('done sorting geographical entities')

print('items to be removed: ', hother)

### Remove the Fictional Locations
Using the "hother" list, which we can see printed in our results above, we will clean our list of locations by removing the fictional places and non-locations. We will add the locations that do not match the words in this list to a new array called "locations". After running this code, you will see the new list of locations printed in the results.

In [None]:
# make a new list without fictional locations
print('removing fictional locations and making a new list of locations')

# keep, in order, every item of the "geoTxt" list that is not in the
# "hother" items-to-be-removed list
locations = [item for item in geoTxt if item not in hother]

print('new list complete')
print(locations)

### Create a Listed Frequency Distribution of all Locations
Before, we generated a graphed frequency distribution of common words. We could do this again for locations in the text, but let's just make a quick list that will order and count all the locations in our text.

In [None]:
# Define and print a frequency distribution of geographical entities in the text
print('creating a frequency distribution of mentioned locations')
frqDist = FreqDist(locations)
# up to 160 (location, count) pairs, most frequent first
most_mentioned = frqDist.most_common(160)
print(most_mentioned)
print('all done')

### Create a Graphed Frequency Distribution of the Top 20 Locations
Just to demonstrate, let's visualize only the 20 most common mentioned locations in The Adventures of Sherlock Holmes in a frequency plot.

In [None]:
# Copy the place frequencies into their own distribution
# (this replaces a loop over a one-element list, which ran exactly once)
lDist = FreqDist(frqDist)
# Produce a frequency plot of the 20 most common locations
lDist.plot(20)

### Create a Word Cloud of the Top 50 Locations
Just like we did with the most common words, we will make a word cloud of the 50 most-mentioned places in The Adventures of Sherlock Holmes. If you check the results, you will notice that many of the locations are in London, demonstrating how Holmes's sense of place in the world is largely defined by the city in which he lives. However, his geographical knowledge extends to some other surprising places, such as The United States, Australia, and India. Why might this be?

In [None]:
# Create a word cloud of the most common locations
from wordcloud import WordCloud

print("generating wordcloud...")
# Define word cloud parameters
# NOTE(review): the wordcloud docs describe `stopwords` and `collocations` as
# ignored by generate_from_frequencies - TODO confirm; they are kept as written.
wc = WordCloud(background_color="white", max_words=50, prefer_horizontal=1, mask=None, scale=3, stopwords=stop_words, collocations=False)
# Generate the word cloud from the frequency distribution
wc.generate_from_frequencies(frqDist)
# Make sure the "wordcloud" subdirectory exists, so this cell also works
# when no directory has been created yet
os.makedirs('wordcloud', exist_ok=True)
# Save the word cloud to a file in the "wordcloud" subdirectory
wc.to_file("wordcloud/advsherlock-loc-cloud.jpg")
print("completed!")

### Generate a csv File for the Locations
The first step in making sure our locations can be mapped in GIS is to get them into a csv file with each place in its own separate row. We will also add the frequency count for each location so we can weight the results proportionally in QGIS.

In [None]:
# Create a csv file of the locations
print('creating a csv file of the locations...')
os.makedirs('csv', exist_ok=True)
# write each location to a row in the spreadsheet and add its frequency count as a separate column
with open("csv/adv-sherlock-places.csv", "w", encoding="utf8") as fp:
    for item in frqDist.most_common(160):
        try:
            row = "%s, %d\n" % (item[0], item[1])
        except TypeError as error:
            # best effort: skip an entry that cannot be formatted, but say so
            # instead of dropping it silently
            print("skipping", item, "-", error)
            continue
        fp.write(row)
        print(item)
print("finished!")

### Geocode the Locations
In this step, we will use a Python module called "geocoder" to obtain and add latitude and longitude data in new columns of our csv. At the end of this step, we will have a csv file with name, frequency, lat, and lng column headers with coordinate data for our locations.

In [None]:
# create an initial geocoded csv of locations
print('creating an initial geocoded csv of locations...')
import geocoder

with open("csv/adv-sherlock-places-geocoded.csv", "w", encoding="utf8") as geofp:
    geofp.write("name, frequency, lat, lng\n")
    with open("csv/adv-sherlock-places.csv", "r", encoding="utf8") as fp:
        for line in fp:
            # split on the LAST comma so that a place name which itself
            # contains a comma stays in one piece
            location, freq_text = line.rsplit(",", 1)
            freq = int(freq_text)
            try:
                g = geocoder.arcgis(location)
                lat = g.current_result.lat
                lng = g.current_result.lng
                row = "%s, %d, %f, %f\n" % (location, freq, lat, lng)
            except Exception as error:
                # best effort: leave out a location that cannot be geocoded,
                # but report it. Catching Exception rather than using a bare
                # except keeps Ctrl-C / kernel interrupt working.
                print("could not geocode", repr(location), "-", error)
                continue
            geofp.write(row)
            print(location, freq, lat, lng)
print("finished!")

### Correct and Fine Tune Locational Data
An astute observer may notice that some of these coordinates are wrong. Moreover, some of the place names could use a little cleaning. For example, note that Colony of Victoria is listed with a latitude in the Northern Hemisphere. That cannot be Australia! Also, notice the name 'Frisco. This is a colloquialism for San Francisco that the geocoder cannot understand. Therefore, it returns the wrong coordinates. In other cases, such as street or station names, we need to add more specificity. There must be many Metropolitan Stations in the world. We want the one in London, England. If you are doing this step on your own, this is likely to take some time sifting through the results, cross-checking with the text, and looking at the locations in QGIS on a map for final verification. The result of this process is the following corrective code. After running this, you will have a csv that you can use in QGIS to map frequency-weighted locations from the text.

In [None]:
# fine tune location data to return corrected coordinates
print('revising locations to return corrected coordinates...')

# Place name exactly as it was extracted from the text -> the search string
# that should be geocoded instead. Keys are matched by exact equality, so the
# typographic quotes and doubled spaces in some of them are intentional.
# No replacement is itself a key, so one lookup per location is enough.
location_revisions = {
    'Baker Street?” I': 'Baker Street',
    'Lebanon': 'Lebanon Pennsylvania',
    'Bohemia': 'Czech Republic',
    'kingdom of Bohemia': 'Czech Republic',
    'Scotland Yard': 'Great Scotland Yard',
    'Reading': 'Reading England',
    'Serpentine': 'The Serpentine London',
    'Leadenhall Street': 'Leadenhall Street London',
    'Surrey': 'Surrey England',
    'Streatham': 'Streatham London',
    'Kent': 'Kent England',
    'Brixton Road': 'Brixton Road London',
    'Regent Street': 'Regent Street London England',
    'Kensington I': 'Kensington London England',
    'Fleet Street': 'Fleet Street London',
    'Oxford Street': 'Oxford Street London',
    'Paddington Station': 'Paddington Station London',
    'Charing Cross': 'Charing Cross Station London',
    'States': 'United States',
    'Threadneedle Street': 'Threadneedle Street London',
    'Bow Street': 'Bow Street London England',
    'Paddington': 'Paddington Station London',
    'Victoria Street': 'Queen Victoria Street London England',
    'Montague Place': 'Montague Place London England',
    'St. George’s': 'St. George’s Church London England',
    'Hanover Square': 'Hanover Square London England',
    'Lancaster Gate': 'Lancaster Gate London England',
    'West End': 'West End London England',
    'Carlsbad': 'Karlovy Vary',
    'Carlsbad. ‘Remarkable': 'Karlovy Vary',
    'La Scala': 'La Scala Milan',
    'Serpentine Avenue': 'Serpentine Avenue London',
    'St. Paul': 'St. Paul’s Cathedral London',
    'St. James’s Hall': 'St. James’s Hall London England',
    'St. James’s Hall I': 'St. James’s Hall London England',
    'Aldersgate': 'Aldersgate London England',
    'Cornwall': 'Cornwall England',
    'Farrington Street': 'Farringdon Street London England',
    'St. Saviour’s': 'St. Saviour’s Church Chalk Farm Station London',
    'Saviour’s': 'St. Saviour’s Church Chalk Farm Station London',
    'Severn': 'River Severn',
    'Victoria': 'Victoria Australia',
    'Colony of Victoria': 'Victoria Australia',
    'Hatherley Farm': 'Hatherley Farm Gloucester England',
    'Camberwell': 'Camberwell London',
    'Sussex': 'Sussex England',
    'East London': 'East London England',
    'Carolinas': 'Hamer South Carolina',
    'America': 'United States',
    'U.S.A.': 'United States',
    'Union': 'United States',
    'London Bridge': 'London Bridge England',
    'Paul’s Wharf': 'Paul’s Wharf London',
    'Cannon Street': 'Cannon Street London',
    'Middlesex': 'Middlesex England',
    'London Road': 'London Road London',
    'Wellington Street': 'Wellington Street London England',
    'Thames': 'River Thames England',
    'Pall Mall': 'Pall Mall London England',
    'Amoy River': 'Amoy China',
    'Wimpole Street': 'Wimpole Street London England',
    'Harley Street': 'Harley Street London England',
    'Wigmore Street': 'Wigmore Street London England',
    'Bloomsbury': 'Bloomsbury London England',
    'Covent Garden': 'Covent Garden London England',
    'Covent Garden Market': 'Covent Garden Market London England',
    'Greenwich': 'Greenwich London England',
    'Grosvenor Square': 'Grosvenor Square London England',
    'Pacific': 'Pacific Slope',
    'Hyde Park': 'Hyde Park London England',
    'Trafalgar  Square': 'Trafalgar Square London England',
    'Trafalgar Square': 'Trafalgar Square London England',
    'Gordon Square': 'Gordon Square London England',
    'Northumberland Avenue': 'Northumberland Avenue London England',
    'Metropolitan Station': 'Metropolitan Station Buildings London England',
    'Southampton Road': 'Southampton Road London England',
    'St. Paul’s.’': 'St. Paul’s Cathedral London',
    'Waterloo Bridge': 'Waterloo Bridge London England',
    'Waterloo Station': 'Waterloo Station London England',
    'High Street': 'High Street Winchester England',
    'Black Swan Hotel': 'High Street Winchester England',
    'London street': 'London',
    'Rockies': 'Rocky Mountains',
    '’Frisco': 'San Francisco',
}

# Build the revised list: swap in the corrected search string where one is
# defined and keep every other location unchanged ("locations" is not modified)
revised_locations = [location_revisions.get(place, place) for place in locations]

# Define a frequency distribution of geographical entities in the text
frqDist = FreqDist(revised_locations)

# Create a csv file of the locations (one "name, count" row per location)
# Make sure the "csv" subdirectory exists, so this cell also works on its own
os.makedirs('csv', exist_ok=True)
with open("csv/adv-sherlock-places.csv", "w", encoding="utf8") as fp:
    for item in frqDist.most_common(160):
        try:
            row = "%s, %d\n" % (item[0], item[1])
        except TypeError as error:
            # best effort: skip an entry that cannot be formatted, but say so
            # instead of dropping it silently
            print("skipping", item, "-", error)
            continue
        fp.write(row)

import geocoder

# Geocode each place from the frequency table and save the coordinates
with open("csv/adv-sherlock-places-geocoded.csv", "w", encoding="utf8") as geofp:
    geofp.write("name, frequency, lat, lng\n")
    with open("csv/adv-sherlock-places.csv", "r", encoding="utf8") as fp:
        for line in fp:
            # Rows look like "name, count". Split on the LAST comma so that a
            # place name which itself contains a comma stays in one piece.
            try:
                location, count = line.rsplit(",", 1)
                freq = int(count)
            except ValueError:
                # Blank lines are ignored quietly; anything else is reported
                if line.strip():
                    print("skipping malformed row: %r" % line)
                continue

            try:
                g = geocoder.arcgis(location)
            except Exception as error:
                # Best effort: one failed lookup must not stop the whole run.
                # Unlike a bare "except:", this still lets Ctrl-C interrupt.
                print("could not geocode %s: %s" % (location, error))
                continue

            # A lookup without any match has no usable result to read from
            result = g.current_result
            lat = getattr(result, "lat", None)
            lng = getattr(result, "lng", None)
            if lat is None or lng is None:
                print("no coordinates found for %s" % location)
                continue

            geofp.write("%s, %d, %f, %f\n" % (location, freq, lat, lng))
            print(location, freq, lat, lng)
print("finished!")

### Create a Gephi File for Network Analysis
This code will create a gexf file, which we can open in Gephi to map networks of word associations in The Adventures of Sherlock Holmes. As a preview, the following code takes the top 50 words of interest from our frequency distribution and, for each, prints the 20 most similar words in the text, along with their degree of similarity.

In [None]:
import networkx as nx

# Build a directed graph that links each of the 50 most frequent words to its
# 20 most similar words in the model; each edge weight is the similarity score
g = nx.DiGraph()
items = fDist.most_common(50)
for item in items:
    word = item[0]
    # add_node() adds one node. add_nodes_from() expects a collection and
    # would walk through the string, adding every character as its own node.
    g.add_node(word)
    try:
        mswords = model.wv.most_similar(word, topn=20)
    except KeyError as error:
        # The word is not in the model vocabulary; report it and move on
        print(error)
        continue
    for neighbor, similarity in mswords:
        g.add_node(neighbor)
        g.add_edge(word, neighbor, weight=similarity)
        print("%s --> %s: %8.5f" % (word, neighbor, similarity))

# Make a subdirectory called "gephi" in your project folder
# (exist_ok lets this cell be re-run without a FileExistsError)
os.makedirs('gephi', exist_ok=True)

# Write and save the gexf file
nx.write_gexf(g, "gephi/adv-sherlock.gexf", encoding='utf-8', prettyprint=True, version='1.1draft')
print("finished!")

### Other Things...
The following code blocks are just a few experiments and are not fully fleshed out.

In [None]:
from sklearn.manifold import TSNE
import numpy as np

# Make a key list from the most frequent 20 words in the model vocabulary
#keys = model.wv.index_to_key[:20]

# Or pick your own words to test
keys = ['sherlock','watson','mystery','train','woman','nature','good','evil']

# For every key, collect its 30 most similar words and their vectors
embedding_clusters = []
word_clusters = []
found_keys = []

for word in keys:
    try:
        neighbours = model.wv.most_similar(word, topn=30)
    except KeyError as error:
        # The word is not in the model vocabulary; report it and skip it
        print(error)
        continue
    found_keys.append(word)
    word_clusters.append([similar_word for similar_word, _ in neighbours])
    embedding_clusters.append([model.wv[similar_word] for similar_word, _ in neighbours])

# Keep only the keys that produced a cluster, so that keys, word_clusters and
# the projected embeddings all line up one-to-one
keys = found_keys
if not keys:
    raise ValueError("none of the chosen words are in the model vocabulary")

embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape

# Newer scikit-learn releases name the iteration limit max_iter; older ones
# only accept n_iter. Try the new name first and fall back to the old one.
try:
    tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', max_iter=3500, random_state=32)
except TypeError:
    tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32)

# Project all vectors together, then restore the (cluster, word, 2) layout
embeddings_en_2d = np.array(tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))).reshape(n, m, 2)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline


def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    """Scatter-plot 2-D embeddings, one colour per cluster, labelling every point.

    Args:
        title: Text shown as the figure title.
        labels: One legend label per cluster.
        embedding_clusters: One 2-D array per cluster; column 0 is used as x
            and column 1 as y.
        word_clusters: For each cluster, the words used to annotate its
            points, in the same order as the rows of the matching array.
        a: Opacity applied to the scatter markers.
        filename: If given, the figure is also saved there as a 150-dpi PNG.
    """
    plt.figure(figsize=(16, 9))
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        # Pass the single RGBA value through "color", not "c": "c" may be
        # read as per-point values to colour-map when the lengths coincide
        plt.scatter(x, y, color=color, alpha=a, label=label)
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.5, xy=(x[i], y[i]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=8)
    plt.legend(loc=4)
    plt.title(title)
    plt.grid(True)
    if filename:
        plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()


# Draw the projected clusters and keep a copy of the figure as a PNG
tsne_plot_similar_words(
    title='Similar Words From The Adventures of Sherlock Holmes',
    labels=keys,
    embedding_clusters=embeddings_en_2d,
    word_clusters=word_clusters,
    a=0.7,
    filename='similar_words.png',
)