In [1]:
%load_ext autoreload
%autoreload 2

In [109]:
import utils
from scipy import spatial
import statistics
import char_max_likelihood as cml
utils.setup_nltk()

[nltk_data] Downloading package stopwords to /home/user1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/user1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Generate some speeches

In [118]:
# Train a character-level language model on one president's speeches and
# write ten generated samples to ../data/<pres>_generated/<i>.txt.
order = 15      # model order, passed to both train_char_lm and generate_text
letters = 2000  # passed to generate_text; presumably the number of characters to emit
pres = "bush"
lm = cml.train_char_lm(pres, order)

# Derive the output folder from `pres` so that switching to another speaker
# cannot silently overwrite the Bush samples with someone else's text.
out_dir = "../data/" + pres + "_generated/"

for i in range(10):
    s = cml.generate_text(lm, order, letters)
    with open(out_dir + str(i) + ".txt", "w") as text_file:
        # Drop the last word since it is often nonsensical and thus not in the
        # vocabulary that is used for the comparison later on.
        text_file.write(s.rsplit(' ', 1)[0])

# Metrics based on tf-idf

- 10 real docs R, 10 generated docs G
- get tfidf for R and G (10 dicts each)
- calculate similarity (rank) for each g to all r in R -> 10 similarity values
- do that for all g in G -> 10 sets of 10 similarity values
- take mean over the set of 10 values for a g -> how good is this speech
- take mean over all 10 sets for G -> how good is the algorithm

- mean reciprocal rank (only takes rank)
- or build an np vector from dict.values (both vectors must use the same word order) -> cosine similarity

## original speeches

In [29]:
# Load the original speeches (presumably from the "bush" data folder) as one
# token sequence per document, then compute one tf-idf mapping per document.
# tfidf_docs[k] is used below as a dict of word -> tf-idf score.
tokens_docs = utils.read_all_text_files("bush")
tfidf_docs = utils.tfidf_per_doc(tokens_docs)

In [4]:
# Sanity check: show 20 top terms of the first original speech
# (presumably ranked by tf-idf score -- confirm in utils.top_n_per_document).
print(utils.top_n_per_document(tfidf_docs[0], n=20))

['empower', 'agreement', 'trust', 'Iraq', 'Al', 'surge', 'extremists', 'militia', 'earmarks', 'Iraqi', 'agreements', 'Qaeda', 'deny', 'Let', 'seven', 'year', 'past', 'products', 'ask', 'Lebanon']


In [92]:
# Uncomment to inspect the raw word -> tf-idf mapping of the first original
# speech (produces a very long output).
#tfidf_docs[0]

{'returns': 0.0,
 'traveling': 0.0,
 'tiring': 0.0,
 'Administrator': 0.0,
 'Code': 0.0005743159815221268,
 'Dignity': 0.0,
 'chose': 0.0,
 'earn': 0.000322013439989896,
 'plutonium': 0.0,
 'liberator': 0.0,
 'Weldon': 0.0,
 'self-defense': 0.0,
 'inspiring': 0.0,
 'rides': 0.0,
 'bioterrorism': 0.0,
 'fend': 0.0,
 'profession': 0.0,
 'loyalty': 0.0,
 'antibiotics': 0.0,
 'rule': 0.0005952342247991577,
 'or': 9.077928646177155e-05,
 'dominating': 0.0,
 'hemisphere': 0.00046565519134185625,
 'desert': 0.0,
 '25,000': 0.0,
 'combination': 0.0,
 'coal-fired': 0.0,
 'Lower': 0.0,
 '120': 0.0,
 'knew': 0.0,
 'Hormuz': 0.0,
 'surest': 0.0,
 'dime': 0.0,
 'advantage': 0.0,
 '100,000': 0.0004020927037876684,
 'understandable': 0.0,
 'sovereignty': 0.0,
 'doubters': 0.0,
 'sophisticated': 0.0,
 'farther': 0.0,
 'example': 0.0,
 'embryonic': 0.0004020927037876684,
 'donated': 0.0,
 'fully': 0.0004818173615838161,
 'communities': 0.00026033807871874283,
 'solid': 0.0,
 'gotten': 0.0,
 'permanent'

## generated speeches

In [119]:
# Collect every distinct token that occurs in any of the original speeches.
voc = {token for doc_tokens in tokens_docs for token in doc_tokens}

In [120]:
# Calculate tf-idf for the generated speeches, using the same two helpers as
# for the original speeches (presumably reading the "bush_generated" folder).
gen_tokens_docs = utils.read_all_text_files("bush_generated")
gen_tfidf_docs = utils.tfidf_per_doc(gen_tokens_docs)

In [121]:
# Re-express every generated speech over the vocabulary of the original
# speeches so both kinds of document can be compared directly:
#   * vocabulary words missing from a generated speech get a score of 0.0
#   * words that only occur in the generated text (nonsensical ones; only one
#     such case has been seen so far) are dropped
gen_tfidf_docs_full = []
for doc_idx in range(len(gen_tfidf_docs)):
    sparse_scores = gen_tfidf_docs[doc_idx]
    gen_tfidf_docs_full.append({word: sparse_scores.get(word, 0.0) for word in voc})
#gen_tfidf_docs_full[0]

## Cosine Similarity

In [124]:
# Cosine similarity between 1 generated speech and 1 real speech.
#
# Both vectors are built from one shared, sorted vocabulary so that component k
# refers to the same word in each vector. Taking dict.values() of two dicts
# that were built by different code only lines up if their key order happens
# to coincide, which nothing here guarantees.
vocab_order = sorted(voc)
r = [tfidf_docs[0].get(word, 0.0) for word in vocab_order]
g = [gen_tfidf_docs_full[0][word] for word in vocab_order]

1 - spatial.distance.cosine(r, g)

0.10478403353526622

In [125]:
# Cosine similarity between 1 generated speech and all real speeches.
#
# Every vector is built from the same sorted vocabulary so that component k
# refers to the same word everywhere; relying on dict.values() order would
# compare scores of different words whenever two dicts differ in key order.
vocab_order = sorted(voc)
g = [gen_tfidf_docs_full[0][word] for word in vocab_order]

cosine_similarities = []
for real_doc in tfidf_docs:
    # .get covers the case where a per-document dict omits zero-score words.
    r = [real_doc.get(word, 0.0) for word in vocab_order]
    cosine_similarities.append(1 - spatial.distance.cosine(r, g))

print("mean cosine similarity:", sum(cosine_similarities) / len(cosine_similarities))
print("standard deviation of cosine similarity:", statistics.stdev(cosine_similarities))
print("\n", cosine_similarities)

mean cosine similarity: 0.0686094275516573
standard deviation of cosine similarity: 0.04207875394606675

 [0.10478403353526622, 0.07007947764852585, 0.028532979523806512, 0.021176243843499587, 0.03637756937403536, 0.020010358613685053, 0.12067391131146221, 0.17703538489215975, 0.03450037604500855, 0.177994470856636, 0.05655198493979341, 0.03869591633931735, 0.05352768822641274, 0.05159886657921009, 0.025565882784505556, 0.04802490216524036, 0.03179880310067329, 0.04445420875750328, 0.13281541833893817, 0.09471585019523288, 0.10794090572167292, 0.08141986990554162, 0.05288896223295392, 0.0742350708843823, 0.0672702799988909, 0.026565613572904212, 0.0587228924018377, 0.015984674063799997, 0.032171738276334794, 0.05797922806165612, 0.07356135862977753, 0.04324427633992023, 0.09379241045382636, 0.12141186094092027, 0.05430511984927344, 0.03228561507322736, 0.1211788697855728, 0.0598565919887436, 0.13203800926248577]
