# Summary and Topic Modelling 

In [1]:
from __future__ import division
import string
import math

In [83]:
tokenize = lambda doc: doc.lower().split(" ")
tokenize

<function __main__.<lambda>>

In [84]:
document_0 = "China has a strong economy that is growing at a rapid pace. However politically it differs greatly from the US Economy."
document_1 = "At last, China seems serious about confronting an endemic problem: domestic violence and corruption."
document_2 = "Japan's prime minister, Shinzo Abe, is working towards healing the economic turmoil in his own country for his view on the future of his people."
document_3 = "Vladimir Putin is working hard to fix the economy in Russia as the Ruble has tumbled."
document_4 = "What's the future of Abenomics. We asked Shinzo Abe for his views"
document_5 = "Obama has eased sanctions on Cuba while accelerating those against the Russian Economy, even as the Ruble's value falls almost daily."
document_6 = "Vladimir Putin is riding a horse while hunting deer. Vladimir Putin always seems so serious about things - even riding horses. Is he crazy?"

In [85]:
all_documents = [document_0, document_1, document_2, document_3, document_4, document_5, document_6]

tokenized_documents = [tokenize(d) for d in all_documents] # tokenized docs
all_tokens_set = set([item for sublist in tokenized_documents for item in sublist])

In [45]:
all_tokens_set

{'-',
 'a',
 'abe',
 'abe,',
 'abenomics.',
 'about',
 'accelerating',
 'against',
 'almost',
 'always',
 'an',
 'and',
 'as',
 'asked',
 'at',
 'china',
 'confronting',
 'corruption.',
 'country',
 'crazy?',
 'cuba',
 'daily.',
 'deer.',
 'differs',
 'domestic',
 'eased',
 'economic',
 'economy',
 'economy,',
 'economy.',
 'endemic',
 'even',
 'falls',
 'fix',
 'for',
 'from',
 'future',
 'greatly',
 'growing',
 'hard',
 'has',
 'he',
 'healing',
 'his',
 'horse',
 'horses.',
 'however',
 'hunting',
 'in',
 'is',
 'it',
 "japan's",
 'last,',
 'minister,',
 'obama',
 'of',
 'on',
 'own',
 'pace.',
 'people.',
 'politically',
 'prime',
 'problem:',
 'putin',
 'rapid',
 'riding',
 'ruble',
 "ruble's",
 'russia',
 'russian',
 'sanctions',
 'seems',
 'serious',
 'shinzo',
 'so',
 'strong',
 'that',
 'the',
 'things',
 'those',
 'to',
 'towards',
 'tumbled.',
 'turmoil',
 'us',
 'value',
 'view',
 'views',
 'violence',
 'vladimir',
 'we',
 "what's",
 'while',
 'working'}

## Jaccard Similarity

In [46]:
def jaccard_similarity(query, document):
    intersection = set(query).intersection(set(document))
    union = set(query).union(set(document))
    return len(intersection)/len(union)

In [47]:
# comparing document_2 and document_4
jaccard_similarity(tokenized_documents[2],tokenized_documents[4])

0.21428571428571427

In [77]:
# Test String 
string_1 = 'I bought a car, I love you'
string_2 = 'I bought a car'


In [78]:
jaccard_similarity(string_1,string_2)

0.6875

## TF-IDF Similarity

In [66]:
def term_frequency(term, tokenized_document):
    return tokenized_document.count(term)

In [67]:
#def sublinear_term_frequency(term, tokenized_document):
#    count = tokenized_document.count(term)
#    if count == 0:
#        return 0
#    return 1 + math.log(count)

def augmented_term_frequency(term, tokenized_document):
    max_count = max([term_frequency(t, tokenized_document) for t in tokenized_document])
    return (0.5 + ((0.5 * term_frequency(term, tokenized_document))/max_count))

In [68]:
def inverse_document_frequencies(tokenized_documents):
    idf_values = {}
    all_tokens_set = set([item for sublist in tokenized_documents for item in sublist])
    for tkn in all_tokens_set:
        contains_token = map(lambda doc: tkn in doc, tokenized_documents)
        idf_values[tkn] = 1 + math.log(len(tokenized_documents)/(sum(contains_token)))
    return idf_values

In [69]:
idf_values = inverse_document_frequencies(tokenized_documents)
print(idf_values['abenomics.'])
print(idf_values['the'])

2.9459101490553135
1.336472236621213


In [70]:
def tfidf(documents):
    tokenized_documents = [tokenize(d) for d in documents]
    idf = inverse_document_frequencies(tokenized_documents)
    tfidf_documents = []
    for document in tokenized_documents:
        doc_tfidf = []
        for term in idf.keys():
            #tf = sublinear_term_frequency(term, document)
            tf = augmented_term_frequency(term, document)
            doc_tfidf.append(tf * idf[term])
        tfidf_documents.append(doc_tfidf)
    return tfidf_documents

In [71]:
tfidf_representation = tfidf(all_documents)
print(tfidf_representation[0])
print("String: ",document_0)

[1.4729550745276567, 1.4729550745276567, 1.4729550745276567, 1.4729550745276567, 1.4729550745276567, 1.126381484247684, 1.126381484247684, 1.4729550745276567, 1.4729550745276567, 1.4729550745276567, 1.4729550745276567, 1.689572226371526, 1.4729550745276567, 1.4729550745276567, 1.4729550745276567, 1.126381484247684, 1.4729550745276567, 1.4729550745276567, 1.126381484247684, 1.4729550745276567, 2.209432611791485, 1.4729550745276567, 1.4729550745276567, 1.4729550745276567, 1.4729550745276567, 2.209432611791485, 1.126381484247684, 1.4729550745276567, 1.126381484247684, 1.4729550745276567, 1.4729550745276567, 1.126381484247684, 1.4729550745276567, 2.209432611791485, 1.4729550745276567, 2.209432611791485, 1.4729550745276567, 1.126381484247684, 2.252762968495368, 1.4729550745276567, 2.209432611791485, 1.0023541774659097, 1.4729550745276567, 1.4729550745276567, 1.4729550745276567, 1.126381484247684, 1.4729550745276567, 1.4729550745276567, 1.126381484247684, 1.4729550745276567, 1.47295507452765

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer

sklearn_tfidf = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize)

sklearn_representation = sklearn_tfidf.fit_transform(all_documents)
print(tfidf_representation[0])
print(sklearn_representation.toarray()[0].tolist())
print(document_0)

[1.4729550745276567, 1.4729550745276567, 1.4729550745276567, 1.4729550745276567, 1.4729550745276567, 1.126381484247684, 1.126381484247684, 1.4729550745276567, 1.4729550745276567, 1.4729550745276567, 1.4729550745276567, 1.689572226371526, 1.4729550745276567, 1.4729550745276567, 1.4729550745276567, 1.126381484247684, 1.4729550745276567, 1.4729550745276567, 1.126381484247684, 1.4729550745276567, 2.209432611791485, 1.4729550745276567, 1.4729550745276567, 1.4729550745276567, 1.4729550745276567, 2.209432611791485, 1.126381484247684, 1.4729550745276567, 1.126381484247684, 1.4729550745276567, 1.4729550745276567, 1.126381484247684, 1.4729550745276567, 2.209432611791485, 1.4729550745276567, 2.209432611791485, 1.4729550745276567, 1.126381484247684, 2.252762968495368, 1.4729550745276567, 2.209432611791485, 1.0023541774659097, 1.4729550745276567, 1.4729550745276567, 1.4729550745276567, 1.126381484247684, 1.4729550745276567, 1.4729550745276567, 1.126381484247684, 1.4729550745276567, 1.47295507452765

In [73]:
def cosine_similarity(vector1, vector2):
    dot_product = sum(p*q for p,q in zip(vector1, vector2))
    magnitude = math.sqrt(sum([val**2 for val in vector1])) * math.sqrt(sum([val**2 for val in vector2]))
    if not magnitude:
        return 0
    return dot_product/magnitude

In [74]:
tfidf_representation = tfidf(all_documents)
our_tfidf_comparisons = []
for count_0, doc_0 in enumerate(tfidf_representation):
    for count_1, doc_1 in enumerate(tfidf_representation):
        our_tfidf_comparisons.append((cosine_similarity(doc_0, doc_1), count_0, count_1))

        
skl_tfidf_comparisons = []
for count_0, doc_0 in enumerate(sklearn_representation.toarray()):
    for count_1, doc_1 in enumerate(sklearn_representation.toarray()):
        skl_tfidf_comparisons.append((cosine_similarity(doc_0, doc_1), count_0, count_1))

for x in zip(sorted(our_tfidf_comparisons, reverse = True), sorted(skl_tfidf_comparisons, reverse = True)):
    print(x)

((1.0000000000000002, 4, 4), (1.0000000000000002, 6, 6))
((1.0000000000000002, 2, 2), (1.0000000000000002, 2, 2))
((1.0, 6, 6), (1.0000000000000002, 0, 0))
((1.0, 5, 5), (1.0, 5, 5))
((1.0, 3, 3), (1.0, 4, 4))
((1.0, 0, 0), (1.0, 3, 3))
((0.9999999999999998, 1, 1), (1.0, 1, 1))
((0.9770488629505589, 3, 2), (0.29310925698840595, 4, 2))
((0.9770488629505589, 2, 3), (0.29310925698840595, 2, 4))
((0.9699118646023268, 5, 3), (0.16506306906464616, 6, 3))
((0.9699118646023268, 3, 5), (0.16506306906464616, 3, 6))
((0.9690036262581565, 5, 2), (0.14060334967136984, 3, 2))
((0.9690036262581565, 2, 5), (0.14060334967136984, 2, 3))
((0.9689890437129889, 3, 0), (0.11766551247749867, 3, 0))
((0.9689890437129889, 0, 3), (0.11766551247749867, 0, 3))
((0.9672437545307783, 2, 0), (0.11478807222952396, 5, 3))
((0.9672437545307783, 0, 2), (0.11478807222952396, 3, 5))
((0.9665492674741823, 6, 3), (0.11212208176085793, 6, 1))
((0.9665492674741823, 3, 6), (0.11212208176085793, 1, 6))
((0.9615473116819654, 6, 

In [81]:
#### Testing Tf-idf
tokenized_documents

[['china',
  'has',
  'a',
  'strong',
  'economy',
  'that',
  'is',
  'growing',
  'at',
  'a',
  'rapid',
  'pace.',
  'however',
  'politically',
  'it',
  'differs',
  'greatly',
  'from',
  'the',
  'us',
  'economy.'],
 ['at',
  'last,',
  'china',
  'seems',
  'serious',
  'about',
  'confronting',
  'an',
  'endemic',
  'problem:',
  'domestic',
  'violence',
  'and',
  'corruption.'],
 ["japan's",
  'prime',
  'minister,',
  'shinzo',
  'abe,',
  'is',
  'working',
  'towards',
  'healing',
  'the',
  'economic',
  'turmoil',
  'in',
  'his',
  'own',
  'country',
  'for',
  'his',
  'view',
  'on',
  'the',
  'future',
  'of',
  'his',
  'people.'],
 ['vladimir',
  'putin',
  'is',
  'working',
  'hard',
  'to',
  'fix',
  'the',
  'economy',
  'in',
  'russia',
  'as',
  'the',
  'ruble',
  'has',
  'tumbled.'],
 ["what's",
  'the',
  'future',
  'of',
  'abenomics.',
  'we',
  'asked',
  'shinzo',
  'abe',
  'for',
  'his',
  'views'],
 ['obama',
  'has',
  'eased',
  'san

In [82]:
idf_values

{'-': 2.9459101490553135,
 'a': 2.252762968495368,
 'abe': 2.9459101490553135,
 'abe,': 2.9459101490553135,
 'abenomics.': 2.9459101490553135,
 'about': 2.252762968495368,
 'accelerating': 2.9459101490553135,
 'against': 2.9459101490553135,
 'almost': 2.9459101490553135,
 'always': 2.9459101490553135,
 'an': 2.9459101490553135,
 'and': 2.9459101490553135,
 'as': 2.252762968495368,
 'asked': 2.9459101490553135,
 'at': 2.252762968495368,
 'china': 2.252762968495368,
 'confronting': 2.9459101490553135,
 'corruption.': 2.9459101490553135,
 'country': 2.9459101490553135,
 'crazy?': 2.9459101490553135,
 'cuba': 2.9459101490553135,
 'daily.': 2.9459101490553135,
 'deer.': 2.9459101490553135,
 'differs': 2.9459101490553135,
 'domestic': 2.9459101490553135,
 'eased': 2.9459101490553135,
 'economic': 2.9459101490553135,
 'economy': 2.252762968495368,
 'economy,': 2.9459101490553135,
 'economy.': 2.9459101490553135,
 'endemic': 2.9459101490553135,
 'even': 2.252762968495368,
 'falls': 2.945910149