### Apply TF-IDF and cosine distance to check similarity 

In [1]:
from __future__ import division
import string
import math
import pandas as pd

In [2]:
from nltk.tokenize import sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from autocorrect import spell
from replacers import RepeatReplacer, RegexpReplacer

In [3]:
df = pd.read_csv('test_save.csv')

In [4]:
df['Review Text'][0]

"It is and does exactly what the description said it would be and would do. Couldn't be happier with it."

In [5]:
def cleaning_text(sentence):
    regex = RegexpReplacer()
    repeat = RepeatReplacer()
    sentence = sentence.lower()
    words = [repeat.replace(i) for i in sentence.split(" ")]
    words = [regex.replace(i) for i in words]
    sentence = ' '.join(words)
    sentence = [spell(i) for i in sentence.split(" ")]
    sentence = ' '.join(words)
    sentence = [s for s in sentence if s not in string.punctuation]
    return ''.join(sentence)

In [6]:
all_reviews = []
for review in df['Review Text']:
    sentence = cleaning_text(review)
    all_reviews.append(sentence)

In [7]:
all_reviews[0]

'it is and does exactly what the description said it would be and would do could not be happier with it'

In [8]:
# Create a bag of unique words
def unique_word(all_document):
    tokenize = lambda doc: doc.lower().split(" ")
    tokenized_documents = [tokenize(d) for d in all_reviews] # tokenized docs
    all_tokens_set = set([item for sublist in tokenized_documents for item in sublist])
    return [all_tokens_set, tokenized_documents]

In [38]:
list_words, tokenized_documents = unique_word(all_reviews)
tokenized_documents

[['it',
  'is',
  'and',
  'does',
  'exactly',
  'what',
  'the',
  'description',
  'said',
  'it',
  'would',
  'be',
  'and',
  'would',
  'do',
  'could',
  'not',
  'be',
  'happier',
  'with',
  'it'],
 ['i',
  'was',
  'sketchy',
  'at',
  'first',
  'about',
  'these',
  'but',
  'once',
  'you',
  'wear',
  'them',
  'for',
  'a',
  'couple',
  'hours',
  'they',
  'break',
  'in',
  'they',
  'fit',
  'good',
  'on',
  'my',
  'board',
  'an',
  'have',
  'little',
  'wear',
  'from',
  'skating',
  'in',
  'them',
  'they',
  'are',
  'a',
  'little',
  'heavy',
  'but',
  'will',
  'not',
  'get',
  'eaten',
  'up',
  'as',
  'bad',
  'by',
  'your',
  'grip',
  'tape',
  'like',
  'poser',
  'dc',
  'shoes'],
 ['very',
  'mobile',
  'product',
  'eficient',
  'easy',
  'to',
  'use',
  'however',
  'product',
  'needs',
  'a',
  'varmint',
  'guard',
  'critters',
  'are',
  'able',
  'to',
  'gorge',
  'themselves',
  'without',
  'a',
  'guard'],
 ['easy',
  'to',
  'us

## Set up tf-idf 

In [112]:
def term_frequency(term, tokenized_document):
    frequency = tokenized_document.count(term)
    return frequency

In [113]:
def sublinear_term_frequency(term, tokenized_document):
    count = tokenized_document.count(term)
    if count == 0:
        return 0
    return (1 + math.log(count))

#def augmented_term_frequency(term, tokenized_document):
#    max_count = max([term_frequency(t, tokenized_document) for t in tokenized_document])
#    return (0.5 + ((0.5 * term_frequency(term, tokenized_document))/max_count))

In [114]:
def inverse_document_frequencies(list_words, tokenized_documents):
    idf_values = {}
    #all_tokens_set = set([item for sublist in tokenized_documents for item in sublist])
    for tkn in list_words:
        contains_token = map(lambda doc: tkn in doc, tokenized_documents)
        idf_values[tkn] = 1 + math.log(len(tokenized_documents)/(sum(contains_token)))
    return idf_values

In [115]:
idf_values = inverse_document_frequencies(list_words, tokenized_documents)
idf_values

{'': 2.308332819650179,
 'everything': 5.0163830207523885,
 'behave': 6.402677381872279,
 'necessary': 6.402677381872279,
 'libro': 6.402677381872279,
 'watch': 5.0163830207523885,
 'though': 5.0163830207523885,
 'hotel': 6.402677381872279,
 'people': 4.610917912644224,
 'granted': 6.402677381872279,
 'disappointed': 5.30406509320417,
 'managed': 6.402677381872279,
 'protects': 6.402677381872279,
 'disjointed': 6.402677381872279,
 'correct': 5.30406509320417,
 'too': 5.0163830207523885,
 'samsung': 5.0163830207523885,
 'female': 6.402677381872279,
 'fans': 5.30406509320417,
 'mpeg': 6.402677381872279,
 'pages': 5.30406509320417,
 'clip': 6.402677381872279,
 'r': 5.709530201312334,
 'sample': 6.402677381872279,
 'hurt': 6.402677381872279,
 'will': 3.10684051586795,
 'refund': 5.30406509320417,
 'oneit': 6.402677381872279,
 'opener': 6.402677381872279,
 'hit': 5.30406509320417,
 'vegetable': 6.402677381872279,
 'england': 6.402677381872279,
 'money': 3.8377280244107426,
 'trigger': 6.402

In [116]:
def tfidf(tokenized_documents, idf):
    tfidf_documents = []
    for document in tokenized_documents:
        doc_tfidf = []
        for term in idf.keys():
            #tf = augmented_term_frequency(term, document)
            tf = sublinear_term_frequency(term, document)
            doc_tfidf.append(tf * idf[term])
        tfidf_documents.append(doc_tfidf)
    return tfidf_documents

In [117]:
tfidf_representation = tfidf(tokenized_documents, idf_values)

In [118]:
# Measure distance of vector in space
def cosine_similarity(vector1, vector2):
    dot_product = sum(p*q for p,q in zip(vector1, vector2))
    magnitude = math.sqrt(sum([val**2 for val in vector1])) * math.sqrt(sum([val**2 for val in vector2]))
    if not magnitude:
        return 0
    return dot_product/magnitude

In [119]:
our_tfidf_comparisons = []
for index1, review1 in enumerate(tfidf_representation):
    for index2, review2 in enumerate(tfidf_representation):
        our_tfidf_comparisons.append((cosine_similarity(review1, review2), index1, index2))

In [122]:
len(tfidf_representation)

222

In [111]:
print("String 1: ", tokenized_documents[0])
print("String 2: ", tokenized_documents[2])

String 1:  ['it', 'is', 'and', 'does', 'exactly', 'what', 'the', 'description', 'said', 'it', 'would', 'be', 'and', 'would', 'do', 'could', 'not', 'be', 'happier', 'with', 'it']
String 2:  ['very', 'mobile', 'product', 'eficient', 'easy', 'to', 'use', 'however', 'product', 'needs', 'a', 'varmint', 'guard', 'critters', 'are', 'able', 'to', 'gorge', 'themselves', 'without', 'a', 'guard']


## Testing the method

In [103]:
tokenize = lambda doc: doc.lower().split(" ")
string_1 = "I have a camera, you should buy one now"
string_2 = "the camera is good, I should buy one"
string_3 = "Wow, a camera"
string_4 = "Wow, a camera. Wow, a new camera"
string_5 = "Have you see the new camera"
string_6 = "A dog, I love dog than cat. I have a dog, her name is jep"
array_string = [string_1, string_2, string_3, string_4, string_5]
array_test = []
for i in array_string:
    array_test.append(tokenize(cleaning_text(i)))

array_test

[['i', 'have', 'a', 'camera', 'you', 'should', 'buy', 'one', 'now'],
 ['the', 'camera', 'is', 'god', 'i', 'should', 'buy', 'one'],
 ['wow', 'a', 'camera'],
 ['wow', 'a', 'camera', 'wow', 'a', 'new', 'camera'],
 ['have', 'you', 'see', 'the', 'new', 'camera']]

In [104]:
tfidf_representation_test = tfidf(array_test, idf_values)
tfidf_representation_test

[[0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,

In [105]:
for index_1, value_1 in enumerate(tfidf_representation_test):
    for index_2, value_2 in enumerate(tfidf_representation_test):
        similarity = cosine_similarity(value_1, value_2)
        print("String 1:", array_string[index_1])
        print("String 2:", array_string[index_2])
        print(similarity)
        print("-------\n")

String 1: I have a camera, you should buy one now
String 2: I have a camera, you should buy one now
1.0
-------

String 1: I have a camera, you should buy one now
String 2: the camera is good, I should buy one
0.6504322926113169
-------

String 1: I have a camera, you should buy one now
String 2: Wow, a camera
0.5068696281484377
-------

String 1: I have a camera, you should buy one now
String 2: Wow, a camera. Wow, a new camera
0.4587923312234788
-------

String 1: I have a camera, you should buy one now
String 2: Have you see the new camera
0.41717030779951414
-------

String 1: the camera is good, I should buy one
String 2: I have a camera, you should buy one now
0.6504322926113169
-------

String 1: the camera is good, I should buy one
String 2: the camera is good, I should buy one
1.0
-------

String 1: the camera is good, I should buy one
String 2: Wow, a camera
0.4808618108018685
-------

String 1: the camera is good, I should buy one
String 2: Wow, a camera. Wow, a new camera
0