#Assignment 4.2
#Zach Hill
#DSC-550-T302
#22DEC2019

Sentiment analysis is important in many areas, particularly consumer reviews. From movie critiques to product comments, and even location-tagged Instagram and Twitter posts, a positive or negative connotation in a review can have a profound impact on what others will ultimately purchase or try; we are social animals, after all. Amazon reviews are seen by millions of customers, so whether a product meets its sales goals may well be influenced by these reviews.

In [1]:
import pandas as pd
import json
from nltk.corpus import stopwords
import re
import os
import string
import collections
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# English stop-word list from NLTK; these words are filtered out of the review
# titles further down.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# downloaded beforehand -- confirm in the target environment.
stop_words = stopwords.words('english')

In [2]:
def readFile(inputfile):
    """Load a JSON file into a pandas DataFrame.

    Args:
        inputfile: Path of the JSON document to read.

    Returns:
        A ``pd.DataFrame`` constructed from the parsed JSON content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(inputfile, 'r', encoding='utf-8') as datafile:
        return pd.DataFrame(json.load(datafile))

In [3]:
def cleanText(df):
    """Return a lower-cased, punctuation-stripped text copy of *df*.

    Every cell is converted to ``str`` and lower-cased. Punctuation other
    than the angle brackets is deleted, then a literal backslash followed by
    ``n`` and any remaining backslash are each replaced by a single space.

    Args:
        df: DataFrame whose cells should be normalised.

    Returns:
        A new DataFrame of cleaned strings; *df* itself is not modified.
    """
    # Angle brackets are kept; the rest of string.punctuation is removed.
    removable = string.punctuation.replace('<', '').replace('>', '')
    # The class is built without re.escape, so the `\]` pair inside
    # string.punctuation is read as an escaped `]`. The backslash itself is
    # therefore not matched here and is left for the substitutions below.
    punct_re = re.compile(f'[{removable}]')

    lowered = df.apply(lambda column: column.astype(str).str.lower())
    stripped = lowered.replace(punct_re, '')
    # Two-character backslash + n sequences become a space ...
    stripped = stripped.replace(r'\\n', ' ', regex=True)
    # ... and so does every backslash that is still left.
    return stripped.replace(r'\\', ' ', regex=True)

In [4]:
def createWordSet(df, col):
    """Collect the unique whitespace-separated tokens found in a column.

    Args:
        df: DataFrame holding the text.
        col: Name of the string column to tokenise.

    Returns:
        A ``set`` of every distinct token in ``df[col]``. Missing values
        contribute nothing.
    """
    words = set()
    # str.split() yields NaN for missing cells; drop those so that a single
    # empty review does not raise a TypeError in set.update.
    for tokens in df[col].str.split().dropna():
        words.update(tokens)
    return words

In [5]:
# JSON inputs, in the order asus / dell / surface.
files = ['asus.json', 'dell.json', 'surface.json']

# Read and normalise each input; the unpacking order follows `files`.
asus_clean, dell_clean, surface_clean = (
    cleanText(readFile(path)) for path in files
)

In [6]:
# Build the set once: membership tests against the stop-word list would
# otherwise be a linear scan for every single word.
_stop_word_set = set(stop_words)


def _strip_stop_words(title):
    """Return *title* with stop words removed and the rest joined by one space."""
    return ' '.join(word for word in title.split() if word not in _stop_word_set)


# One-column ('text') frames holding the stop-word-filtered review titles.
asus_txt = pd.DataFrame({'text': asus_clean['Review Title'].apply(_strip_stop_words)})
dell_txt = pd.DataFrame({'text': dell_clean['Review Title'].apply(_strip_stop_words)})
surface_txt = pd.DataFrame({'text': surface_clean['Review Title'].apply(_strip_stop_words)})

# Concatenate each product's titles into a single space-separated document.
txt0 = asus_txt['text'].str.cat(sep=' ')
txt1 = dell_txt['text'].str.cat(sep=' ')
txt2 = surface_txt['text'].str.cat(sep=' ')

In [7]:
col = 'text'

# Unique tokens per product, taken from the filtered titles
# (d0 = asus, d1 = dell, d2 = surface).
d0_words, d1_words, d2_words = (
    createWordSet(frame, col) for frame in (asus_txt, dell_txt, surface_txt)
)

In [8]:
def jaccard_distance(d1_words, d2_words):
    """Return the Jaccard similarity of two word collections.

    Despite the function's name, the value returned is the similarity
    ``len(A & B) / len(A | B)``, not the distance: identical collections
    give 1.0 and disjoint ones give 0.0.

    Args:
        d1_words: Iterable of hashable tokens.
        d2_words: Iterable of hashable tokens.

    Returns:
        A float in ``[0.0, 1.0]``. Two empty collections are treated as
        identical and yield 1.0.
    """
    d1_unique = set(d1_words)
    d2_unique = set(d2_words)
    num_total = len(d1_unique | d2_unique)
    if not num_total:
        # Both inputs are empty: avoid 0/0 and report them as identical,
        # consistent with the 1.0 returned for any other self-comparison.
        return 1.0
    return len(d1_unique & d2_unique) / num_total

In [9]:
words = ['asus', 'dell', 'surface']
word_sets = [d0_words, d1_words, d2_words]

# One column per product: its score against every product, in `words` order.
lst_jd = {
    name: [jaccard_distance(own, other) for other in word_sets]
    for name, own in zip(words, word_sets)
}

# Label the rows with the same product names as the columns.
df_jd = pd.DataFrame(lst_jd, index=words)

print(df_jd)

             asus      dell   surface
asus     1.000000  0.108647  0.200920
dell     0.108647  1.000000  0.099558
surface  0.200920  0.099558  1.000000


In [10]:
def get_vectors(*strs):
    """Return the TF-IDF vectors of the given documents.

    Args:
        *strs: One string per document.

    Returns:
        A dense 2-D array with one row per document, in argument order, and
        one column per vocabulary term learned from those same documents.
    """
    text = list(strs)
    # The documents belong in fit/transform, not in the constructor: the
    # constructor's first parameter is `input`, and passing the corpus there
    # is rejected by current scikit-learn releases.
    return TfidfVectorizer().fit_transform(text).toarray()

In [11]:
def get_cosine_sim(*strs):
    """Return the pairwise cosine-similarity matrix of the given documents.

    Args:
        *strs: One string per document.

    Returns:
        A square array whose entry ``[i, j]`` is the cosine similarity
        between the vectors of document ``i`` and document ``j``.
    """
    doc_vectors = list(get_vectors(*strs))
    similarity = cosine_similarity(doc_vectors)
    return similarity

In [12]:
# Cosine similarity between the three concatenated title documents
# (row/column order: txt0, txt1, txt2).
documents = (txt0, txt1, txt2)
cosim = get_cosine_sim(*documents)
cosim

array([[1.        , 0.42413415, 0.50365911],
       [0.42413415, 1.        , 0.37903314],
       [0.50365911, 0.37903314, 1.        ]])

In [13]:
# Wrap the similarity matrix in a frame labelled by product on both axes:
# column j of `cosim` becomes the j-th named column, rows are named by `words`.
df_cosim = pd.DataFrame(
    {name: cosim[:, pos] for pos, name in enumerate(['asus', 'dell', 'surface'])},
    index=words,
)

print(df_cosim)

             asus      dell   surface
asus     1.000000  0.424134  0.503659
dell     0.424134  1.000000  0.379033
surface  0.503659  0.379033  1.000000
