In [1]:
import math

import pandas as pd
from matplotlib import pyplot as plt
from opinionmining import *
from typing import Set

In [2]:
paths = []
for folder in os.listdir(full_path):
    try:
        for file in os.listdir(os.path.join(full_path, folder)):
            if str(file) != "Readme.txt" and str(file) != ".DS_Store":
                paths.append(os.path.join(full_path, folder, file))
    except NotADirectoryError:
        continue
paths[:3]

['/Users/jossinger/Dropbox/Studies/Bath_Artificial_Intelligence/Course Material/6_NLP/Programming/Submission/data/Reviews-9-products/norton.txt',
 '/Users/jossinger/Dropbox/Studies/Bath_Artificial_Intelligence/Course Material/6_NLP/Programming/Submission/data/Reviews-9-products/Nokia 6600.txt',
 '/Users/jossinger/Dropbox/Studies/Bath_Artificial_Intelligence/Course Material/6_NLP/Programming/Submission/data/Reviews-9-products/Hitachi router.txt']

In [3]:
SAVE = False
pwd = os.getcwd()
filename = "db.csv"
if SAVE:
    db = ReviewDatabase(paths).dataframe.to_csv(os.path.join(pwd, filename))
else:
    db = pd.read_csv(filename)

database = db.where(db["Product_ID"] == 3).copy().dropna()


In [4]:
database

Unnamed: 0.1,Unnamed: 0,Product_ID,Review_ID,Sentence_ID,Sentence,gt_categories,gt_score
923,923.0,3.0,94.0,924.0,I purchased this router at a woodworking show ...,"('performed',)","(2,)"
924,924.0,3.0,94.0,925.0,"Well, when I got it home and mounted it in my ...","('no sentiment',)","(0,)"
925,925.0,3.0,94.0,926.0,"The adjustment knob seemed ok, but when loweri...","('adjustment',)","(-1,)"
926,926.0,3.0,94.0,927.0,I tried it with the springs in and with them o...,"('no sentiment',)","(0,)"
927,927.0,3.0,94.0,928.0,The collet for the 1/4 inch bits is a pain to ...,"('collet',)","(-1,)"
...,...,...,...,...,...,...,...
1209,1209.0,3.0,122.0,1210.0,Overall it is a great value.,"('router',)","(2,)"
1210,1210.0,3.0,122.0,1211.0,For the price I don't think you can beat it.,"('price',)","(3,)"
1211,1211.0,3.0,123.0,1212.0,This is a fantastic tool to use.,"('tool',)","(3,)"
1212,1212.0,3.0,123.0,1213.0,Only problem is that is a bit heavy.,"('heavy',)","(-1,)"


In [5]:
database["ExtractedCategories"] = database["Sentence"].apply(lambda x : FeatureExtraction.categories(x))
database

Unnamed: 0.1,Unnamed: 0,Product_ID,Review_ID,Sentence_ID,Sentence,gt_categories,gt_score,ExtractedCategories
923,923.0,3.0,94.0,924.0,I purchased this router at a woodworking show ...,"('performed',)","(2,)","[show, router, bits, cmt, people]"
924,924.0,3.0,94.0,925.0,"Well, when I got it home and mounted it in my ...","('no sentiment',)","(0,)","[shortcomings, router, table]"
925,925.0,3.0,94.0,926.0,"The adjustment knob seemed ok, but when loweri...","('adjustment',)","(-1,)","[knob, router, adjustment]"
926,926.0,3.0,94.0,927.0,I tried it with the springs in and with them o...,"('no sentiment',)","(0,)","[springs, difference]"
927,927.0,3.0,94.0,928.0,The collet for the 1/4 inch bits is a pain to ...,"('collet',)","(-1,)","[inch, collet, pain, bits]"
...,...,...,...,...,...,...,...,...
1209,1209.0,3.0,122.0,1210.0,Overall it is a great value.,"('router',)","(2,)",[value]
1210,1210.0,3.0,122.0,1211.0,For the price I don't think you can beat it.,"('price',)","(3,)",[price]
1211,1211.0,3.0,123.0,1212.0,This is a fantastic tool to use.,"('tool',)","(3,)",[tool]
1212,1212.0,3.0,123.0,1213.0,Only problem is that is a bit heavy.,"('heavy',)","(-1,)","[problem, bit]"


What is the product?
What are the opinion categories?
What is the sentiment of the opinion categories?

In [6]:
#Create a copy of the database to perform feature extraction
category_table = database.loc[:,"Product_ID":"Sentence"].copy()
# Perform Stemming on the sentences
category_table["Stemmed_Sentence"] = database.Sentence.apply(lambda x: FeatureExtraction.stemming([x])[0])
# Remove the stop words
category_table["Clean_Sentence"] = category_table["Stemmed_Sentence"].apply(lambda x: FeatureExtraction.remove_stop([x])[0])

flattened_nouns = [item for sublist in database.ExtractedCategories for item in sublist]
frequency_sorted_nouns = [item for item, count in Counter(flattened_nouns).most_common()]
midpoint = len(frequency_sorted_nouns)//2
firsthalf = frequency_sorted_nouns[:midpoint]
secondhalf = frequency_sorted_nouns[midpoint:]



The difficulty here is in mapping between effectively a many to one (through the process of stemming) to a one to one, where each category can be read as the most common form of that category.  Relies on explicitly stated product categories. 

Use unsupervised learning to match stemmed review topics to broader content topics
- do this by vectorizing the 

In [7]:
D = FeatureExtraction.fuzzy_match_categories(test_categories=secondhalf, target_categories=firsthalf)
transactions = list(map(lambda x: FeatureExtraction.fuzzy_match_categories(x, D), database.ExtractedCategories))
stemmed_transactions = list(map(lambda x: FeatureExtraction.stemming(x), transactions))
category_table["Stemmed_Transactions"] = stemmed_transactions
item_set = [lst for lst in stemmed_transactions if len(lst) != 0]
category_table

Unnamed: 0,Product_ID,Review_ID,Sentence_ID,Sentence,Stemmed_Sentence,Clean_Sentence,Stemmed_Transactions
923,3.0,94.0,924.0,I purchased this router at a woodworking show ...,i purchas this router at a woodwork show after...,purchas router woodwork watch cmt peopl use de...,"[shop, router, bit, bit, peopl]"
924,3.0,94.0,925.0,"Well, when I got it home and mounted it in my ...",well when i got it home and mount it in my rou...,got home mount router tabl shortcom start,"[shortcom, router, tabl]"
925,3.0,94.0,926.0,"The adjustment knob seemed ok, but when loweri...",the adjust knob seem ok but when lower the rou...,adjust knob ok lower router practic pull turn ...,"[job, router, adjust]"
926,3.0,94.0,927.0,I tried it with the springs in and with them o...,i tri it with the spring in and with them out ...,tri spring notic ani differ,"[spring, differ]"
927,3.0,94.0,928.0,The collet for the 1/4 inch bits is a pain to ...,the collet for the 1/4 inch bit is a pain to g...,collet 1/4 inch bit pain harder,"[inch, collet, pain, bit]"
...,...,...,...,...,...,...,...
1209,3.0,122.0,1210.0,Overall it is a great value.,overal it is a great valu,overal great valu,[valu]
1210,3.0,122.0,1211.0,For the price I don't think you can beat it.,for the price i do n't think you can beat it,price think beat,[price]
1211,3.0,123.0,1212.0,This is a fantastic tool to use.,this is a fantast tool to use,fantast tool use,[tool]
1212,3.0,123.0,1213.0,Only problem is that is a bit heavy.,onli problem is that is a bit heavi,onli problem bit heavi,"[problem, bit]"


Create the stemming dictionary the remaps the stemmed words to the most commonly occuring original word

In [8]:
stemming_tuples = []
stemming_dict = {}
for i, l in enumerate(transactions):
    lst = []
    for j, word in enumerate(l):
        original_word = word
        stemmed_word = stemmed_transactions[i][j]
        stemming_tuples.append((stemmed_word, original_word))
stemming_tuples = sorted(stemming_tuples, key= lambda x: x[0])

key_valuelists = {k : [] for k, v in stemming_tuples}
for (key, value) in stemming_tuples:
    try:
        current_list = key_valuelists.get(key)
        current_list.append(value)
        key_valuelists[key] = current_list
    except KeyError as e:
        print(e)
#
stemming_dict = {k: Counter(v).most_common(1)[0][0] for k, v in key_valuelists.items()}


There is now a way to map back the stemmed features to the legible features that can be used in describing the categories so we can proceed with the apriori algorithm on the stemmed features

In [9]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

d = item_set
te = TransactionEncoder()
te_ary = te.fit(d).transform(d)
df = pd.DataFrame(te_ary, columns = te.columns_)
frequent_items = apriori(df, min_support=0.01, use_colnames=True)
frequent_items

Unnamed: 0,support,itemsets
0,0.014085,(action)
1,0.063380,(adjust)
2,0.017606,(amazon)
3,0.010563,(asset)
4,0.052817,(base)
...,...,...
231,0.017606,"(work, tabl, router)"
232,0.010563,"(spindl, tabl, work)"
233,0.010563,"(height, bit, adjust, job)"
234,0.010563,"(router, adjust, job, tabl)"


Feature Pruning

- features in the sets are not commutative 

In [10]:

doubles = []
def get_compact_phrases(frqt_words: List[set], test_sentences: List[List[str]], max_bounds = 4, min_bounds = 2):
    # for set check the set is within the bounds
    for s in frqt_words:
        if len(s) >= min_bounds and len(s) < max_bounds:
             # get the permutations of the set of words:
            compactness = test_compact_phrase(s, test_sentences)
            print(f"{compactness}\t{s}")
        
def create_permutations(frqt_set: set) -> List[list[str]]:
    return list(itertools.permutations(frqt_set,r=len(frqt_set)))

def compact_distance_between_consecutive_numbers_is_valid(lst:List[int], max_dist=3)->bool:
     # for each permutation calculate the distance between the words, if any of the numbers are larger than 3, continue
    for i in range(len(lst)-1):
        if lst[i+1] - lst[i] > max_dist:
            return False
    return True

def test_compact_phrase(phrase, sentences):
    set_words = set(phrase)
    compact_phrase_count = 0
    for sentence in sentences:
        if compact_phrase_count >= 2:
            return True
        sentence_str = sentence.split(" ")
        set_sentence = set(sentence_str)
        if set_words.issubset(set_sentence):
            indicies = [sentence_str.index(word) for word in set_words]
            is_compact = compact_distance_between_consecutive_numbers_is_valid(indicies)
            if is_compact:
                compact_phrase_count += 1
        else:
            continue
    return False

    
test_sents = list(category_table["Stemmed_Sentence"])
phrases = [phrs for phrs in frequent_items["itemsets"] if len(phrs) >= 2 and len(phrs) < 4]
single_phrases = [phrs for phrs in frequent_items["itemsets"] if len(phrs) < 2]
print(f"# of phrases : {len(phrases)}")
compact_phrases = [ph for ph in phrases if test_compact_phrase(ph, test_sents)]
compact_phrases



# of phrases : 158


[frozenset({'adjust', 'bit'}),
 frozenset({'adjust', 'depth'}),
 frozenset({'adjust', 'height'}),
 frozenset({'adjust', 'router'}),
 frozenset({'adjust', 'tabl'}),
 frozenset({'base', 'edg'}),
 frozenset({'bit', 'chuck'}),
 frozenset({'bit', 'collet'}),
 frozenset({'bit', 'control'}),
 frozenset({'bit', 'depth'}),
 frozenset({'bit', 'diamet'}),
 frozenset({'bit', 'guid'}),
 frozenset({'bit', 'hand'}),
 frozenset({'bit', 'height'}),
 frozenset({'bit', 'inch'}),
 frozenset({'bit', 'one'}),
 frozenset({'bit', 'panel'}),
 frozenset({'bit', 'router'}),
 frozenset({'bit', 'sleev'}),
 frozenset({'bit', 'speed'}),
 frozenset({'bit', 'thing'}),
 frozenset({'bit', 'work'}),
 frozenset({'bit', 'wrench'}),
 frozenset({'chuck', 'sleev'}),
 frozenset({'control', 'motor'}),
 frozenset({'control', 'router'}),
 frozenset({'control', 'speed'}),
 frozenset({'depth', 'router'}),
 frozenset({'depth', 'scale'}),
 frozenset({'depth', 'sleev'}),
 frozenset({'edg', 'guid'}),
 frozenset({'freehand', 'router'}),

Phrase ordering voting. 

In [11]:
def phrase_voting(phrases, sentences):
    phrase_dict = {k:list(itertools.permutations(k, len(k))) for k in phrases}
    max_perm_dict = {}
    for k, perms in phrase_dict.items():
        counts = [0] * len(perms)
        for sentence in sentences:
            sent_str = sentence.split(" ")
            if set(k).issubset(set(sent_str)):
                # get the ordering of the words in the sentence
                indexes = sorted([sent_str.index(word) for word in k if word in sent_str])
                ordered_words = [sent_str[i] for i in indexes]
                for i, perm in enumerate(perms):
                    # iterate through the permutations and count the occurrence of each ordering
                    if tuple(ordered_words) == perm:
                        counts[i] += 1

        # find the permutation with the maximum count
        max_count_index = counts.index(max(counts))
        max_perm_dict[k] = perms[max_count_index]
    return max_perm_dict      


results = phrase_voting(compact_phrases, test_sents)



p-support of feature `ftr` is the number of sentences that `ftr` appears in as a noun or noun phrase, and these sentences must contain no feature phrase that is a superset of `ftr`

In [12]:
def get_supersets(single_phrase:set, compact_phrases:List[set])->List[set]:
    return [phrase for phrase in compact_phrases if single_phrase.issubset(compact_phrases)]

def is_valid_noun(ftr, nouns):
    for noun in nouns:
        if ftr == noun.text:
            return True
    return False
            

def count_sentence(ftr, sentence, feature_phrases):
    # count the number of times ftr appears as a noun 
    # where the sentence does not contain a feature phrase that also contains ftr
    sentence_str = sentence.split(" ")
    sentence_set = set(sentence_str)
    if ftr.issubset(sentence_set):
        doc = nlp(sentence)
        nouns = [token for token in doc if token.pos_ in ["NOUN", "PROPN"]]
        # check if the noun list contains a feature phrase with the same ftr
        super_sets = get_supersets(ftr, compact_phrases=feature_phrases)
        if any(super for super in super_sets if ftr.issubset(super)) and not is_valid_noun(ftr, nouns):
            return 0
        else:
            return 1
    return 0

def p_support_pruning(single_phrases:List[set], sentences:List[set], compact_phrases:List[set], threshold=3):
    p_support = [0] * len(single_phrases)
    for i, ftr in enumerate(single_phrases):
        for sentence in sentences:
            p_support[i] += count_sentence(ftr, sentence, compact_phrases)
    return [phrase for i, phrase in enumerate(single_phrases) if p_support[i] > threshold]
        
reduced_single_phrases = p_support_pruning( single_phrases=single_phrases, 
                                            sentences=test_sents, 
                                            compact_phrases=compact_phrases)
combined_features = reduced_single_phrases + compact_phrases
combined_features
    

[frozenset({'action'}),
 frozenset({'adjust'}),
 frozenset({'amazon'}),
 frozenset({'base'}),
 frozenset({'bit'}),
 frozenset({'bosch'}),
 frozenset({'chuck'}),
 frozenset({'collet'}),
 frozenset({'control'}),
 frozenset({'depth'}),
 frozenset({'edg'}),
 frozenset({'freehand'}),
 frozenset({'guid'}),
 frozenset({'hand'}),
 frozenset({'handl'}),
 frozenset({'height'}),
 frozenset({'hitachi'}),
 frozenset({'hp'}),
 frozenset({'inch'}),
 frozenset({'lock'}),
 frozenset({'machin'}),
 frozenset({'mechan'}),
 frozenset({'motor'}),
 frozenset({'one'}),
 frozenset({'oper'}),
 frozenset({'panel'}),
 frozenset({'perform'}),
 frozenset({'plenti'}),
 frozenset({'plung'}),
 frozenset({'porter'}),
 frozenset({'price'}),
 frozenset({'problem'}),
 frozenset({'qualiti'}),
 frozenset({'review'}),
 frozenset({'rout'}),
 frozenset({'router'}),
 frozenset({'shop'}),
 frozenset({'sleev'}),
 frozenset({'speed'}),
 frozenset({'spring'}),
 frozenset({'start'}),
 frozenset({'switch'}),
 frozenset({'tabl'}),
 fr

## Adjective Extraction
For each sentence in the review database, if it contains
any frequent feature, extract the nearby adjective. If
such an adjective is found, it is considered an opinion
word. A nearby adjective refers to the adjacent
adjective that modifies the noun/noun phrase that is a
frequent feature.


In [20]:
opinion_sets = []

features = combined_features
sentences = database.Sentence.apply(lambda x: FeatureExtraction.clean_sentence(x))
for sentence in sentences:
    doc = nlp(sentence)
    sentence_set = set(token.text for token in doc)
    sentence_str = [token.text for token in doc]
    op_words = []
    for feature in features:
        if feature.issubset(sentence_set):
            feature_position = sentence_str.index(list(feature)[0])
            adjectives = [(feature_position, feature, i, token) for i, token in enumerate(doc) if token.pos_ == "ADJ"]
            op_words.append(adjectives)
    opinion_sets.append(set([tup for op in op_words for tup in op if len(op) > 0]))
opinion_sets

def get_closest_adjective(ary_sets: Set[Tuple[int, frozenset, int, str]]) -> Set[Tuple[int, frozenset, int, str]]:
    if len(ary_sets) == 0:
        return set()
    # Initialize a dictionary
    closest_sets = {}
    for feature_index, feature, adj_index, adj in ary_sets:
        if feature not in closest_sets:
            closest_sets[feature] = (float('inf'), None) 
        # Calculate distance between feature and adjective
        distance = abs(feature_index - adj_index)
        if distance < closest_sets[feature][0]:
            closest_sets[feature] = (distance, (feature_index, feature, adj_index, adj))
    # Extract the closest adjective n_ary from the dictionary
    return set(value[1] for value in closest_sets.values() if value[1])


infreq_opinions = []
for i, opinion_set in enumerate(opinion_sets):

    s = [t for t in nlp(sentences.to_list()[i])]
    closest_opinion_set = get_closest_adjective(opinion_set)
    if closest_opinion_set == set():
        infreq_opinions.append(set())
    op_set = set()
    for _, feature, i_adj, adj in closest_opinion_set:
        token = s[i_adj]

        if token.text == adj.text:
            if token.head.text in feature:
                continue
            if token.head.pos_ == "NOUN":
                op_set.add((adj.text, token.head.text))
            # print(f"token_head= {token.head}, adj = {adj.text}, adj_dep = {adj.dep_}, dep={token.head.dep_}, head_pos = {token.head.pos_}")
    infreq_opinions.append(op_set)
infreq_opinions
        


        
        


[set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 {('fit', 'right')},
 {('larger', 'table')},
 {('best', 'price'), ('personal', 'opinions')},
 set(),
 {('single', 'speed')},
 {('variable', 'speed')},
 {('smaller', 'retrospect')},
 {('smooth', 'cuts')},
 {('heavier', 'router')},
 set(),
 set(),
 {('great', 'price'), ('variable', 'table')},
 {('easier', 'freehand'), ('single', 'router'), ('smaller', 'bits')},
 set(),
 set(),
 set(),
 {('poor', 'luck')},
 set(),
 set(),
 set(),
 set(),
 set(),
 {('easy', 'change')},
 {('previous', 'reviewer')},
 set(),
 {('nice', 'table')},
 set(),
 set(),
 {('big', 'router')},
 set(),
 set(),
 {('straight', 'table')},
 set(),
 {('best', 'table')},
 set(),
 set(),
 set(),
 set(),
 {('variable', 'control')},
 set(),
 set(),
 {('fine', 'knob'), ('parallel', 'guide')},
 {('common', 'bit')},
 set(),
 set(),
 set(),
 {('left', 'thumb'), ('right', 'finger'), ('straight', 'edge')},
 {('great', 'start'),
  ('powerful', 'motor'),
  (