In [2]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.metrics import accuracy_score, classification_report, f1_score
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn, sentiwordnet as swn
from token_embeddings import generate_embeddings
from tokenize_clean_text import clean_text

# Shared WordNet lemmatizer, created once and reused by swn_classifier() for every token.
lemmatizer = WordNetLemmatizer()


Generate Embeddings

In [3]:
# Build word embeddings for the review corpus.
# Per the original author's note, this writes project_embeddings.csv into the working
# directory; the next cell reads that file back.
generate_embeddings("reviews.csv")

Found 400000 word vectors in glove dict
Found 32221 unique tokens in corpus
Number of embeddings from corpus generated: 22264


In [4]:
# Load the embeddings file; column 0 becomes the index (presumably one word per row —
# confirm against generate_embeddings).
# keep_default_na=False stops pandas from converting vocabulary words such as "null",
# "nan" or "n/a" into NaN index labels, which would otherwise end up as keys/values of the
# word -> centroid dictionary. na_values=[""] still treats genuinely empty cells as missing.
embeddings_df = pd.read_csv(
    "project_embeddings.csv",
    index_col=0,
    keep_default_na=False,
    na_values=[""],
)

Clustering Function

In [6]:
#clustering

def cluster_embeddings(df, num_of_clusters):
    """Cluster the embeddings with k-means and map every word to its cluster's centroid word.

    The "centroid word" of a cluster is the index label of the row in ``df`` that lies
    closest (Euclidean distance) to that cluster's geometric centre.

    Args:
        df: DataFrame of embedding vectors, one row per index label (the labels are used
            as the words).
        num_of_clusters: number of k-means clusters to fit.

    Returns:
        dict mapping each index label of ``df`` to the centroid word of its cluster.
    """
    model = KMeans(n_clusters = num_of_clusters, random_state=42).fit(df)
    labels = model.labels_
    centres = model.cluster_centers_

    # For each geometric centre, find the position of the nearest actual row in df;
    # the distances returned alongside are not needed.
    nearest = pairwise_distances_argmin_min(centres, df, metric='euclidean')
    nearest_positions = nearest[0]
    centroid_words = df.iloc[nearest_positions].index

    # cluster number -> centroid word
    label_to_word = {
        cluster: centroid_words[cluster]
        for cluster in range(len(nearest_positions))
    }

    # centroid word for every row, in row order
    row_centroids = [label_to_word.get(label) for label in labels]

    # word -> centroid word
    return {df.index[row]: row_centroids[row] for row in range(df.shape[0])}
    

Replace words in reviews with their cluster centroids and then calculate score

In [7]:
def replace_with_centroids(review, word_centroid_dict):
    """Return a copy of ``review`` with each known token swapped for its centroid word.

    Args:
        review: iterable of tokens.
        word_centroid_dict: mapping from token to its cluster's centroid word.

    Returns:
        list with one entry per input token: the mapped value when the token is a key of
        ``word_centroid_dict``, otherwise the token itself.
    """
    swapped = []
    for token in review:
        # Tokens that are not in the mapping pass through unchanged.
        swapped.append(word_centroid_dict.get(token, token))
    return swapped
    
    

def swn_classifier(review):
    """Score a tokenised review with SentiWordNet.

    Each token is lemmatised and looked up in WordNet; for tokens that have at least one
    synset, the positive-minus-negative SentiWordNet score of the first synset is added
    to a running total.

    Args:
        review: iterable of word tokens.

    Returns:
        The summed (positive - negative) score over all tokens that could be scored,
        or 0 when no token could be scored.
    """
    sentiment = 0.0
    scored_tokens = 0

    for word in review:
        lemma = lemmatizer.lemmatize(word)
        if not lemma:
            continue

        synsets = wn.synsets(lemma)
        if not synsets:
            continue

        # Only the first synset is used; the original author treats it as the most common sense.
        swn_synset = swn.senti_synset(synsets[0].name())

        # Sentiment contribution is the difference between positive and negative score.
        sentiment += swn_synset.pos_score() - swn_synset.neg_score()
        scored_tokens += 1

    # Default: neither positive nor negative when nothing could be scored.
    if not scored_tokens:
        return 0

    return sentiment

Call the clustering function to generate a word-to-centroid dictionary. `num` (the number of clusters) is a hyperparameter to be tuned for best accuracy.

In [8]:
# Build the word -> centroid-word lookup by clustering the embeddings.
num = 2000   # number of clusters; with the 22264 embeddings reported earlier that is roughly 11 words per cluster
word_centroid_dict = cluster_embeddings(embeddings_df, num)
    

Import the data file, process it (sentiment classification), and export it in the required format

In [None]:
# Load the reviews. The "full_review" column is the text that gets scored
# (per the original note: review title and review content together).
reviews = pd.read_csv("reviews.csv")

# Per-review accumulators, kept in the same order as the rows of `reviews`:
# cleaned tokens, centroid-replaced tokens, and raw sentiment scores.
clean_tokens, replaced_tokens, y_predicted = [], [], []

# For every review: tokenise/clean, swap tokens for their cluster centroids, then score.
for review in reviews["full_review"]:
    clean_t = clean_text(review)
    clean_tokens.append(clean_t)

    replaced_t = replace_with_centroids(clean_t, word_centroid_dict)
    replaced_tokens.append(replaced_t)

    senti_score = swn_classifier(replaced_t)
    y_predicted.append(senti_score)

# Classify each review by the sign of its sentiment score.
#  1 : positive, 0 : neutral, -1 : negative
y_classified = []
for i in y_predicted:
    if i == 0:
        y_classified.append(0)
    elif i > 0:
        y_classified.append(1)
    elif i < 0:
        y_classified.append(-1)

# Attach the intermediate and final results to the frame as new columns.
reviews["tokens"] = clean_tokens
reviews["replaced_centroids"] = replaced_tokens
reviews["sentiment_score"] = y_predicted
reviews["predicted_sentiment"] = y_classified

# Write the augmented frame to disk.
reviews.to_csv("classified_full_review_embeddings.csv", header=True)


In [17]:
# Preview the first rows, including the columns added by the classification cell.
reviews.head()

Unnamed: 0,hotelname,rating,reviewcontent,reviewtitle,true_sentiment,full_review,tokens,replaced_centroids,sentiment_score,predicted_sentiment
0,Hotel Villa Piras,30,My husband and I stayed here for two nights. I...,"Poor Management, Nice Staff",0,"Poor Management, Nice Staff My husband and I s...","[poor, management, nice, staff, husband, staye...","[particularly, enterprise, maybe, office, daug...",0.875,1
1,Hotel Villa Piras,40,We arrived at the hotel late afternoon to a wa...,Lovely hotel on the hill,1,Lovely hotel on the hill We arrived at the hot...,"[lovely, hotel, hill, arrived, hotel, late, af...","[gorgeous, hotel, pine, arriving, hotel, came,...",1.75,1
2,Hotel Punta Negra,50,My wife and i stayed for 3 nights in early Sep...,Had a really lovely time,1,Had a really lovely time My wife and i stayed ...,"[really, lovely, time, wife, stayed, three, ni...","[maybe, gorgeous, though, daughter, stayed, ei...",0.75,1
3,Grande Baia Resort & SPA,30,PROS:,Honeymoon stay,0,Honeymoon stay PROS:,"[honeymoon, stay, pros]","[trips, stayed, aficionados]",0.0,0
4,Hotel Cala Cuncheddi,50,We have spent 2 holidays in this place and boo...,"Best family holidays, great service&beautiful ...",1,"Best family holidays, great service&beautiful ...","[best, family, holidays, great, service, beaut...","[well, daughter, christmas, well, services, go...",2.25,1


In [11]:
# Share of reviews whose predicted class matches the true_sentiment column.
accuracy_score(reviews["true_sentiment"], y_classified)

0.7432107604971104

In [12]:
# average=None yields one F1 value per class rather than a single averaged number.
# NOTE(review): the values are presumably in sorted label order (-1, 0, 1) — confirm
# against the scikit-learn documentation before interpreting them.
f1_score(reviews["true_sentiment"], y_classified, average=None)

array([0.19904332, 0.06845476, 0.85687319])

In [23]:
# Show a few examples (rows 0, 20, 40, 60, 80): the user's rating, the raw text,
# the cleaned tokens, the centroid-replaced tokens and the resulting score.
example_fields = [
    ("user_rating:", "rating"),
    ("review:", "full_review"),
    ("tokens:", "tokens"),
    ("replaced centroids:", "replaced_centroids"),
]
for n in range(0, 100, 20):
    for label, column in example_fields:
        print(label, reviews[column][n])
    # Trailing '\n' leaves a blank line between consecutive examples.
    print("sentiment_score:", reviews["sentiment_score"][n], '\n')

user_rating: 30
review: Poor Management, Nice Staff My husband and I stayed here for two nights. It’s located a mile from the city center — a lovely walk along the water. 
tokens: ['poor', 'management', 'nice', 'staff', 'husband', 'stayed', 'two', 'nights', 'located', 'mile', 'city', 'center', 'lovely', 'walk', 'along', 'water']
replaced centroids: ['particularly', 'enterprise', 'maybe', 'office', 'daughter', 'stayed', 'eight', 'evenings', 'situated', 'kilometer', 'city', 'centers', 'gorgeous', 'walk', 'along', 'seawater']
sentiment_score: 0.875 

user_rating: 50
review: Wonderful people and place. Our stay was so amazing. The hosts Antonello and Vanna were very friendly and helpful. The breakfast was lovely and also the place was very authentic, tidy and clean. All the recommendations from them about routes, sightseeing, restaurants and beaches were very appreciated. We would definitely recommend La Peonia B&B to everyone who intends to visit this part of Sardinia.
tokens: ['wonderful