In [26]:
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient
import csv 
import numpy as np 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.cluster import DBSCAN,AgglomerativeClustering

In [27]:
client = TextAnalyticsClient(endpoint=endpoint, credential=AzureKeyCredential(subscription_key))

In [28]:
textToAnalize = ["By choosing a bike over a car, I’m reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I’m proud to be part of that movement.."]

In [29]:
# sentiment analysis - azure client 
result = client.analyze_sentiment(textToAnalize, show_opinion_mining=True)
docs = [doc for doc in result if not doc.is_error]

print("Sentiment of this text: ")
for idx, doc in enumerate(docs):
    print(f"Document text: {textToAnalize[idx]}")
    print(f"Overall sentiment: {doc.sentiment}")

Sentiment of this text: 
Document text: By choosing a bike over a car, I’m reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I’m proud to be part of that movement..
Overall sentiment: positive


In [30]:
def loadData(fileName):
    data = []
    with open(fileName) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                dataNames = row
            else:
                data.append(row)
            line_count += 1
    
    inputs = [data[i][0] for i in range(len(data))][:100]
    outputs = [data[i][1] for i in range(len(data))][:100]
    labelNames = list(set(outputs))
    
    return inputs, outputs, labelNames

crtDir = os.getcwd()
fileName = os.path.join(crtDir, 'data/reviews_mixed.csv')

inputs, outputs, labelNames = loadData(fileName)
print(inputs[:3])

['The rooms are extremely small, practically only a bed.', 'Room safe did not work.', 'Mattress very comfortable.']


In [31]:
# split  the data in train and test
def splitData(inputs, outputs):
    np.random.seed(5)
    noSamples = len(inputs)
    indexes = [i for i in range(noSamples)]
    trainSample = np.random.choice(indexes, int(0.8 * noSamples), replace=False)
    testSample = [i for i in indexes if not i in trainSample]
    
    trainInputs = [inputs[i] for i in trainSample]
    trainOutputs = [outputs[i] for i in trainSample]
    testInputs = [inputs[i] for i in testSample]
    testOutputs = [outputs[i] for i in testSample]

    return trainInputs, trainOutputs, testInputs, testOutputs

trainInputs, trainOutputs, testInputs, testOutputs = splitData(inputs, outputs)

In [32]:
# extract some features from the raw text
# Bag of Words
def extractFeaturesBoW(trainInputs, testInputs):
    vectorizer = CountVectorizer()
    trainFeatures = vectorizer.fit_transform(trainInputs)
    testFeatures = vectorizer.transform(testInputs)
    
    # Additional information
    # vocabulary from the train data 
    vocabWords = list(vectorizer.vocabulary_.keys())[-20:]
    # extracted features
    sampleFeatures = trainFeatures.toarray()[:3]
    
    print("Vocabulary size:", len(vectorizer.vocabulary_), "words")
    print("Train data size:", len(trainInputs), "emails")
    # shape of feature matrix
    print("Train features shape:", trainFeatures.shape)
    print("Some words of the vocab:", vocabWords)
    print("Some features:", sampleFeatures)
    
    return trainFeatures, testFeatures, vectorizer

trainFeatures, testFeatures, vectorizer = extractFeaturesBoW(trainInputs, testInputs)

Vocabulary size: 341 words
Train data size: 80 emails
Train features shape: (80, 341)
Some words of the vocab: ['ran', 'until', 'jiggled', 'handle', 'showers', 'renovations', 'they', 'seem', 'anything', 'else', 'bedroom', 'sofa', 'unconfortable', 'springy', 'everything', 'beds', 'greatest', 'futon', 'sleeper', 'couch']
Some features: [[0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [33]:
# extract some features from the raw text
# tf-idf features - word granularity
def extractFeatures_tf_idf(trainInputs, testInputs):
    vectorizer = TfidfVectorizer(max_features=50)
    trainFeatures = vectorizer.fit_transform(trainInputs)
    testFeatures = vectorizer.transform(testInputs)
    
    # Additional information
    # vocabulary from the train data 
    vocabWords = list(vectorizer.get_feature_names_out())[:10]
    # extracted features
    sampleFeatures = trainFeatures.toarray()[:3]
    
    print("Vocabulary:", vocabWords)
    print("Features:", sampleFeatures)
    
    return trainFeatures, testFeatures, vectorizer

trainFeatures, testFeatures, vectorizer = extractFeatures_tf_idf(trainInputs, testInputs)

Vocabulary: ['all', 'and', 'are', 'area', 'bathroom', 'bed', 'bit', 'clean', 'cold', 'comfortable']
Features: [[0.         0.14603507 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.51211449 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.34685183 0.19759403 0.20555329 0.20555329 0.         0.
  0.         0.         0.         0.         0.         0.
  0.3238264  0.         0.17861231 0.         0.         0.
  0.45121804 0.         0.         0.         0.19759403 0.
  0.22560902 0.20555329]
 [0.         0.81777684 0.         0.         0.         0.
  0.         0.57553543 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0

In [34]:
def featureComputation(vectorizer, data):
    # Fit vectorizer on data to learn vocabulary and compute TF-IDF features
    features = vectorizer.fit_transform(data).toarray()
    return features
    
trainFeatures = featureComputation(vectorizer, trainInputs)
testFeatures = featureComputation(vectorizer, testInputs)

print('Vocabulary: ', vectorizer.get_feature_names_out()[:10])
print('Features: ', trainFeatures[:3])

Vocabulary:  ['aircon' 'and' 'are' 'bathroom' 'bed' 'building' 'cleaned' 'comfortable'
 'filthy' 'for']
Features:  [[0.         0.14603507 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.51211449 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.34685183 0.19759403 0.20555329 0.20555329 0.         0.
  0.         0.         0.         0.         0.         0.
  0.3238264  0.         0.17861231 0.         0.         0.
  0.45121804 0.         0.         0.         0.19759403 0.
  0.22560902 0.20555329]
 [0.         0.81777684 0.         0.         0.         0.
  0.         0.57553543 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.     

In [35]:
# train the model
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(trainFeatures)

computedTestIndexes = kmeans.predict(testFeatures)
computedTestOutputs = [labelNames[value] for value in computedTestIndexes]
for i in range(0, len(testInputs)):
    print(testInputs[i], " -> ", computedTestOutputs[i])

The bed is very comfortable.  ->  positive
Very spacious rooms, quiet and very comfortable.  ->  positive
Corridors filthy
Room filthy
Electrical cables in room not safe
Whole building smelly
Shower repulsive  ->  positive
walls seem to have no sound insulation  ->  positive
The building was under renovation,  ->  positive
no elevator might be a challenge for some people  ->  negative
The bed was highly uncomfortable, although the engineer fixed it  ->  positive
bed, smell.  ->  positive
Detest the glass "door" if shower/tub .. with?  ->  positive
this was expected, clean towels and room cleaned every day.  ->  positive
More plug outlets with surge protectors.  ->  positive
Room was very spacious  ->  negative
Roof terrace great  ->  positive
No tea or coffee making facilities in the rooms  ->  positive
the room had aircon and we had earplugs and slept soundly.  ->  positive
Also, when the bright bathroom lights are turned on, it lights up the whole hotel room, shining thru the frosted

In [36]:
# calculate performance
print("Accuracy before the new feature: ", accuracy_score(testOutputs, computedTestOutputs))

Accuracy before the new feature:  0.3


In [37]:
# make the prediction for our text 
textFeatures = vectorizer.transform(textToAnalize)
textClusterIndex = kmeans.predict(textFeatures)[0]
computedTestOutputs = labelNames[textClusterIndex]
print("Prediction: ", computedTestOutputs)

Prediction:  positive


In [38]:
# Alternativs for k-Means:
def alternativeClustering(trainFeatures, testFeatures, testOutputs, labelNames):
    # Dbscan
    # min_sample nr of points to be in a region for beeing a cluster
    # 0.5 the radius between 2 points to be considerated togheter
    dbscan = DBSCAN(eps=0.5, min_samples=5)
    dbscanLabelsTrain = dbscan.fit_predict(trainFeatures)

    # Performance
    dbscanLabelsTest = dbscan.fit_predict(testFeatures)
    dbscanTestOutputs = [labelNames[value] for value in dbscanLabelsTest]
    dbscanAccuracy = accuracy_score(testOutputs, dbscanTestOutputs)
    print("DBSCAN Accuracy:", dbscanAccuracy)
    
    # Agglomerative Clustering
    # n - clusters = 2 (positive / negative)
    agglo = AgglomerativeClustering(n_clusters=2)
    aggloLabelsTrain = agglo.fit_predict(trainFeatures)
    
    aggloLabelsTest = agglo.fit_predict(testFeatures)
    aggloTestOutputs = [labelNames[value] for value in aggloLabelsTest]
    aggloAccuracy = accuracy_score(testOutputs, aggloTestOutputs)
    print("Agglomerative Clustering Accuracy:", aggloAccuracy)
    
    return dbscanAccuracy, aggloAccuracy

dbscanAccuracy, aggloAccuracy = alternativeClustering(trainFeatures, testFeatures, testOutputs, labelNames)

DBSCAN Accuracy: 0.3
Agglomerative Clustering Accuracy: 0.55
