# Implementing Aspect-Based Sentiment Analysis using Python

Source article: https://medium.com/analytics-vidhya/aspect-based-sentiment-analysis-a-practical-approach-8f51029bbc4a


In [1]:
pip install -r requirements.txt

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Standard data manipulation and analysis libraries
import pandas as pd
import numpy as np

# Natural Language Processing (NLP) libraries
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import stanfordnlp

# Text preprocessing and machine learning libraries
import matplotlib.pyplot as plt
import re
import stanza
import string
from transformers import pipeline
from sklearn.preprocessing import Binarizer
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer

# Topic modeling and clustering libraries
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN

# Interpretability and explanation libraries
import shap

# Additional text processing libraries
import nltk.stem
from nltk.tokenize import word_tokenize, sent_tokenize

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# NOTE(review): stanfordnlp.download prompts on stdin (see the "Y/n" and download
# directory prompts in this cell's output), so this cell blocks an unattended
# "Run All". The cells visible here only use stanza afterwards — confirm that
# the stanfordnlp models are still needed.
stanfordnlp.download('en')
# NLTK resources: stop-word list, Punkt tokenizer models, and the POS tagger model.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
stanza.download('en') # This downloads the English models for the neural pipeline

Using the default treebank "en_ewt" for language "en".
Would you like to download the models for: en_ewt now? (Y/n)

Default download directory: /Users/vitrac/stanfordnlp_resources
Hit enter to continue or type an alternate directory.

Downloading models for: en_ewt
Download location: /Users/vitrac/stanfordnlp_resources/en_ewt_models.zip


100%|██████████| 235M/235M [00:46<00:00, 5.01MB/s] 



Download complete.  Models saved to: /Users/vitrac/stanfordnlp_resources/en_ewt_models.zip
Extracting models file for: en_ewt
Cleaning up...Done.


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vitrac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/vitrac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/vitrac/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 367kB [00:00, 24.8MB/s]                    
2023-10-16 10:32:52 INFO: Downloading default packages for language: en (English) ...
2023-10-16 10:32:53 INFO: File exists: /Users/vitrac/stanza_resources/en/default.zip
2023-10-16 10:32:56 INFO: Finished downloading models and saved to /Users/vitrac/stanza_resources.


In [4]:
# Path to the raw reviews CSV, relative to the notebook's working directory;
# adapt it to your own setup. process_dataframe reads the 'medication' and
# 'comment' columns of this file.
file_path = 'data/raw_data_healthcare.csv'
df = pd.read_csv(file_path)

def process_dataframe(df):
    """Split the 'medication' label into its parts and mask them in 'comment'.

    The 'medication' text is stripped of ASCII punctuation and parsed with the
    pattern ``<treatment name> <treatment code> for <disease>[ Maintenance]``.
    The three parts are stored in the new columns 'Treatment name',
    'Treatment code' and 'Disease'. Every occurrence of any extracted part in
    'comment' is then replaced (case-sensitively) by the placeholders
    "Treatment", "Treatment Code" and "Disease", in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'comment' and 'medication'.

    Returns
    -------
    pandas.DataFrame
        A processed copy of ``df``; the frame passed in is left unchanged.

    Raises
    ------
    ValueError
        If ``df`` has no 'comment' column.
    """
    if 'comment' not in df.columns:
        raise ValueError("DataFrame must contain a column named 'comment'")

    # Work on a copy so the caller's frame is not silently modified.
    df = df.copy()

    punctuations = string.punctuation
    df['medication'] = df['medication'].str.replace(f"[{re.escape(punctuations)}]", "", regex=True)

    # Extracting treatment name, treatment code, and disease name.
    # The pattern targets the punctuation-free medication strings.
    pattern = r'(?P<treatment_name>.+?) (?P<treatment_code>.+?) for (?P<disease_name>.+?)( Maintenance)?$'

    extracted_data = df['medication'].str.extract(pattern)
    df['Treatment name'] = extracted_data['treatment_name']
    df['Treatment code'] = extracted_data['treatment_code']
    df['Disease'] = extracted_data['disease_name']

    def _literal_patterns(values):
        # Distinct, non-null values in first-seen order. Rows whose medication
        # did not match the pattern yield NaN and must not become a pattern;
        # duplicates would only repeat the same substitution once per row.
        # Escaping is a safeguard so the values are always matched literally.
        return [re.escape(value) for value in values.dropna().unique()]

    # Replacing the values in the 'comment' column; the order matters because
    # later placeholders are applied to already-substituted text.
    placeholders = (
        ('treatment_name', "Treatment"),
        ('treatment_code', "Treatment Code"),
        ('disease_name', "Disease"),
    )
    for column, placeholder in placeholders:
        patterns = _literal_patterns(extracted_data[column])
        if patterns:
            df['comment'] = df['comment'].replace(to_replace=patterns, value=placeholder, regex=True)

    return df

# Preprocess the dataframe
df = process_dataframe(df)

# Display the dataframe
df.head()

Unnamed: 0,text_index,medication,rate,comment,Treatment name,Treatment code,Disease
0,0,Inflectra infliximab for Crohns Disease,1.0,Insurance forced me to switch to inflectra due...,Inflectra,infliximab,Crohns Disease
1,1,Remicade infliximab for Rheumatoid Arthritis,,My wife had RA from when she was a child. She ...,Remicade,infliximab,Rheumatoid Arthritis
2,2,Remicade infliximab for Ulcerative Colitis,1.0,This medication Treatment made me in constant ...,Remicade,infliximab,Ulcerative Colitis
3,3,Inflectra infliximab for Crohns Disease,1.0,"I have Fistulizing Crohn's, dx'd 6 yrs ago @ 3...",Inflectra,infliximab,Crohns Disease
4,4,Remicade infliximab for Ulcerative Colitis,10.0,Treatment literally gave me my life back 9 yrs...,Remicade,infliximab,Ulcerative Colitis


### 2 - Implementing Preprocessing for Aspect-Based Sentiment Analysis

In [17]:
def _merge_consecutive_nouns(taggedList):
    """Return the words of ``taggedList`` with adjacent "NN" pairs concatenated.

    ``taggedList`` is a sequence of ``(word, pos_tag)`` pairs. Whenever a word
    tagged "NN" is directly followed by another "NN", the two words are joined
    without a separator into a single entry, and the second word is not emitted
    again on its own unless it starts a further pair.
    """
    merged_words = []
    skip_pair_tail = False  # True when the current word was already consumed by a merge
    last_index = len(taggedList) - 1
    for i in range(len(taggedList)):
        word, tag = taggedList[i][0], taggedList[i][1]
        if i < last_index and tag == "NN" and taggedList[i + 1][1] == "NN":
            merged_words.append(word + taggedList[i + 1][0])
            skip_pair_tail = True
        elif skip_pair_tail:
            # This word is the tail of the pair merged on the previous step.
            skip_pair_tail = False
        else:
            merged_words.append(word)
    return merged_words


def create_flagged_comment(taggedList):
    """Step 3: rebuild the comment text with consecutive nouns merged.

    Parameters
    ----------
    taggedList : sequence of (word, pos_tag) pairs

    Returns
    -------
    str
        The merged words joined by single spaces ('' for an empty input).
        A single-token input and the final token after a merged noun pair are
        both kept.
    """
    return ' '.join(_merge_consecutive_nouns(taggedList))


def create_flagged_comment_get_newwordList(taggedList):
    """Step 3: same merge as ``create_flagged_comment``, returned as a word list.

    Parameters
    ----------
    taggedList : sequence of (word, pos_tag) pairs

    Returns
    -------
    list of str
        The merged words, in order; joining them with spaces gives exactly the
        string returned by ``create_flagged_comment`` for the same input.
    """
    return _merge_consecutive_nouns(taggedList)

def tokenize_and_pos_tag(comment):
    """Step 4: tokenize ``comment``, drop English stop words, POS-tag the rest.

    Returns the list of ``(word, tag)`` pairs produced by ``nltk.pos_tag`` for
    the tokens that are not in NLTK's English stop-word list.
    """
    english_stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for token in nltk.word_tokenize(comment):
        if token in english_stop_words:
            continue
        kept_tokens.append(token)
    return nltk.pos_tag(kept_tokens)

# Module-level stanza pipeline, used by extract_dependencies.
# No `processors` argument is given, so the default English processor set is
# loaded (the log under this cell lists tokenize, pos, lemma, constituency,
# depparse, sentiment and ner).
nlp = stanza.Pipeline('en') # This sets up a default neural pipeline in English

def extract_dependencies(row):
    """Step 5: extract the syntactic dependencies of one comment.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'flagged_comment' (the text to parse) and 'newwordList'
        (the list of words that text was built from).

    Returns
    -------
    list of [dependent_text, head, relation]
        One entry per dependency edge of the FIRST sentence found by the
        module-level ``nlp`` pipeline. ``head`` is the head id as given by the
        parser when it is 0, otherwise the word of ``newwordList`` at position
        ``head id - 1``, or the string 'Out of range' when that position does
        not exist. An empty or whitespace-only comment yields an empty list.
    """
    sentence = row['flagged_comment']
    newwordList = row['newwordList']

    # Nothing to parse: avoid indexing into an empty sentence list below.
    if not sentence.strip():
        return []
    doc = nlp(sentence)
    if not doc.sentences:
        return []

    # Only the first sentence is used; any further sentences are ignored.
    dep_node = []
    for dep_edge in doc.sentences[0].dependencies:
        dep_node.append([dep_edge[2].text, dep_edge[0].id, dep_edge[1]])

    # Replace each non-zero head id by the corresponding word.
    # NOTE(review): this assumes the parser's token numbering lines up with
    # newwordList (whitespace-separated words of the comment) — confirm.
    for entry in dep_node:
        head_id = int(entry[1])
        if head_id != 0:
            source_index = head_id - 1
            if 0 <= source_index < len(newwordList):
                entry[1] = newwordList[source_index]
            else:
                entry[1] = 'Out of range'
    return dep_node

def extract_features(df):
    """Step 6: collect candidate feature words from each row's POS tags.

    For every row of ``df['tagged_comment']`` (a list of ``(word, pos)``
    pairs), keeps the pairs whose tag is JJ, NN, JJR, NNS or RB.

    Returns a copy of ``df`` with two added columns:
    'featureList' (list of ``[word, pos]`` per row) and
    'categoriesList' (list of the matching tags per row).
    """
    kept_tags = ['JJ', 'NN', 'JJR', 'NNS', 'RB']
    result = df.copy()

    per_row_features = []
    per_row_categories = []
    for tagged in result['tagged_comment']:
        selected = [(word, pos) for word, pos in tagged if pos in kept_tags]
        per_row_features.append([[word, pos] for word, pos in selected])
        per_row_categories.append([pos for _, pos in selected])

    result['featureList'] = per_row_features
    result['categoriesList'] = per_row_categories
    return result



2023-10-16 10:54:19 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 367kB [00:00, 13.2MB/s]                    
2023-10-16 10:54:21 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| pos          | combined_charlm     |
| lemma        | combined_nocharlm   |
| constituency | ptb3-revised_charlm |
| depparse     | combined_charlm     |
| sentiment    | sstplus             |
| ner          | ontonotes_charlm    |

2023-10-16 10:54:21 INFO: Using device: cpu
2023-10-16 10:54:21 INFO: Loading: tokenize
2023-10-16 10:54:21 INFO: Loading: pos
2023-10-16 10:54:22 INFO: Loading: lemma
2023-10-16 10:54:22 INFO: Loading: constituen

In [44]:
from tqdm import tqdm

def preprocessing_for_ABSA(df):
    """Run the six preprocessing steps of the ABSA pipeline on ``df['comment']``.

    Steps (each prints 'Step N : Done' when finished):
      1. lower-case, strip ASCII punctuation, split into sentences;
      2. word-tokenize and POS-tag, keeping the first sentence only;
      3. merge consecutive nouns ('flagged_comment', 'newwordList');
      4. remove stop words and re-tag ('new_sentence');
      5. dependency parsing with stanza ('dep_node'), with a progress bar;
      6. feature extraction ('featureList', 'categoriesList').

    Missing comments are treated as empty strings, and a comment that yields no
    sentence gets an empty tag list.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'comment' column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the columns listed above added; the frame passed
        in is left unchanged.
    """
    # Work on a copy so the caller's frame does not gain columns as a side effect.
    df = df.copy()

    # Step 1: normalise and sentence-tokenize the comments.
    # Punctuation is removed before sent_tokenize runs, so the splitter no
    # longer sees sentence-ending marks.
    # NOTE(review): this presumably leaves one "sentence" per comment, which is
    # what makes keeping only sentences[0] in step 2 cover the whole text — confirm.
    df['tokenized_comments'] = df['comment'].fillna('').str.lower()
    df['tokenized_comments'] = df['tokenized_comments'].str.replace(f"[{re.escape(string.punctuation)}]", "", regex=True)
    df['tokenized_comments'] = df['tokenized_comments'].apply(nltk.sent_tokenize)

    print('Step 1 : Done')

    # Step 2: tokenize each sentence into words, and tag each element
    df['tagged_comment'] = df['tokenized_comments'].apply(lambda sentences: [nltk.pos_tag(nltk.word_tokenize(sentence)) for sentence in sentences])
    # An empty comment produces no sentence at all; fall back to an empty tag list.
    df['tagged_comment'] = df['tagged_comment'].apply(lambda sentences: sentences[0] if sentences else [])
    print('Step 2 : Done')

    # Step 3: apply create_flagged_comment to every row
    df['flagged_comment'] = df['tagged_comment'].apply(create_flagged_comment)
    df['newwordList'] = df['tagged_comment'].apply(create_flagged_comment_get_newwordList)
    print('Step 3 : Done')

    # Step 4: build the 'new_sentence' column
    df['new_sentence'] = df['flagged_comment'].apply(tokenize_and_pos_tag)
    print('Step 4 : Done')

    # Step 5: build the 'dep_node' column
    tqdm.pandas(desc="Extracting dependencies")
    df['dep_node'] = df.progress_apply(extract_dependencies, axis=1)
    print('Step 5 : Done')

    # Step 6: extract the candidate features from the POS tags of each comment
    df = extract_features(df)
    print('Step 6 : Done')

    return df

# Preprocess only the first 10 rows, taken as an independent copy of df.
# The recorded run of this cell shows step 5 taking about 4 s per row,
# presumably the reason for working on a small sample here.
df_sample = df.iloc[:10].copy()
df_sample = preprocessing_for_ABSA(df_sample)
df_sample.head()


Step 1 : Done
Step 2 : Done
Step 3 : Done
Step 4 : Done


Extracting dependencies: 100%|██████████| 10/10 [00:39<00:00,  3.95s/it]

Step 5 : Done
Step 6 : Done





Unnamed: 0,text_index,medication,rate,comment,Treatment name,Treatment code,Disease,tokenized_comments,tagged_comment,flagged_comment,newwordList,new_sentence,dep_node,featureList,categoriesList
0,0,Inflectra infliximab for Crohns Disease,1.0,Insurance forced me to switch to inflectra due...,Inflectra,infliximab,Crohns Disease,[insurance forced me to switch to inflectra du...,"[(insurance, NN), (forced, VBD), (me, PRP), (t...",insurance forced me to switch to inflectra due...,"[insurance, forced, me, to, switch, to, inflec...","[(insurance, NN), (forced, VBD), (switch, NN),...","[[insurance, forced, nsubj], [forced, 0, root]...","[[insurance, NN], [due, JJ], [cheaper, JJR], [...","[NN, JJ, JJR, NN, NN, RB, NN, NN, NN, NNS, NN,..."
1,1,Remicade infliximab for Rheumatoid Arthritis,,My wife had RA from when she was a child. She ...,Remicade,infliximab,Rheumatoid Arthritis,[my wife had ra from when she was a child she ...,"[(my, PRP$), (wife, NN), (had, VBD), (ra, VBN)...",my wife had ra from when she was a child she h...,"[my, wife, had, ra, from, when, she, was, a, c...","[(wife, NN), (ra, NN), (child, NN), (used, VBN...","[[my, wife, nmod:poss], [wife, ra, nsubj], [ha...","[[wife, NN], [child, NN], [normal, JJ], [meds,...","[NN, NN, JJ, NNS, NN, NN, NN, JJ, JJ, NN, NN, ..."
2,2,Remicade infliximab for Ulcerative Colitis,1.0,This medication Treatment made me in constant ...,Remicade,infliximab,Ulcerative Colitis,[this medication treatment made me in constant...,"[(this, DT), (medication, NN), (treatment, NN)...",this medicationtreatment made me in constant p...,"[this, medicationtreatment, made, me, in, cons...","[(medicationtreatment, NN), (made, VBD), (cons...","[[this, medicationtreatment, det], [medication...","[[medication, NN], [treatment, NN], [constant,...","[NN, NN, JJ, NN, NN, NNS, NNS, NN, NN, RB, JJ,..."
3,3,Inflectra infliximab for Crohns Disease,1.0,"I have Fistulizing Crohn's, dx'd 6 yrs ago @ 3...",Inflectra,infliximab,Crohns Disease,[i have fistulizing crohns dxd 6 yrs ago 36yo...,"[(i, NNS), (have, VBP), (fistulizing, VBG), (c...",i have fistulizing crohnsdxd 6 yrs ago 36yo 11...,"[i, have, fistulizing, crohnsdxd, 6, yrs, ago,...","[(fistulizing, VBG), (crohnsdxd, NN), (6, CD),...","[[i, have, nsubj], [have, 0, root], [fistulizi...","[[i, NNS], [crohns, NN], [dxd, NN], [yrs, NN],...","[NNS, NN, NN, NN, RB, NNS, RB, JJ, NNS, NNS, N..."
4,4,Remicade infliximab for Ulcerative Colitis,10.0,Treatment literally gave me my life back 9 yrs...,Remicade,infliximab,Ulcerative Colitis,[treatment literally gave me my life back 9 yr...,"[(treatment, NN), (literally, RB), (gave, VBD)...",treatment literally gave me my life back 9 yrs...,"[treatment, literally, gave, me, my, life, bac...","[(treatment, NN), (literally, RB), (gave, VBD)...","[[treatment, gave, nsubj], [literally, gave, a...","[[treatment, NN], [literally, RB], [life, NN],...","[NN, RB, NN, RB, NN, NN, RB, JJ, NNS, NNS, NNS..."


In [45]:
def create_feature_clusters(df):
    """Group each feature word with the words it is syntactically linked to.

    For every row, each ``[word, pos]`` entry of 'featureList' is looked up in
    the row's 'dep_node' edges (``[dependent, head, relation]``). An edge
    contributes when the feature is its dependent or its head and its relation
    is one of the selected dependency labels; the word at the other end of the
    edge is added to the feature's cluster.

    Returns
    -------
    (feature_clusters, flattened_feature_list)
        ``feature_clusters`` is a list of ``[feature, [linked words...]]`` for
        features with at least one linked word (repeated features produce
        repeated entries). ``flattened_feature_list`` is every ``[word, pos]``
        entry of all rows, in order.
    """
    linking_relations = ("nsubj", "acl:relcl", "obj", "dobj", "agent", "advmod",
                         "amod", "neg", "prep_of", "acomp", "xcomp", "compound")

    feature_clusters = []
    flattened_feature_list = []

    for dep_node, features in zip(df['dep_node'], df['featureList']):
        flattened_feature_list.extend(features)
        for feature, _pos in features:
            linked_words = [
                edge[1] if edge[0] == feature else edge[0]
                for edge in dep_node
                if (edge[0] == feature or edge[1] == feature) and edge[2] in linking_relations
            ]
            if linked_words:
                feature_clusters.append([feature, linked_words])

    return feature_clusters, flattened_feature_list

feature_clusters, full_feature_list = create_feature_clusters(df_sample)

# Word -> POS tag lookup; when a word appears with several tags, the last one wins.
dic = {}
for feature_word, feature_pos in full_feature_list:
    dic[feature_word] = feature_pos

# Keep clusters whose feature is tagged "NN", drop a hand-picked list of
# uninformative words, and discard clusters with four or more linked words.
excluded_words = ['i', 'say', 'wife', 'child', 'japan', 'yrs']
finalcluster = [
    cluster
    for cluster in feature_clusters
    if dic[cluster[0]] == "NN"
    and cluster[0] not in excluded_words
    and len(cluster[1]) < 4
]

print('Final cluster :')
print(finalcluster)

Final cluster :
[['insurance', ['forced']], ['insurance', ['forced']], ['treatment', ['symptoms', 'incurred']], ['side', ['effects']], ['treatment', ['symptoms', 'incurred']], ['diet', ['healthy']], ['bloody', ['stools']], ['treatment', ['symptoms', 'incurred']], ['flare', ['ups', 'ups']], ['inflammation', ['ups', 'preventing']], ['treatment', ['symptoms', 'incurred']], ['pain', ['ribpain', 'joint', 'shoulders']], ['pain', ['ribpain', 'joint', 'shoulders']], ['pain', ['ribpain', 'joint', 'shoulders']], ['fatigue', ['horsecramping']], ['thumb', ['move']], ['fair', ['this']], ['side', ['effectsreactions']], ['doctor', ['see']], ['symptom', ['get']], ['side', ['effectsreactions']], ['treatment', ['refusing']], ['pain', ['body', 'everywhere']], ['body', ['pain']], ['pain', ['body', 'everywhere']], ['rash', ['parts']], ['body', ['pain']], ['damage', ['nerve', 'diagnose']], ['damage', ['nerve', 'diagnose']], ['everyone', ['liartill']], ['doctor', ['proves']], ['want', ['i', 'really', 'go']],

In [46]:
# Print the clusters one per line, which is easier to scan than the
# single-line list repr.
print('Final cluster :')
for cluster in finalcluster:
    print(cluster)

Final cluster :
['insurance', ['forced']]
['insurance', ['forced']]
['treatment', ['symptoms', 'incurred']]
['side', ['effects']]
['treatment', ['symptoms', 'incurred']]
['diet', ['healthy']]
['bloody', ['stools']]
['treatment', ['symptoms', 'incurred']]
['flare', ['ups', 'ups']]
['inflammation', ['ups', 'preventing']]
['treatment', ['symptoms', 'incurred']]
['pain', ['ribpain', 'joint', 'shoulders']]
['pain', ['ribpain', 'joint', 'shoulders']]
['pain', ['ribpain', 'joint', 'shoulders']]
['fatigue', ['horsecramping']]
['thumb', ['move']]
['fair', ['this']]
['side', ['effectsreactions']]
['doctor', ['see']]
['symptom', ['get']]
['side', ['effectsreactions']]
['treatment', ['refusing']]
['pain', ['body', 'everywhere']]
['body', ['pain']]
['pain', ['body', 'everywhere']]
['rash', ['parts']]
['body', ['pain']]
['damage', ['nerve', 'diagnose']]
['damage', ['nerve', 'diagnose']]
['everyone', ['liartill']]
['doctor', ['proves']]
['want', ['i', 'really', 'go']]
['difference', ['make']]
['remic