# Sentiment analysis of sentences holding two keywords from different themes

with combinations of emotions and technology words, and with afinn sentiment score based upon afinn's french word list: https://github.com/fnielsen/afinn/blob/master/afinn/data/AFINN-fr-165.txt

In [5]:
import pandas as pd
import os
from afinn import Afinn
import re

In [7]:
os.chdir('..')

In [8]:
df = pd.read_csv(r'data/csv_files/text_data.csv', sep='|')

In [9]:
df1 = df[['year', 'author', 'title', 'text']]

# Function to split text into sentences
def split_sentences(text):
    # Split the text by period followed by a space
    return text.lsplit('. ')

# split the text in sentences
def sent_tokenizer(text):
    tokenized_sentences = text.split('.')
    tokenized_sentences = [s.lstrip().lower() for s in tokenized_sentences]
    return tokenized_sentences


# Use the explode method to expand the DataFrame
df_expanded = df1.assign(sentence=df1['text'].apply(sent_tokenizer)).explode('sentence')

# Reset index 
df_expanded = df_expanded.reset_index(drop=True)

df_expanded

Unnamed: 0,year,author,title,text,sentence
0,1838,Balzac,Splendeurs et miseres des courtisanes 1 Esther...,SPLENDEURS ET MISÈRESDES COURTISANES Laissez...,splendeurs et misèresdes courtisanes laissez...
1,1838,Balzac,Splendeurs et miseres des courtisanes 1 Esther...,SPLENDEURS ET MISÈRESDES COURTISANES Laissez...,n’est-il pas naturel de vous offrir les fleurs...
2,1838,Balzac,Splendeurs et miseres des courtisanes 1 Esther...,SPLENDEURS ET MISÈRESDES COURTISANES Laissez...,quand j’aurai quelques livres à publier qui po...
3,1838,Balzac,Splendeurs et miseres des courtisanes 1 Esther...,SPLENDEURS ET MISÈRESDES COURTISANES Laissez...,"le secret de cette démarche, tour à tour indol..."
4,1838,Balzac,Splendeurs et miseres des courtisanes 1 Esther...,SPLENDEURS ET MISÈRESDES COURTISANES Laissez...,"dans cet immense rendez-vous, la foule observe..."
...,...,...,...,...,...
442307,1927,Proust,Le Temps retrouve,Le Temps retrouvé (première partie)\n\nMarcel ...,la date à laquelle j’entendais le bruit de la ...
442308,1927,Proust,Le Temps retrouve,Le Temps retrouvé (première partie)\n\nMarcel ...,j’avais le vertige de voir au-dessous de moi e...
442309,1927,Proust,Le Temps retrouve,Le Temps retrouvé (première partie)\n\nMarcel ...,je venais de comprendre pourquoi le duc de gue...
442310,1927,Proust,Le Temps retrouve,Le Temps retrouvé (première partie)\n\nMarcel ...,je m’effrayais que les miennes fussent déjà si...


In [10]:
# subset the dataframe to reduce a column - not needed ('text')
df_expanded_sub = df_expanded[['year', 'author', 'title', 'sentence']]

# Define the Keyword Lists: Create two separate lists containing the keywords you want to check for in the sentences.
# Open two word lists
# 1. list
of = open(r'data\key_word_lists\emotion_list.txt', 'r', encoding='utf-8-sig')
emotion_list = of.read().lower().split('\n')
emotion_list = [i.strip() for i in emotion_list]
emotion_list = list(set(emotion_list))
emotion_list.sort()
of.close()

# 2. list
of = open(r'data\key_word_lists\technology_list.txt', 'r', encoding='utf-8-sig')
technology_list = of.read().lower().split('\n')
technology_list = [i.strip() for i in technology_list]
technology_list = list(set(technology_list))
technology_list.sort()
of.close()


# Function to check for keywords in a single sentence and return the found keywords as strings
def check_sentence_for_keywords(sentence, list1, list2):
    sentence_lower = sentence.lower()
    
    # Find keywords from list1 (emotion words)
    found_emotion_words = [word for word in list1 if word in sentence_lower]
    
    # Find keywords from list2 (technology words)
    found_technology_words = [word for word in list2 if word in sentence_lower]
    
    # Convert lists to comma-separated strings (empty string if no keywords found)
    emotion_string = ', '.join(found_emotion_words) if found_emotion_words else ''
    technology_string = ', '.join(found_technology_words) if found_technology_words else ''
    
    # Return the found keywords as strings
    return emotion_string, technology_string



# Apply the function to each sentence and get the found keywords
keyword_results = df_expanded_sub['sentence'].apply(lambda x: check_sentence_for_keywords(x, emotion_list, technology_list))

# Extract the results into separate columns
df_expanded_sub['emotion_word'] = keyword_results.apply(lambda x: x[0])  # First element (emotion words)
df_expanded_sub['technology_word'] = keyword_results.apply(lambda x: x[1])  # Second element (technology words)

# Create a boolean column for sentences that have keywords from both lists
df_expanded_sub['has_both_keywords'] = df_expanded_sub.apply(
    lambda row: len(row['emotion_word']) > 0 and len(row['technology_word']) > 0, axis=1
)

# Optional: Get sentences that contain keywords from both lists
matching_sentences = df_expanded_sub[df_expanded_sub['has_both_keywords'] == True]
print(f"Found {len(matching_sentences)} sentences with keywords from both lists")


Found 2469 sentences with keywords from both lists


In [11]:
matching_sentences

Unnamed: 0,year,author,title,sentence,emotion_word,technology_word,has_both_keywords
2,1838,Balzac,Splendeurs et miseres des courtisanes 1 Esther...,quand j’aurai quelques livres à publier qui po...,"beauté, souvenir",foyer,True
17,1838,Balzac,Splendeurs et miseres des courtisanes 1 Esther...,quoique au premier abord le plaisir et l’inqu...,"confus, plaisir",lustre,True
25,1838,Balzac,Splendeurs et miseres des courtisanes 1 Esther...,"néanmoins, sa mise, ses manières étaient irrép...",las,foyer,True
58,1838,Balzac,Splendeurs et miseres des courtisanes 1 Esther...,votre réponse? rastignac eut le vertige comme...,vertige,tige,True
151,1838,Balzac,Splendeurs et miseres des courtisanes 1 Esther...,voyez comme la dubarry va bien au dix-huitième...,oreille,foyer,True
...,...,...,...,...,...,...,...
442123,1927,Proust,Le Temps retrouve,et n’était-ce pas le grand-père de mlle de sai...,chagrin,swan,True
442128,1927,Proust,Le Temps retrouve,"enfin swann avait aimé la sœur de legrandin, l...",aimé,swan,True
442146,1927,Proust,Le Temps retrouve,"ce nez charmant, légèrement avancé en forme de...",charmant,swan,True
442287,1927,Proust,Le Temps retrouve,"et même, si je n’avais pas le loisir de prépar...","amour, paraître",lumière,True


In [12]:
# Append afinn sentiment score based upon afinn's french word list: https://github.com/fnielsen/afinn/blob/master/afinn/data/AFINN-fr-165.txt
def afinn_sentiment(sentence):
    afinn = Afinn()
    return afinn.score(sentence)

matching_sentences['afinn_score'] = matching_sentences['sentence'].apply(lambda x: afinn_sentiment(x))
matching_sentences

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching_sentences['afinn_score'] = matching_sentences['sentence'].apply(lambda x: afinn_sentiment(x))


Unnamed: 0,year,author,title,sentence,emotion_word,technology_word,has_both_keywords,afinn_score
2,1838,Balzac,Splendeurs et miseres des courtisanes 1 Esther...,quand j’aurai quelques livres à publier qui po...,"beauté, souvenir",foyer,True,6.0
17,1838,Balzac,Splendeurs et miseres des courtisanes 1 Esther...,quoique au premier abord le plaisir et l’inqu...,"confus, plaisir",lustre,True,-3.0
25,1838,Balzac,Splendeurs et miseres des courtisanes 1 Esther...,"néanmoins, sa mise, ses manières étaient irrép...",las,foyer,True,0.0
58,1838,Balzac,Splendeurs et miseres des courtisanes 1 Esther...,votre réponse? rastignac eut le vertige comme...,vertige,tige,True,0.0
151,1838,Balzac,Splendeurs et miseres des courtisanes 1 Esther...,voyez comme la dubarry va bien au dix-huitième...,oreille,foyer,True,1.0
...,...,...,...,...,...,...,...,...
442123,1927,Proust,Le Temps retrouve,et n’était-ce pas le grand-père de mlle de sai...,chagrin,swan,True,3.0
442128,1927,Proust,Le Temps retrouve,"enfin swann avait aimé la sœur de legrandin, l...",aimé,swan,True,2.0
442146,1927,Proust,Le Temps retrouve,"ce nez charmant, légèrement avancé en forme de...",charmant,swan,True,3.0
442287,1927,Proust,Le Temps retrouve,"et même, si je n’avais pas le loisir de prépar...","amour, paraître",lumière,True,1.0


In [13]:
# add positive, negative, or neutral cetegory
def apply_sentiment_cat(row):
    if row < 0:
        return 'neg'
    elif row == 0:
        return 'neu'
    elif row > 0:
        return 'pos'

matching_sentences['sentiment_cat'] = matching_sentences['afinn_score'].apply(lambda x : apply_sentiment_cat(x) )

matching_sentences

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching_sentences['sentiment_cat'] = matching_sentences['afinn_score'].apply(lambda x : apply_sentiment_cat(x) )


Unnamed: 0,year,author,title,sentence,emotion_word,technology_word,has_both_keywords,afinn_score,sentiment_cat
2,1838,Balzac,Splendeurs et miseres des courtisanes 1 Esther...,quand j’aurai quelques livres à publier qui po...,"beauté, souvenir",foyer,True,6.0,pos
17,1838,Balzac,Splendeurs et miseres des courtisanes 1 Esther...,quoique au premier abord le plaisir et l’inqu...,"confus, plaisir",lustre,True,-3.0,neg
25,1838,Balzac,Splendeurs et miseres des courtisanes 1 Esther...,"néanmoins, sa mise, ses manières étaient irrép...",las,foyer,True,0.0,neu
58,1838,Balzac,Splendeurs et miseres des courtisanes 1 Esther...,votre réponse? rastignac eut le vertige comme...,vertige,tige,True,0.0,neu
151,1838,Balzac,Splendeurs et miseres des courtisanes 1 Esther...,voyez comme la dubarry va bien au dix-huitième...,oreille,foyer,True,1.0,pos
...,...,...,...,...,...,...,...,...,...
442123,1927,Proust,Le Temps retrouve,et n’était-ce pas le grand-père de mlle de sai...,chagrin,swan,True,3.0,pos
442128,1927,Proust,Le Temps retrouve,"enfin swann avait aimé la sœur de legrandin, l...",aimé,swan,True,2.0,pos
442146,1927,Proust,Le Temps retrouve,"ce nez charmant, légèrement avancé en forme de...",charmant,swan,True,3.0,pos
442287,1927,Proust,Le Temps retrouve,"et même, si je n’avais pas le loisir de prépar...","amour, paraître",lumière,True,1.0,pos


Do we get mostly positive, negative, or neutral sentenses from the combination of words describing sensations and technology?

In [14]:
matching_sentences['sentiment_cat'].value_counts()

sentiment_cat
pos    1353
neg     695
neu     421
Name: count, dtype: int64

In [15]:
matching_sentences[matching_sentences['technology_word'] == 'gaz']

Unnamed: 0,year,author,title,sentence,emotion_word,technology_word,has_both_keywords,afinn_score,sentiment_cat
9500,1842,Balzac,La Rabouilleuse,agathe joignit les mains et leva les yeux au c...,"poitrine, éblouissant",gaz,True,4.0,pos
14089,1842,Sue,Les Mysteres de Paris,germain n’a plus revu mademoiselle rigolette d...,"amour, amoureux, heureux",gaz,True,13.0,pos
16024,1842,Sue,Les Mysteres de Paris,"la terre foulée, le gazon arraché prouvent que...","entendre, poitrine",gaz,True,-18.0,neg
16056,1842,Sue,Les Mysteres de Paris,des milliers d’oiseaux gazouillent de temps à ...,"paroles, voix",gaz,True,0.0,neu
17486,1842,Sue,Les Mysteres de Paris,par une de ces idées industrieuses qui ne vien...,profond,gaz,True,1.0,pos
...,...,...,...,...,...,...,...,...,...
423428,1925,Gide,Les Faux monnayeurs,« “j’ai trouvé là l’explication que je chercha...,poitrine,gaz,True,0.0,neu
436717,1925,Proust,Albertine disparue,comme je suivais les allées séparées d’un sous...,souvenir,gaz,True,5.0,pos
438850,1927,Proust,Le Temps retrouve,saint-loup me parlait-il d’une mélodie de schu...,entendre,gaz,True,-3.0,neg
440860,1927,Proust,Le Temps retrouve,n’est-ce pas à mes sensations du genre de cell...,sensation,gaz,True,3.0,pos


The keywords in the positive sentences.

In [17]:
pos_sentiment = matching_sentences[matching_sentences['sentiment_cat'] == 'pos']

The table of the sensation keyword in the sentiments sorted by frequency.

In [18]:
pos_emo_words = pos_sentiment['emotion_word'].value_counts().to_frame().head(20)
pos_emo_words

Unnamed: 0_level_0,count
emotion_word,Unnamed: 1_level_1
rieur,58
las,57
voix,56
profond,48
gai,42
plaisir,37
désir,37
amour,34
souvenir,31
sourire,30


The table of the technology keywords in the sentiments sorted by frequency.

In [20]:
pos_tech_words = pos_sentiment['technology_word'].value_counts().to_frame().head(20)
pos_tech_words

Unnamed: 0_level_0,count
technology_word,Unnamed: 1_level_1
swan,313
lumière,250
gaz,88
tige,80
lampe,55
veilleuse,53
foyer,50
cierge,39
lustre,39
"lumière, lumières",35


The keywords in the negative sentences. 

In [21]:
neg_sentiment =  matching_sentences[matching_sentences['sentiment_cat'] == 'neg']

In [22]:
neg_emo_words = neg_sentiment['emotion_word'].value_counts().to_frame().head(20)
neg_emo_words

Unnamed: 0_level_0,count
emotion_word,Unnamed: 1_level_1
vertige,71
triste,36
voix,31
amour,30
rieur,25
profond,24
las,24
oreille,16
gai,15
heureux,15


In [23]:
neg_tech_words = neg_sentiment['technology_word'].value_counts().to_frame().head(20)
neg_tech_words

Unnamed: 0_level_0,count
technology_word,Unnamed: 1_level_1
tige,122
swan,107
lumière,87
gaz,42
lampe,40
cierge,33
foyer,27
lustre,22
bougie,19
"tige, tiges",15


A comparision. Which emotion words are only in positive sentences?

In [25]:
emo_words_compare = [w for w in list(pos_emo_words.index) if w not in list(neg_emo_words.index)]
emo_words_compare

['plaisir', 'sourire', 'charme', 'charmant', 'aimé', 'beauté', 'doux']

A comparision. Which sensation words are only in negative sentences?

In [26]:
emo_words_compare = [w for w in list(neg_emo_words.index) if w not in list(pos_emo_words.index)]
emo_words_compare

['vertige',
 'doucement',
 'entendre',
 'paroles',
 'poitrine',
 'confus',
 'chagrin']