# Sentiment Scores - Letters

Thank you to Stéfan Sinclair, Zoë Wilkinson Saldaña and Sunyam Bagga for code examples and inspiration.

In [10]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentimentAnalyzer = SentimentIntensityAnalyzer() #Initialize sentiment scorer

In [7]:
import nltk
from nltk import word_tokenize

Deleted the following files from the dataset because on close inspection they appeared to be more in the form of journal entries than letters: S9908-D039.txt, S9908-D038.txt, S9908-D037.txt, S9908-D036.txt, S9908-D035.txt, S9908-D034.txt, S9908-D033.txt, S9908-D032.txt, S9908-D031.txt, S9908-D030.txt

In [34]:
from nltk.corpus import PlaintextCorpusReader
lettersCorpus = PlaintextCorpusReader('lettersTest', '.*txt') #Define corpus
len(lettersCorpus.fileids()) #How many files

8

In [35]:
import pandas as pd
df = pd.DataFrame(columns = ['fileid','letterText','Sentiment']) # Create a new dataframe to hold sentences, letter id and compound scores
df.head()

Unnamed: 0,fileid,letterText,Sentiment


In [44]:
import re

for fileid in lettersCorpus.fileids(): # For each file in the corpus
    text = lettersCorpus.raw(fileid) # Place the string into the object "text"
    text = re.sub(r"(\. ){2,}","", text) # Remove ellipses
    text = re.sub(r"(\n+)"," ", text) # Replace blank lines with a single space
    text = re.sub(r"(&dot)","", text) # Remove this expression (a dot)
    text = re.sub(r"(— )","", text) # Remove dashes
    text = text.lower() # Make everything lowercase (because some docs in all caps)
    text = word_tokenize(text) # Place sentences into a list called "sentences"
    text = [word for word in text if word[0].isalpha()]
    scores = sentimentAnalyzer.polarity_scores(text) # Calculate sentiment scores
    new_row = {'fileid':fileid, 'letterText':text,'Sentiment':scores['compound']} # Place the sentence, letter id and compound score for sentence into a dictionary called "new_row" in which the key is the variable and the value is the data
    df = df.append(new_row, ignore_index=True) # Populate a new row in the dataframe with the dictionary info
    print(fileid) # Show progress

AttributeError: 'list' object has no attribute 'encode'

In [37]:
df.head()

Unnamed: 0,fileid,letterText,Sentiment
0,S530-D003.txt,"aug 9, 1913 dear friends, first of all, accep...",0.997
1,S530-D004.txt,"october 6 dear friends, we can't wait for a l...",0.9963
2,S530-D005.txt,"october 20, 1913 dear friends, we have been w...",0.997
3,S530-D006.txt,"dear ones, heartfelt greetings to all of you;...",0.9936
4,S530-D007.txt,"dear ones, i am sending this letter to you wi...",0.9991


In [38]:
sentiment = df
sentiment.head()

Unnamed: 0,fileid,letterText,Sentiment
0,S530-D003.txt,"aug 9, 1913 dear friends, first of all, accep...",0.997
1,S530-D004.txt,"october 6 dear friends, we can't wait for a l...",0.9963
2,S530-D005.txt,"october 20, 1913 dear friends, we have been w...",0.997
3,S530-D006.txt,"dear ones, heartfelt greetings to all of you;...",0.9936
4,S530-D007.txt,"dear ones, i am sending this letter to you wi...",0.9991


In [40]:
df['letterText']

0     aug 9, 1913 dear friends, first of all, accep...
1     october 6 dear friends, we can't wait for a l...
2     october 20, 1913 dear friends, we have been w...
3     dear ones, heartfelt greetings to all of you;...
4     dear ones, i am sending this letter to you wi...
5     may 3, 1922 dear friends, you are probably wo...
6     dear friends, we have already gotten the seco...
7     dear friends, first of all, greetings from al...
Name: letterText, dtype: object

In [42]:
list(df['letterText'].sample(2)) # Display a sample of 10 sentences

[' dear ones, i am sending this letter to you with heartfelt greetings. i always look forward to saturday, because we ride into town to see if we don\'t have any letters waiting for us there. and someone always remembers us and i am very glad of that. we believe what you say about the sale of our property, because i know myself how that goes; that the sale will not be made until the harvest, that we know, because i think you are constantly going to have to be concerned with the crops. but it can be sold after harvest, but we cannot determine the price for you; however, you have heard what they were offering me for it, so judge by that the more it brings, the better for us. we are leaving it completely up to you, uncle, because we know that you understand these things and that you will want to get the highest price. vavrín said that if you get a buyer other than florián, you should sell it for a hundred less. but if there isn\'t any other buyer, vavrín says florián should add some to th

In [43]:
df['letterText'].count()

8

In [30]:
# Add a column "Sentences" showing total number of sentences in letter
df['letterText'] = df.groupby('fileid')['Sequence'].transform('max')
df

Unnamed: 0,Sentence,fileid,Sequence,Sentiment,Sentences
0,baltimore 20 september 1836 dear heinrich: fr...,S10003-D023.txt,1,0.7263,54
1,it was a long and arduous voyage!,S10003-D023.txt,2,0.0000,54
2,however we were and are all well and thank the...,S10003-D023.txt,3,0.8777,54
3,i had written down the events of the voyage fo...,S10003-D023.txt,4,0.2263,54
4,"as you know, we set sail on 12 july.",S10003-D023.txt,5,0.0000,54
...,...,...,...,...,...
37603,"if there are norwegian ships in port, their ca...",S9983-D128.txt,13,0.4588,17
37604,"one brings some herring, another half a bottle...",S9983-D128.txt,14,0.2732,17
37605,we have a good time and sing our good old song...,S9983-D128.txt,15,0.8519,17
37606,"small, improvised parties of this kind help to...",S9983-D128.txt,16,0.8316,17


In [31]:
# Add a column "Position" showing the location of the sentence relative to the whole 
# That is, how far through the letter does the sentence appear?
df['Position'] = df['Sequence'] / df['Sentences']
df

Unnamed: 0,Sentence,fileid,Sequence,Sentiment,Sentences,Position
0,baltimore 20 september 1836 dear heinrich: fr...,S10003-D023.txt,1,0.7263,54,0.018519
1,it was a long and arduous voyage!,S10003-D023.txt,2,0.0000,54,0.037037
2,however we were and are all well and thank the...,S10003-D023.txt,3,0.8777,54,0.055556
3,i had written down the events of the voyage fo...,S10003-D023.txt,4,0.2263,54,0.074074
4,"as you know, we set sail on 12 july.",S10003-D023.txt,5,0.0000,54,0.092593
...,...,...,...,...,...,...
37603,"if there are norwegian ships in port, their ca...",S9983-D128.txt,13,0.4588,17,0.764706
37604,"one brings some herring, another half a bottle...",S9983-D128.txt,14,0.2732,17,0.823529
37605,we have a good time and sing our good old song...,S9983-D128.txt,15,0.8519,17,0.882353
37606,"small, improvised parties of this kind help to...",S9983-D128.txt,16,0.8316,17,0.941176


In [32]:
# Add a column "Last" showing whether the sentence is the last in a letter
df['Last'] = "False"
df.head()

Unnamed: 0,Sentence,fileid,Sequence,Sentiment,Sentences,Position,Last
0,baltimore 20 september 1836 dear heinrich: fr...,S10003-D023.txt,1,0.7263,54,0.018519,False
1,it was a long and arduous voyage!,S10003-D023.txt,2,0.0,54,0.037037,False
2,however we were and are all well and thank the...,S10003-D023.txt,3,0.8777,54,0.055556,False
3,i had written down the events of the voyage fo...,S10003-D023.txt,4,0.2263,54,0.074074,False
4,"as you know, we set sail on 12 july.",S10003-D023.txt,5,0.0,54,0.092593,False


In [33]:
import numpy as np
df['Last'] = np.where((df.Position == 1),'True',df.Last)
df

Unnamed: 0,Sentence,fileid,Sequence,Sentiment,Sentences,Position,Last
0,baltimore 20 september 1836 dear heinrich: fr...,S10003-D023.txt,1,0.7263,54,0.018519,False
1,it was a long and arduous voyage!,S10003-D023.txt,2,0.0000,54,0.037037,False
2,however we were and are all well and thank the...,S10003-D023.txt,3,0.8777,54,0.055556,False
3,i had written down the events of the voyage fo...,S10003-D023.txt,4,0.2263,54,0.074074,False
4,"as you know, we set sail on 12 july.",S10003-D023.txt,5,0.0000,54,0.092593,False
...,...,...,...,...,...,...,...
37603,"if there are norwegian ships in port, their ca...",S9983-D128.txt,13,0.4588,17,0.764706,False
37604,"one brings some herring, another half a bottle...",S9983-D128.txt,14,0.2732,17,0.823529,False
37605,we have a good time and sing our good old song...,S9983-D128.txt,15,0.8519,17,0.882353,False
37606,"small, improvised parties of this kind help to...",S9983-D128.txt,16,0.8316,17,0.941176,False


In [34]:
# Add a column "SentimentLTR" showing average sentiment for letter
df['SentimentLTR'] = df.groupby('fileid')['Sentiment'].transform('mean')
df

Unnamed: 0,Sentence,fileid,Sequence,Sentiment,Sentences,Position,Last,SentimentLTR
0,baltimore 20 september 1836 dear heinrich: fr...,S10003-D023.txt,1,0.7263,54,0.018519,False,0.171052
1,it was a long and arduous voyage!,S10003-D023.txt,2,0.0000,54,0.037037,False,0.171052
2,however we were and are all well and thank the...,S10003-D023.txt,3,0.8777,54,0.055556,False,0.171052
3,i had written down the events of the voyage fo...,S10003-D023.txt,4,0.2263,54,0.074074,False,0.171052
4,"as you know, we set sail on 12 july.",S10003-D023.txt,5,0.0000,54,0.092593,False,0.171052
...,...,...,...,...,...,...,...,...
37603,"if there are norwegian ships in port, their ca...",S9983-D128.txt,13,0.4588,17,0.764706,False,0.233053
37604,"one brings some herring, another half a bottle...",S9983-D128.txt,14,0.2732,17,0.823529,False,0.233053
37605,we have a good time and sing our good old song...,S9983-D128.txt,15,0.8519,17,0.882353,False,0.233053
37606,"small, improvised parties of this kind help to...",S9983-D128.txt,16,0.8316,17,0.941176,False,0.233053


In [14]:
df.to_csv('20210127_AM_sentimentLetters.csv', index = True)

## References

Moody, A., & Bagga, S. (2020, July 22-24). A comparative study of sentiment and topics in migration related tweets [Poster]. Digital Humanities 2020 Conference, Ottawa, ON, Canada.https://dh2020.adho.org/abstracts/

Saldaña, Z. W. (2018, January 15). Sentiment Analysis for Exploratory Data Analysis. The Programming Historian. https://doi.org/10.46430/phen0079

Sinclair, S. (2017, December 13). Sentiment Analysis. The Art of Literary Text Analysis. https://github.com/sgsinclair/alta/blob/915579fc1c6926b8fcb2a38f95349a2d6cba00b5/ipynb/SentimentAnalysis.ipynb