# Sentiment Scores: Diaries

In [1]:
import re
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import PlaintextCorpusReader
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import RegexpTokenizer

In [2]:
# Word tokenizer: keeps runs of word characters. The pattern is a raw string because
# '\w' inside a plain string literal is an invalid escape sequence.
tokenizer = RegexpTokenizer(r'\w+')
# Sentence splitter loaded from NLTK's data directory.
# NOTE(review): newer NLTK releases distribute Punkt as 'punkt_tab' rather than a pickle —
# verify this resource path against the installed NLTK version.
sentenceTokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Sentiment scorer used for the neg/neu/pos/compound scores below.
sentimentAnalyzer = SentimentIntensityAnalyzer()

In [3]:
# Number of entries in the lexicon the sentiment analyzer loaded.
len(sentimentAnalyzer.lexicon)

7268

In [110]:
# Define the corpus as the .txt files directly inside the "test" folder.
# The file pattern is matched against paths relative to the root, sub-folders included,
# so the looser '.*txt' would also pull in the cleaned copies written to test/cleaned/
# once they exist. '[^/]*' keeps the match to top-level files and '\.' requires a real dot.
Corpus = PlaintextCorpusReader('test', r'[^/]*\.txt')
len(Corpus.fileids())  # How many files

1

Make a new folder called "cleaned" inside the corpus folder (here `test/`) to hold the cleaned output files.

# Building / testing the text cleaning script

In [17]:
# Load one raw diary transcription to prototype the cleaning steps on.
# Other inputs used while testing:
#   "test/test.txt", "test/D0002_periods.txt", "20240628_PhD_Diaries/cleaned/D0002.txt"
# The context manager closes the file handle as soon as the text has been read.
# NOTE(review): assumes the transcription is UTF-8 (the encoding the cleaned files are
# written with) — confirm for files from other sources.
with open("test/D0002.txt", "r", encoding="utf-8") as diary_file:
    text = diary_file.read()
# To work on an excerpt only, slice or split here, e.g.
#   text = text[:1374]   or   text = text.split('9th weather')[0]
print(text[:1374])

"Charra"
Wednesday Nov 4th 1883 A nice cool 
day Mrs Roberts + I were talking nearly 
the whole night feel very tired today
Tom left just after breakfast for
 Mr Hiern's Station he will not be back
 until Sunday or Monday, he is talking
 of going out to the new Tanks over
 100 miles from here if he can make 
sure of getting water I Darned up the
 socks, + Stockings, I had with me + cut
 out some under clothes.
"Charra"
 Thursday Nov 8th North wind
first thing this morning with heavy
thunder but only a few drops of rain,
I was so sleepy did not get up till
very late had been hand sewing which
is very tiresome My throat keep very sore
still the Cold has not nearly left one
Worrier ill with cold our Black Boy
"Charra"
Friday Nov 9th We have had thunder
+ lightning with a little rain off + on all
day I have been sewing all day but Mrs
Roberts keeps me talking after going to bed
till all hours of the night so that I can't get
up in the morning, have had a head ache all
day from want of rest

In [12]:
# Prototype of the cleaning pipeline: ordered (pattern, replacement) pairs applied to `text`.
preview_rules = [
    (r"\n+", " "),            # join lines: each run of newlines becomes one space
    (r"\[[^?]*?\]", ""),      # drop bracketed (i.e., transcriber) notes that contain no "?"
    (r"\[", ""),              # uncertain readings "[word?]": drop the opening bracket ...
    (r"\?\]", ""),            # ... and the closing "?]", keeping the word itself
    (r"—{2,}", " "),          # two or more em dashes -> space
    (r"-{2,}", " "),          # two or more hyphens -> space
    (r"&dot", ""),            # literal "&dot" markup
    (r"_", ""),               # underscores
    (r"[ÃÂ]", ""),            # stray special characters
    (r"[£₤]", " pounds "),    # pound / lira symbols -> the word "pounds"
    (r" & ", " and "),        # free-standing ampersand -> "and"
    (r"\+", " and "),         # plus sign -> "and", padded so "a+b" does not become "aandb"
    (r"\r", " "),             # carriage returns
    (r"˙", ""),               # dot-above character
    (r"#PAGE", ""),           # page markers
    (r"\(sic\)", ""),         # "(sic)" annotations
    (r"(\. ){2,}", ""),       # spaced ellipses (". . . ")
    (r"[\\/]", ""),           # backslashes and forward slashes, leaving their contents
    (r'"', ""),               # quotation marks
]
for pattern, replacement in preview_rules:
    text = re.sub(pattern, replacement, text)

# Normalise whitespace last, because several substitutions above insert spaces;
# then make sure the text ends with exactly one newline.
text = re.sub(r"\s+", " ", text).strip() + "\n"
print(text[:1374])

Charra Wednesday Nov 4th 1883 A nice cool day Mrs Roberts and I were talking nearly the whole night feel very tired today Tom left just after breakfast for Mr Hiern's Station he will not be back until Sunday or Monday, he is talking of going out to the new Tanks over 100 miles from here if he can make sure of getting water I Darned up the socks, and Stockings, I had with me and cut out some under clothes. Charra Thursday Nov 8th North wind first thing this morning with heavy thunder but only a few drops of rain, I was so sleepy did not get up till very late had been hand sewing which is very tiresome My throat keep very sore still the Cold has not nearly left one Worrier ill with cold our Black Boy Charra Friday Nov 9th We have had thunder and lightning with a little rain off and on all day I have been sewing all day but Mrs Roberts keeps me talking after going to bed till all hours of the night so that I can't get up in the morning, have had a head ache all day from want of rest and s

In [16]:
# Split the cleaned text into sentences, then preview the first five.
sentences = sentenceTokenizer.tokenize(text)
sentences[:5]

["Charra Wednesday Nov 4th 1883 A nice cool day Mrs Roberts and I were talking nearly the whole night feel very tired today Tom left just after breakfast for Mr Hiern's Station he will not be back until Sunday or Monday, he is talking of going out to the new Tanks over 100 miles from here if he can make sure of getting water I Darned up the socks, and Stockings, I had with me and cut out some under clothes.",
 "Charra Thursday Nov 8th North wind first thing this morning with heavy thunder but only a few drops of rain, I was so sleepy did not get up till very late had been hand sewing which is very tiresome My throat keep very sore still the Cold has not nearly left one Worrier ill with cold our Black Boy Charra Friday Nov 9th We have had thunder and lightning with a little rain off and on all day I have been sewing all day but Mrs Roberts keeps me talking after going to bed till all hours of the night so that I can't get up in the morning, have had a head ache all day from want of rest

# Now calculate sentiment for the whole folder using the medium VADER lexicon

In [133]:
# Empty results table: one row per document holding its text, id, token metrics
# and the four sentiment scores.
result_columns = [
    'text',
    'docid',
    'totalTokens',
    'uniqueTokens',
    'lexicalDiversity',
    'scoreNeg',
    'scoreNeu',
    'scorePos',
    'scoreCom',
]
sentiment = pd.DataFrame(columns=result_columns)
sentiment.head()

Unnamed: 0,text,docid,totalTokens,uniqueTokens,lexicalDiversity,scoreNeg,scoreNeu,scorePos,scoreCom


In [134]:
# Define the corpus as the .txt files directly inside the "test" folder.
# The file pattern is matched against paths relative to the root, sub-folders included,
# so the looser '.*txt' would also pull in the cleaned copies written to test/cleaned/
# once they exist. '[^/]*' keeps the match to top-level files and '\.' requires a real dot.
Corpus = PlaintextCorpusReader('test', r'[^/]*\.txt')
len(Corpus.fileids())  # How many files

1

In [135]:
# Folder (inside the corpus root) that receives the cleaned copy of every document.
CLEANED_DIR = Path("test/cleaned")

# Column order of the results table.
SENTIMENT_COLUMNS = ['text', 'docid', 'totalTokens', 'uniqueTokens', 'lexicalDiversity',
                     'scoreNeg', 'scoreNeu', 'scorePos', 'scoreCom']

# Ordered (pattern, replacement) substitutions applied to each raw transcription.
CLEANING_RULES = [
    (r"\n+", " "),            # join lines: each run of newlines becomes one space
    (r"\[[^?]*?\]", ""),      # drop bracketed (i.e., transcriber) notes that contain no "?"
    (r"\[", ""),              # uncertain readings "[word?]": drop the opening bracket ...
    (r"\?\]", ""),            # ... and the closing "?]", keeping the word itself
    (r"—{2,}", " "),          # two or more em dashes -> space
    (r"-{2,}", " "),          # two or more hyphens -> space
    (r"&dot", ""),            # literal "&dot" markup
    (r"_", ""),               # underscores
    (r"[ÃÂ]", ""),            # stray special characters
    (r"[£₤]", " pounds "),    # pound / lira symbols -> the word "pounds"
    (r" & ", " and "),        # free-standing ampersand -> "and"
    (r"\+", " and "),         # plus sign -> "and", padded so "a+b" does not become "aandb"
    (r"\r", " "),             # carriage returns
    (r"˙", ""),               # dot-above character
    (r"#PAGE", ""),           # page markers
    (r"\(sic\)", ""),         # "(sic)" annotations
    (r"(\. ){2,}", ""),       # spaced ellipses (". . . ")
    (r"[\\/]", ""),           # backslashes and forward slashes, leaving their contents
    (r'"', ""),               # quotation marks
]


def clean_text(raw):
    """Return `raw` with all CLEANING_RULES applied, as one line ending in a single newline.

    Whitespace is collapsed only after every substitution has run, because several
    rules insert spaces of their own.
    """
    cleaned = raw
    for pattern, replacement in CLEANING_RULES:
        cleaned = re.sub(pattern, replacement, cleaned)
    return re.sub(r"\s+", " ", cleaned).strip() + "\n"


def score_document(document):
    """Return the mean neg/neu/pos/compound sentence scores of `document`.

    The result is a dict keyed by the results-table column names
    ('scoreNeg', 'scoreNeu', 'scorePos', 'scoreCom'). A document that yields no
    sentences gets NaN for every score instead of raising ZeroDivisionError.
    """
    columns = {"neg": "scoreNeg", "neu": "scoreNeu", "pos": "scorePos", "compound": "scoreCom"}
    sentences = sentenceTokenizer.tokenize(document)
    if not sentences:
        return {column: np.nan for column in columns.values()}
    totals = dict.fromkeys(columns, 0.0)
    for sentence in sentences:
        scores = sentimentAnalyzer.polarity_scores(sentence)
        for key in totals:
            totals[key] += scores[key]
    return {columns[key]: total / len(sentences) for key, total in totals.items()}


records = []  # one dict per document; the DataFrame is built once after the loop
for fileid in Corpus.fileids():  # For each file in the corpus
    if fileid.startswith("cleaned/"):
        # Output of an earlier run that the corpus pattern picked up: do not re-clean it.
        continue

    # Preprocess and save the cleaned text
    text = clean_text(Corpus.raw(fileid))
    out_path = CLEANED_DIR / fileid
    out_path.parent.mkdir(parents=True, exist_ok=True)  # create test/cleaned/ on first use
    out_path.write_text(text, encoding='utf-8')

    # Basic metrics
    tokens = tokenizer.tokenize(text)
    totalTokens = len(tokens)
    uniqueTokens = len(set(tokens))
    lexicalDiversity = uniqueTokens / totalTokens if totalTokens else np.nan

    # Sentiment, averaged over the document's sentences
    record = {
        'text': text,
        'docid': fileid,
        'totalTokens': totalTokens,
        'uniqueTokens': uniqueTokens,
        'lexicalDiversity': lexicalDiversity,
    }
    record.update(score_document(text))
    records.append(record)
    print(fileid)  # Show progress

# Rebuilding the table from scratch keeps this cell idempotent: re-running it
# no longer appends duplicate rows to an existing `sentiment` frame.
sentiment = pd.DataFrame(records, columns=SENTIMENT_COLUMNS)
print("done")

D0002_periods.txt
done


In [136]:
# Display the per-document results table.
sentiment

Unnamed: 0,text,docid,totalTokens,uniqueTokens,lexicalDiversity,scoreNeg,scoreNeu,scorePos,scoreCom
0,Charra Wednesday Nov 4th 1883. A nice cool day...,D0002_periods.txt,14055,1916,0.136322,0.052131,0.896698,0.051171,-0.012982


In [92]:
# Remove the .txt extension from the file name.
# The dot is escaped and the pattern anchored to the end of the string: the original
# '.txt' matched any character followed by "txt" anywhere in the id (e.g. "_txt" in the middle).
sentiment['docid'] = sentiment['docid'].str.replace(r'\.txt$', '', regex=True)
sentiment

Unnamed: 0,text,docid,totalTokens,uniqueTokens,lexicalDiversity,scoreNeg,scoreNeu,scorePos,scoreCom
0,Charra Wednesday Nov 4th 1883 A nice cool day ...,D0002,81551,6885,0.084426,0.057039,0.89367,0.049316,-0.178757
1,May 6th Very wet morning it has stopped me fro...,D0003,19635,1939,0.098752,0.02814,0.916614,0.053863,0.070318
2,Diary of Capt. John Hart 1865 1 January 1 Sund...,D0007,54919,6332,0.115297,0.050477,0.896772,0.052744,-0.017036
3,Edith Gwynne July 14 71 Glynde Place July 17 B...,D0009,28146,5644,0.200526,0.039007,0.910292,0.050688,0.025624
