# Sentiment Scores: Chunks

For system details and resources, see `PhD_SystemCheck.ipynb` and `PhD_Modules.ipynb`.

In [1]:
from nltk.corpus import PlaintextCorpusReader

In [2]:
import pandas as pd

In [3]:
import re

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
# Splitter used to cut each letter into chunks before sentiment scoring.
# chunk_size and chunk_overlap are measured with length_function (here the
# builtin len, so presumably in characters of the input string - confirm
# against the langchain documentation).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1100,
    chunk_overlap=380,
    length_function=len,
    is_separator_regex=False,
)

In [6]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [7]:
sentimentAnalyzer = SentimentIntensityAnalyzer() # Initialize the VADER sentiment scorer (one instance, reused for every chunk)

In [9]:
# NOTE(review): the second argument is presumably treated by NLTK as a regular
# expression over file ids; '.*txt' has no literal dot before "txt", so any id
# that merely ends in "txt" would be accepted - confirm this is intended.
lettersCorpus = PlaintextCorpusReader('letters/cleaned/', '.*txt') # Define the corpus from the files under letters/cleaned/
len(lettersCorpus.fileids()) # Number of files found in the corpus

576

In [12]:
# Empty dataframe that will receive one row per chunk: the chunk text, the id
# of the letter it came from, its order within that letter, and the four
# sentiment scores (negative, positive, neutral, compound).
sentimentChunk = pd.DataFrame(
    columns=[
        'chunk',
        'docid',
        'sequence',
        'scoreNeg',
        'scorePos',
        'scoreNeu',
        'scoreCompound',
    ]
)
sentimentChunk.head()

Unnamed: 0,chunk,docid,sequence,scoreNeg,scorePos,scoreNeu,scoreCompound


In [13]:
# Score every chunk of every letter and collect the results in sentimentChunk.
# Rows are gathered in a plain list and the dataframe is built once at the end:
# growing a dataframe with pd.concat inside the loop copies all earlier rows on
# every iteration (quadratic in the number of chunks) and leaves every column
# with object dtype. Building the frame in one go keeps the numeric columns
# numeric and makes the cell safe to re-run without duplicating rows.
chunkRows = []
for fileid in lettersCorpus.fileids(): # For each file in the corpus
    text = lettersCorpus.raw(fileid) # Read the whole letter as a single string
    chunks = text_splitter.split_text(text) # Split the letter into chunks
    for sequence, chunk in enumerate(chunks, start=1): # sequence = 1-based order of the chunk within its letter
        scores = sentimentAnalyzer.polarity_scores(chunk) # Calculate sentiment scores
        chunkRows.append({
            'chunk': chunk,
            'docid': fileid,
            'sequence': sequence,
            'scoreNeg': scores['neg'],
            'scorePos': scores['pos'],
            'scoreNeu': scores['neu'],
            'scoreCompound': scores['compound'],
        })
    print(fileid) # Show progress

# One row per chunk; the explicit column list fixes the column order and keeps
# the expected columns even if no chunk was produced.
sentimentChunk = pd.DataFrame(
    chunkRows,
    columns=['chunk', 'docid', 'sequence', 'scoreNeg', 'scorePos', 'scoreNeu', 'scoreCompound'],
)

S1019-D002.txt
S1019-D004.txt
S1019-D005.txt
S1019-D006.txt
S1019-D007.txt
S1019-D008.txt
S1019-D009.txt
S1019-D010.txt
S1019-D011.txt
S1019-D012.txt
S1019-D013.txt
S1019-D014.txt
S1019-D015.txt
S1019-D016.txt
S1019-D017.txt
S1019-D018.txt
S1019-D019.txt
S1019-D020.txt
S1019-D021.txt
S1019-D022.txt
S1019-D023.txt
S1019-D024.txt
S1019-D025.txt
S1019-D026.txt
S1019-D027.txt
S1019-D028.txt
S1019-D029.txt
S1019-D030.txt
S1019-D031.txt
S1019-D032.txt
S1019-D033.txt
S1019-D034.txt
S1019-D035.txt
S1019-D036.txt
S1019-D037.txt
S1019-D038.txt
S1019-D040.txt
S1019-D041.txt
S1019-D042.txt
S1019-D043.txt
S1019-D044.txt
S1019-D045.txt
S1019-D046.txt
S1019-D047.txt
S1019-D048.txt
S1019-D049.txt
S1019-D050.txt
S1019-D051.txt
S1019-D052.txt
S1019-D053.txt
S1019-D054.txt
S1019-D055.txt
S1019-D056.txt
S1019-D057.txt
S1019-D058.txt
S2344-D038.txt
S2344-D039.txt
S2344-D040.txt
S2344-D041.txt
S2344-D042.txt
S2344-D043.txt
S2344-D044.txt
S2344-D045.txt
S2344-D046.txt
S2344-D048.txt
S2344-D049.txt
S2344-D051

S9873-D021.txt
S9908-D030.txt
S9908-D031.txt
S9908-D032.txt
S9908-D033.txt
S9908-D034.txt
S9908-D035.txt
S9908-D036.txt
S9908-D037.txt
S9908-D038.txt
S9908-D039.txt
S9912-D002.txt
S9912-D003.txt
S9912-D004.txt
S9913-D003.txt
S9913-D004.txt
S9913-D006.txt
S9913-D015.txt
S9913-D016.txt
S9957-D012.txt
S9957-D013.txt
S9957-D014.txt
S9957-D015.txt
S9974-D008.txt
S9974-D010.txt
S9974-D029.txt


In [14]:
# Display the dataframe after scoring
sentimentChunk

Unnamed: 0,chunk,docid,sequence,scoreNeg,scorePos,scoreNeu,scoreCompound
0,"TRINIDAD On Train from Steubenville, Ohio, to ...",S1019-D002.txt,1,0.053,0.119,0.827,0.9425
1,Josephine. This letter thrilled us both. I was...,S1019-D002.txt,2,0.05,0.092,0.859,0.8625
2,them at two-thirty without one word of goodbye...,S1019-D002.txt,3,0.037,0.08,0.883,0.6977
3,This in substance was their conversation with ...,S1019-D002.txt,4,0.056,0.132,0.812,0.9451
4,three o'clock A M the baggage checker came thr...,S1019-D002.txt,5,0.059,0.148,0.793,0.9509
...,...,...,...,...,...,...,...
3780,by Chinamen are now sold in this market quite ...,S9974-D029.txt,7,0.074,0.128,0.798,0.9186
3781,high living was possible. To augment this prol...,S9974-D029.txt,8,0.104,0.04,0.856,-0.8997
3782,or a manufactury without steam. The Chinese ar...,S9974-D029.txt,9,0.146,0.018,0.836,-0.9746
3783,enough. The times are hard; there is much suff...,S9974-D029.txt,10,0.132,0.052,0.815,-0.9349


In [15]:
# Remove the .txt extension from the file name.
# The dot is escaped and the pattern is anchored to the end of the string:
# with regex=True an unescaped '.' matches any character, so r'.txt' would
# also strip sequences such as 'atxt' from anywhere inside an id.
sentimentChunk['docid'] = sentimentChunk['docid'].str.replace(r'\.txt$', '', regex=True)
sentimentChunk

Unnamed: 0,chunk,docid,sequence,scoreNeg,scorePos,scoreNeu,scoreCompound
0,"TRINIDAD On Train from Steubenville, Ohio, to ...",S1019-D002,1,0.053,0.119,0.827,0.9425
1,Josephine. This letter thrilled us both. I was...,S1019-D002,2,0.05,0.092,0.859,0.8625
2,them at two-thirty without one word of goodbye...,S1019-D002,3,0.037,0.08,0.883,0.6977
3,This in substance was their conversation with ...,S1019-D002,4,0.056,0.132,0.812,0.9451
4,three o'clock A M the baggage checker came thr...,S1019-D002,5,0.059,0.148,0.793,0.9509
...,...,...,...,...,...,...,...
3780,by Chinamen are now sold in this market quite ...,S9974-D029,7,0.074,0.128,0.798,0.9186
3781,high living was possible. To augment this prol...,S9974-D029,8,0.104,0.04,0.856,-0.8997
3782,or a manufactury without steam. The Chinese ar...,S9974-D029,9,0.146,0.018,0.836,-0.9746
3783,enough. The times are hard; there is much suff...,S9974-D029,10,0.132,0.052,0.815,-0.9349


In [16]:
# Add a column "chunks" holding the highest sequence number within each letter (docid).
# sequence is a 1-based counter per letter, so this is the letter's total number of chunks.
sentimentChunk['chunks'] = sentimentChunk.groupby('docid')['sequence'].transform('max')
sentimentChunk

Unnamed: 0,chunk,docid,sequence,scoreNeg,scorePos,scoreNeu,scoreCompound,chunks
0,"TRINIDAD On Train from Steubenville, Ohio, to ...",S1019-D002,1,0.053,0.119,0.827,0.9425,15
1,Josephine. This letter thrilled us both. I was...,S1019-D002,2,0.05,0.092,0.859,0.8625,15
2,them at two-thirty without one word of goodbye...,S1019-D002,3,0.037,0.08,0.883,0.6977,15
3,This in substance was their conversation with ...,S1019-D002,4,0.056,0.132,0.812,0.9451,15
4,three o'clock A M the baggage checker came thr...,S1019-D002,5,0.059,0.148,0.793,0.9509,15
...,...,...,...,...,...,...,...,...
3780,by Chinamen are now sold in this market quite ...,S9974-D029,7,0.074,0.128,0.798,0.9186,11
3781,high living was possible. To augment this prol...,S9974-D029,8,0.104,0.04,0.856,-0.8997,11
3782,or a manufactury without steam. The Chinese ar...,S9974-D029,9,0.146,0.018,0.836,-0.9746,11
3783,enough. The times are hard; there is much suff...,S9974-D029,10,0.132,0.052,0.815,-0.9349,11


In [17]:
# Add a column "position": the chunk's sequence number divided by the number
# of chunks in its letter, i.e. how far through the letter the chunk appears.
sentimentChunk['position'] = sentimentChunk['sequence'].div(sentimentChunk['chunks'])
sentimentChunk

Unnamed: 0,chunk,docid,sequence,scoreNeg,scorePos,scoreNeu,scoreCompound,chunks,position
0,"TRINIDAD On Train from Steubenville, Ohio, to ...",S1019-D002,1,0.053,0.119,0.827,0.9425,15,0.066667
1,Josephine. This letter thrilled us both. I was...,S1019-D002,2,0.05,0.092,0.859,0.8625,15,0.133333
2,them at two-thirty without one word of goodbye...,S1019-D002,3,0.037,0.08,0.883,0.6977,15,0.2
3,This in substance was their conversation with ...,S1019-D002,4,0.056,0.132,0.812,0.9451,15,0.266667
4,three o'clock A M the baggage checker came thr...,S1019-D002,5,0.059,0.148,0.793,0.9509,15,0.333333
...,...,...,...,...,...,...,...,...,...
3780,by Chinamen are now sold in this market quite ...,S9974-D029,7,0.074,0.128,0.798,0.9186,11,0.636364
3781,high living was possible. To augment this prol...,S9974-D029,8,0.104,0.04,0.856,-0.8997,11,0.727273
3782,or a manufactury without steam. The Chinese ar...,S9974-D029,9,0.146,0.018,0.836,-0.9746,11,0.818182
3783,enough. The times are hard; there is much suff...,S9974-D029,10,0.132,0.052,0.815,-0.9349,11,0.909091


In [18]:
# Save the scored chunks to CSV; index = True also writes the dataframe's row index as the first column
sentimentChunk.to_csv('20240220_PhD_SentimentChunk.csv', index = True)