In [1]:
import numpy as np
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize
import nltk

import string
import os
import regex as re
from collections import defaultdict

In [2]:
punct = set(string.punctuation.replace('\\','').replace('|','').replace("'",''))

pos_punct_info = open("data/processed/output_POS.txt", 'a')

#check avg sent size
pos_punct_info.write("book_name|total_words|avg_sentence_size|"
                     + "!|#|\"|%|$|&|(|)|+|*|-|,|/|.|;|:|=|<|?|>|"
                     + "@|[|]|_|^|`|{|}|~|neg|neu|pos|compound|"
                     + "Title|Author|CC|CD|DT|EX|FW|IN|JJ|JJR|JJS|"
                     + "LS|MD|NN|NNP|NNPS|NNS|PDT|PRP|PRP$|RB|RBR|" 
                     + "RBS|RP|VB|VBD|VBG|VBP|VBN|WDT|VBZ|WRB|WP$|WP|")
pos_punct_info.write('\n') 


1

In [3]:
def punct_and_words(character_list):
    """
    Iterate through all characters. Count periods, punctuation frequencies. 
    word_count = words in sentence (resets to zero after a period). 
    total_words is the book's total word count.
    """
    punctuation_dict = defaultdict(int)
    sentence_count = 0
    word_count = 0
    period_count = 0
    avg_sent_size = 0
    total_words = 0
    punct_count = 0
    
    #sentence count
    for i in range(1, len(character_list)):
        #if letter followed by space or punct, then word count +=1
        if ((character_list[i] == " " or str(character_list[i]) in punct) and 
            str(character_list[i-1]) in string.ascii_letters):
            total_words += 1
        #count periods 
        if character_list[i] == ".":
            period_count += 1
        if character_list[i] in punct:
            punct_count += 1
            punctuation_dict[character_list[i]] += 1

            
    avg_sent_size = (total_words/period_count)
    #put together output, bar delimited
    pos_punct_info.write(str(total_words) + "|")
    pos_punct_info.write(str(avg_sent_size) + "|")
    
    for p in punct:
        s = ""
        if p in punctuation_dict:
            s = s + str(punctuation_dict[p] / punct_count) + "|"    #ratio of punct that is [x]
        else:
            s = s + str(0) + "|"                                     #0 if unused
        pos_punct_info.write(s)


In [4]:
def get_sentiment(temp):
    temp = temp.replace('\n', '')
    temp = temp.replace('\r', '')
    # tokenize sentences 
    content = tokenize.sent_tokenize(temp)
    
    #get author and title now that content is split by sentence 
    sid = SentimentIntensityAnalyzer()
    booksent = []
    for sentence in content:
        ss = sid.polarity_scores(sentence)
        ssarray = [ss['neg'], ss['neu'], ss['pos'], ss['compound']]
        booksent.append(ssarray)
    valuearray = np.array(booksent)
    # mean negative, neutral, positive, compound score for all lines in book
    values = np.mean(valuearray, axis=0)
    return values, booksent

In [5]:
# book = 'James_DevicesAndDesires.txt'
# with open("data/interim/" + book, 'r') as f:
#     content = f.read().rstrip('\n')
# get_sentiment(content)

In [6]:
def get_author(book_title):
    book_list = {'Agatha Christie': ['AndThenThereWereNone', 
                                     'DestinationUnknown', 
                                     'ElephantsCanRemember'], 
                    'Iris Murdoch': ['TheSandcastle', 
                                     'TheBlackPrince', 
                                     'JacksonsDilemma'], 
                      'P.D. James': ['DevicesAndDesires', 
                                     'DeathComesToPemberley', 
                                     'CoverHerFace']
                }
    
    for author, books in book_list.items():
        if book_title in books:
            return author

In [7]:
def pos_tagging(content):
    parts = ["CC", "CD", "DT", "EX", "FW", "IN", "JJ", 
             "JJR", "JJS", "LS", "MD", "NN", "NNP", "NNPS", 
             "NNS", "PDT", "PRP", "PRP$", "RB", "RBR", 
             "RBS", "RP", "VB", "VBD", "VBG",  "VBP", 
             "VBN", "WDT", "VBZ", "WRB", "WP$", "WP" ]
    # tokenize first
    text = nltk.word_tokenize(content)  
    results = nltk.pos_tag(text)
    
    #dict of {POS: count}
    results_dict = defaultdict(int)
    counter = 0
    for tag in results:
        token = tag[0]
        pos = tag[1]
        counter += 1
        results_dict[pos] += 1

    #write to file
    for part_of_sp in parts:
        s = ""
        if part_of_sp in results_dict:
            #percent of POS 
            s = s + str(results_dict[part_of_sp]/float(counter)) + "|"    
        else:
            s = s + str(0) + "|"  #0 if unused                               
        pos_punct_info.write(s)

In [8]:
def preprocessing():
    '''
    read file as a list of words
    set lowercase, stem, remove stopwords???
    get punctuation string for later feature extraction
    save local wordcount dict???
    save global word dict after finished looping through docs???
    '''
    for book in os.listdir("data/interim"):
        book_file = str(book)
        book_name = re.sub(r'(James_|Murdoch_|Christie_|\.txt)*', '', book_file)
        title = re.sub("([a-z])([A-Z])","\g<1> \g<2>", book_name)
        pos_punct_info.write(book_name + "|")
        
        with open("data/interim/" + book_file, 'r') as f:
            content = f.read().rstrip('\n')
            
        punct_and_words(content)
        sentiment_values, _ = get_sentiment(content)
        neg = sentiment_values[0]
        neu = sentiment_values[1]
        pos = sentiment_values[2]
        compound = sentiment_values[3]
        pos_punct_info.write(str(neg) + "|" 
                             + str(neu) + "|" 
                             + str(pos) + "|" 
                             + str(compound) + "|")
        
        title = re.sub("([a-z])([A-Z])","\g<1> \g<2>", book_name)
        author = get_author(book_name)
        pos_punct_info.write(title + "|" + author + "|")
        pos_tagging(content)
        pos_punct_info.write('\n')
        print(f'Done processing: {title}')                                                 
        f.close()


In [9]:
# book_name = 'Test Book'
# pos_punct_info.write(book_name + "|")
preprocessing()
pos_punct_info.close()

Done processing Devices And Desires
Done processing The Sandcastle
Done processing Destination Unknown
Done processing Elephants Can Remember
Done processing Death Comes To Pemberley
Done processing Cover Her Face
Done processing Jacksons Dilemma
Done processing And Then There Were None
Done processing The Black Prince


In [11]:
book_info_df = pd.read_csv("data/processed/output_POS.txt", 
                           delimiter='|',
                           index_col=False,
                           quoting=3, 
                           encoding='utf-8')

print(book_info_df.shape)
book_info_df

(9, 71)


Unnamed: 0,book_name,total_words,avg_sentence_size,!,#,"""",%,$,&,(,...,VBD,VBG,VBP,VBN,WDT,VBZ,WRB,WP$,WP,Unnamed: 70
0,DevicesAndDesires,157206,14.981988,0.0,0.0,0,0.0,0,0.0,0.0,...,0.067373,0.015981,0.022169,0.022631,0.003464,0.012071,0.005285,9.4e-05,0.004886,
1,TheSandcastle,113888,13.67367,0.0,0.0,0,0.0,0,0.0,0.0,...,0.083745,0.018261,0.014774,0.020327,0.00534,0.007362,0.003926,0.000117,0.004087,
2,DestinationUnknown,60461,10.251102,0.0,0.0,0,0.0,0,0.0,0.0,...,0.062762,0.012496,0.030155,0.017879,0.002633,0.016213,0.004376,3.9e-05,0.006028,
3,ElephantsCanRemember,60024,11.163102,0.0,0.0,0,0.0,0,0.0,0.0,...,0.064873,0.010579,0.040799,0.017196,0.004067,0.015221,0.005008,5.2e-05,0.009167,
4,DeathComesToPemberley,90969,19.236414,0.0,0.0,0,0.0,0,0.0,0.0,...,0.067453,0.015016,0.01246,0.032502,0.005054,0.010069,0.005618,0.000185,0.00451,
5,CoverHerFace,77527,13.32079,0.000875,0.0,0,8.8e-05,0,0.0,0.0,...,0.073576,0.014477,0.015781,0.026823,0.003134,0.010517,0.004963,0.000145,0.005231,
6,JacksonsDilemma,88190,17.110982,0.0,0.0,0,0.0,0,0.0,0.0,...,0.076244,0.021844,0.019245,0.021162,0.002862,0.008986,0.006288,8.2e-05,0.005434,
7,AndThenThereWereNone,52467,8.81502,0.000209,7e-05,0,0.0,0,0.0,0.001113,...,0.073138,0.013511,0.017264,0.01884,0.001952,0.013285,0.003753,3e-05,0.005569,
8,TheBlackPrince,132907,12.994427,0.0,0.0,0,0.0,0,4.8e-05,0.0,...,0.057163,0.01747,0.03265,0.016863,0.003349,0.01471,0.005086,5.4e-05,0.004866,
