In [1]:
# Author: Alexander Maksiaev
# Purpose: Sentiment analysis of textbooks using Sentiwords



# SentiWords 1.1 Dictionary 
# Dictionary Source:
# Gatti, Lorenzo, Marco Guerini, and Marco Turchi. 
# "SentiWords: Deriving a high precision and high coverage lexicon for sentiment analysis." 
# IEEE Transactions on Affective Computing 7.4 (2016): 409-421.

In [2]:
# Housekeeping

import os
import pandas as pd
import csv 
import docx
from english_words import get_english_words_set
import enchant
import inflect
import nltk
from nltk.corpus import stopwords
from nltk.corpus import sentiwordnet as swn
import numpy as np
# nltk.download("sentiwordnet")
# nltk.download("stopwords")
# nltk.download("punkt")
# nltk.download("averaged_perceptron_tagger")
# nltk.download("wordnet")

web2lowerset = get_english_words_set(['web2'], lower=True)

os.getcwd()

textbook_dir = r"C:\Users\maksi\Documents\UVA\Research\DMP\Textbook_Dump"

textbooks = os.listdir(textbook_dir)

In [3]:
os.chdir(textbook_dir)

# Function to get full text
def getText(filename):
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
    return fullText

# Dictionary filled with text for all the books, minus the titles
title_text = {}
for book in textbooks:
    total_text = getText(book)
    text_without_title = total_text[4:]
    for piece in text_without_title:
        if piece == '':
            text_without_title.remove(piece) # Does not get rid of all whitespace, but ah well.
#     text_without_title.remove('')
    title_text[book] = text_without_title

In [4]:
# print(title_text["Abn_Nolen-Hoeksema_05_Autism_v2.docx"])

In [5]:
# Remove stop words from each book

stopwords_dir = r"C:\Users\maksi\Documents\UVA\Research\DMP\Stop_Words" 

os.chdir(stopwords_dir)

f = open("stop_words_english_original.txt", "r", encoding="utf-8")
stopwords = []
for text in f:
    text = text.replace('\n', '')
    stopwords.append(text)


punc = '''!()[]{};:'-"\,<>./?@#$%^&*_~''' # Must include "-" in words... or not?

# Function to clean up text and remove stopwords
def clean(book):
    text_list = title_text[book]
    new_text_list = []
    
    # Clean up text
    for text in text_list:
        text = text.strip()
        text = text.lower()
        text = text.split(' ')
        new_text_list.append(text)


    newer_text_list = []
    
    # Remove punctuation
    for sentence in new_text_list:
        for word in sentence:
            for char in punc:
                if char in word:
                    word = word.replace(char, '')
            newer_text_list.append(word)
                
    # Remove stop words
    newest_text_list = []
    for words in newer_text_list:
        if words not in stopwords:
            newest_text_list.append(words)

    # Remove blanks
    for w in newest_text_list:
        if len(w) == 0:
            newest_text_list.remove(w)
            
    return newest_text_list

In [6]:
# Update all the books with their clean, stopword-less counterparts

clean_texts = {}
for book in title_text:
    newest_text_list = clean(book)
    clean_texts[book] = newest_text_list
    


In [7]:
# print(clean_texts["Abn_Nolen-Hoeksema_05_Autism_v2.docx"])

In [8]:
nltk_to_sentiwordnet = {
    "NN": "n",
    "VB": "v",
    "JJ": "a",
    "RB": "r",
}

def get_sentiment(article):
    
#     sentences = nltk.sent_tokenize(article)
#     sentence_words = [nltk.word_tokenize(sentence) for sentence in sentences]
#     tagged_sentence_words = flatten(nltk.pos_tag_sents(sentence_words))
    
    word_sum = len(article)
    
    tagged_sentence_words = nltk.pos_tag(article)
    
    # Already did this
#     # Filter stopwords
#     tagged_sentence_words = [word for word in tagged_sentence_words if word[1] not in english_stopwords]
    
    pos_scores = []
    neg_scores = []
    subj_scores = []

    for word, pos in tagged_sentence_words:
        
        swn_pos = nltk_to_sentiwordnet.get(pos[:2], None)
        
        synsets = list(swn.senti_synsets(word.lower(), pos=swn_pos))

        
#         print(word)
        
#         if word == "autism" or word == "autistic" or word == "asperger's":
#             word_sum -= 1
#             swn_pos = None
#             synsets = 0
    
        if swn_pos == None:
            continue
    
            
        if len(synsets) == 0:
            continue
    
        #print("{}:".format(word))
        for synset in synsets[:1]:
            pos_scores.append(synset.pos_score())
            neg_scores.append(synset.neg_score())
            subj_scores.append(1 - synset.obj_score())
            
    # We only care about positive and negative scores
    positives = sum(pos_scores)
    negatives = sum(neg_scores)
            
        
    # Divide by length of text
    balanced_sentiment = (positives - negatives) 
    
    # Scale sentiment to the +4 -4 dichotomy 
    scaled_sentiment = balanced_sentiment * 4
    
    final_sentiment = scaled_sentiment / word_sum
        
#     pos_score = np.average(pos_scores, weights=subj_scores)
#     neg_score = np.average(neg_scores, weights=subj_scores)
#     neut_score = np.mean(subj_scores)
    
    
    
    
    
    return final_sentiment
    

In [9]:
get_sentiment(clean_texts["Abn_Nolen-Hoeksema_05_Autism_v2.docx"])

# Balance out negative and positive scores (positive, negative, objective)

-0.06820652173913043

In [81]:
# Now, find sentiment score of all of the books

dmp_dir = r"C:\Users\maksi\Documents\UVA\Research\DMP"

os.chdir(dmp_dir)

print(textbooks)

sentiments = {}
for textbook in textbooks:
    book = clean_texts[textbook]
    sentiment = get_sentiment(book)
    sentiments[textbook] = sentiment
    
all_sentiments = pd.DataFrame(sentiments, index = [0])

all_sentiments.to_csv('sentiments_sentiwords_1_18_2023.csv', encoding='utf-8')

['Abn_Barlow_04_Autism_v2.docx', 'Abn_Barlow_05_Autism_v2.docx', 'Abn_Barlow_06_Autism_v2.docx', 'Abn_Barlow_07_Autism.docx', 'Abn_Barlow_08_Autism.docx', 'Abn_Brown_01_Autism_v2.docx', 'Abn_Brown_02_Autism_v2.docx', 'Abn_Brown_03_Autism_v2.docx', 'Abn_Brown_04_Autism_v2.docx', 'Abn_Brown_05_Autism.docx', 'Abn_Comer_05_Autism_v2.docx', 'Abn_Comer_06_Autism_v2.docx', 'Abn_Comer_07_Autism_v2.docx', 'Abn_Comer_08_Autism_v2.docx', 'Abn_Comer_09_Autism.docx', 'Abn_Hooley_13_Autism_v2.docx', 'Abn_Hooley_14_Autism_v2.docx', 'Abn_Hooley_15_Autism_v2.docx', 'Abn_Hooley_16_Autism_v2.docx', 'Abn_Hooley_17_Autism.docx', 'Abn_Kearney_06_Autism.docx', 'Abn_Mash_02_Autism_v2.docx', 'Abn_Mash_03_Autism_v2.docx', 'Abn_Mash_04_Autism_v2.docx', 'Abn_Mash_05_Autism_v2.docx', 'Abn_Mash_06_Autism.docx', 'Abn_Nevid_06_Autism_v2.docx', 'Abn_Nevid_07_Autism.docx', 'Abn_Nevid_08_Autism_v2.docx', 'Abn_Nevid_09_Autism_v2.docx', 'Abn_Nevid_10_Autism.docx', 'Abn_Nolen-Hoeksema_03_Autism_v2.docx', 'Abn_Nolen-Hoeksem