In [1]:
# Author: Alexander Maksiaev
# Purpose: Sentiment analysis of textbooks 

In [2]:
# Housekeeping

import os
import pandas as pd
import csv 
import docx
from english_words import get_english_words_set
import enchant

# Find real English words
web2lowerset = get_english_words_set(['web2'], lower=True)

# Locate the textbook directory (the working directory is not changed here)
textbook_dir = r"C:\Users\maksi\Documents\UVA\Research\DMP\Textbook_Dump_Autism"

# Keep only Word documents. os.listdir() returns every entry in the folder, which
# can include sub-folders, stray files, and the "~$name.docx" lock files Word
# creates while a document is open. It also returns entries in arbitrary order,
# so sort them to make the processing order (and the column order of any results
# built from this list) reproducible.
textbooks = sorted(
    name
    for name in os.listdir(textbook_dir)
    if name.lower().endswith(".docx") and not name.startswith("~$")
)

In [3]:
# Clean scored dictionary for use

dictionary_dir = r"C:\Users\maksi\Documents\UVA\Research\DMP\GitHub_DMP"

# The working directory is still switched, exactly as before.
os.chdir(dictionary_dir)

# os.chdir() returns None, so store the dictionary file's real path instead.
dictionary_location = os.path.join(dictionary_dir, "final_dictionary_3_19_2024.txt")

# Each kept row is the list of tab-separated fields from one line: field 0 is the
# word and field 1 is its score converted to float. Any further fields are kept
# unchanged. A non-numeric score still raises ValueError.
dictionary_clean = []
with open(dictionary_location, newline='') as dictionary:  # newline='' is what the csv module expects
    dictionary_reader = csv.reader(dictionary, delimiter='\t')
    for row in dictionary_reader:
        # Blank or one-column lines have no score field; skip them rather than
        # failing with an IndexError.
        if len(row) < 2:
            continue
        score_text = row[1].strip()
        # Only include words that have scores: an empty field and a zero in any
        # spelling ("0", "0.0", ...) both count as unscored.
        if not score_text:
            continue
        score = float(score_text)
        if score == 0:
            continue
        row[1] = score
        dictionary_clean.append(row)


In [4]:
# Go back to the textbook directory: getText() is called below with the bare file
# names returned by os.listdir(textbook_dir), so they only resolve from this folder.

os.chdir(textbook_dir)

# Function to get full text
def getText(filename):
    """Return the text of every paragraph in a Word document.

    Parameters
    ----------
    filename : value handed unchanged to ``docx.Document``.

    Returns
    -------
    list of str
        The ``text`` of each item in the document's ``paragraphs``, in the
        order they are yielded.
    """
    document = docx.Document(filename)
    return [paragraph.text for paragraph in document.paragraphs]

# Dictionary filled with text for all the books, minus the titles
title_text = {}
for book in textbooks:
    total_text = getText(book)
    # The first four paragraphs are skipped as title material.
    # Build a new list instead of calling remove() on the list being iterated:
    # removing during iteration skips the element that follows each removal, so
    # runs of empty paragraphs were only partly dropped. Whitespace-only
    # paragraphs are dropped too.
    text_without_title = [piece for piece in total_text[4:] if piece.strip()]
    title_text[book] = text_without_title
    

In [5]:
# Spot-check the extracted paragraphs of one textbook
print(title_text["Abn_Nolen-Hoeksema_05_Autism_v2.docx"])

['p. 29: People who take a biological approach traditionally have not accepted a continuum model of abnormality, viewing psychological disorders (such as schizophrenia) as either present or absent much the way they view medical or physical disorders (such as cancer). More recently, however, those who use biological approaches have begun to view many disorders as part of a spectrum (Cannon & Keller, 2006). Several disorders may have similar symptoms of varying intensity. For example, recent research on autism, a disorder characterized by problems in communication, social skills, and activities and interests, suggests that it may be part of a spectrum of disorders with some of the same symptoms, but which vary in severity. A disorder on the less severe end of that spectrum may be Asperger\'s syndrome, which is characterized by some of the same problems in social skills and activities as autism but not by severe communication difficulties. Thus, researchers now often speak of the "autism 

In [6]:
# Remove stop words from each book

stopwords_dir = r"C:\Users\maksi\Documents\UVA\Research\DMP\GitHub_DMP\Stop_Words"

os.chdir(stopwords_dir)

# Each line of the file becomes one stop-word entry (newline removed). The
# with-block closes the file as soon as it has been read; it was previously left
# open for the life of the kernel.
with open("stop_words_english_modified.txt", "r", encoding="utf-8") as f:
    stopwords = [text.replace('\n', '') for text in f]


# Raw string: "\," is an invalid escape sequence in a normal string literal, which
# newer Python versions warn about. The characters in the string are unchanged.
punc = r'''!()[]{};:'-"\,<>./?@#$%^&*_~''' # Must include "-" in words... or not?

# Function to clean up text and remove stopwords
def clean(book):
    """Tokenize one textbook and drop punctuation, stop words and empty tokens.

    Parameters
    ----------
    book : key into the module-level ``title_text`` dictionary, whose value is
        a list of text strings.

    Returns
    -------
    list of str
        Lower-cased, space-separated tokens in their original order, with every
        character of ``punc`` removed and with stop words (entries of
        ``stopwords``) and empty strings left out.
    """
    # Built once per call instead of once per word
    strip_punctuation = str.maketrans('', '', punc)
    stopword_set = set(stopwords)  # constant-time membership tests

    cleaned_words = []
    for text in title_text[book]:
        for word in text.strip().lower().split(' '):
            word = word.translate(strip_punctuation)
            # Tokens that were pure punctuation, or that came from repeated
            # spaces, are empty at this point. Filtering here drops every one of
            # them; the former remove()-while-iterating pass skipped the element
            # after each removal and left some blanks in the result.
            if word and word not in stopword_set:
                cleaned_words.append(word)

    return cleaned_words

In [7]:
# Map every book to its cleaned, stop-word-free list of words

clean_texts = {book: clean(book) for book in title_text}

print(clean_texts["Abn_Nolen-Hoeksema_05_Autism_v2.docx"])

['29', 'people', 'biological', 'approach', 'traditionally', 'accepted', 'continuum', 'model', 'abnormality', 'viewing', 'psychological', 'disorders', 'schizophrenia', 'absent', 'view', 'medical', 'physical', 'disorders', 'cancer', 'biological', 'approaches', 'begun', 'view', 'disorders', 'spectrum', 'cannon', 'keller', '2006', 'disorders', 'symptoms', 'varying', 'intensity', 'disorder', 'characterized', 'problems', 'communication', 'social', 'skills', 'activities', 'interests', 'suggests', 'spectrum', 'disorders', 'symptoms', 'vary', 'severity', 'disorder', 'severe', 'spectrum', 'syndrome', 'characterized', 'problems', 'social', 'skills', 'activities', 'severe', 'communication', 'difficulties', 'researchers', 'speak', 'spectrum', 'disorders', 'sigman', 'spence', 'wang', '2006', 'disorders', 'spectrum', 'qualitatively', 'normal', 'functioning', '297', 'dr', 'temple', 'grandin', 'professor', 'animal', 'sciences', 'colorado', 'university', 'designed', 'onethird', 'livestockhandling', 'fac

In [8]:
# Function to find sentiment score of the cleaned textbooks

def get_sentiment(clean_textbook):
    """Return the average dictionary score per word of a cleaned textbook.

    Parameters
    ----------
    clean_textbook : list of word strings.

    Returns
    -------
    float
        Sum of the scores of every word found in the module-level
        ``dictionary_clean``, divided by the total number of words (words
        without a score count towards the length). 0.0 for an empty textbook.
    """
    # An empty textbook used to raise ZeroDivisionError; give it a neutral score.
    if len(clean_textbook) == 0:
        return 0.0

    # Index the dictionary once so each word is a single lookup instead of a scan
    # of every entry. Scores of repeated entries are added together, which is
    # what comparing against each entry in turn produced.
    score_by_word = {}
    for entry in dictionary_clean:
        score_by_word[entry[0]] = score_by_word.get(entry[0], 0) + entry[1]

    total_score = 0
    for piece in clean_textbook:
        total_score += score_by_word.get(piece, 0)  # unscored words add nothing
    return total_score / len(clean_textbook)

In [9]:
# Spot-check the length-normalized score of one textbook
sample_score = get_sentiment(clean_texts["Abn_Nolen-Hoeksema_05_Autism_v2.docx"])
print(sample_score)

0.18750137163516206


In [10]:
# Find raw sentiments and see if length matters

def get_raw_sentiment(clean_textbook):
    """Return the total dictionary score of a cleaned textbook, not divided by length.

    Parameters
    ----------
    clean_textbook : iterable of word strings.

    Returns
    -------
    int or float
        Sum of the scores of every word found in the module-level
        ``dictionary_clean``; 0 when no word matches or the input is empty.
    """
    # Index the dictionary once so each word is a single lookup instead of a scan
    # of every entry. Scores of repeated entries are added together, which is
    # what comparing against each entry in turn produced.
    score_by_word = {}
    for entry in dictionary_clean:
        score_by_word[entry[0]] = score_by_word.get(entry[0], 0) + entry[1]

    return sum(score_by_word.get(piece, 0) for piece in clean_textbook)

# Score every textbook and save the results

# Results folder; the CSV below is written relative to it
dmp_dir = r"C:\Users\maksi\Documents\UVA\Research\DMP\GitHub_DMP\Results"
os.chdir(dmp_dir)

print(textbooks)

# Textbook file name -> length-normalized sentiment score
sentiments = {textbook: get_sentiment(clean_texts[textbook]) for textbook in textbooks}

# A single row (index 0) with one column per textbook
all_sentiments = pd.DataFrame(sentiments, index = [0])
all_sentiments.to_csv('sentiments_3_19_2024_autism.csv', encoding='utf-8')

['Abn_Barlow_04_Autism_v2.docx', 'Abn_Barlow_05_Autism_v2.docx', 'Abn_Barlow_06_Autism_v2.docx', 'Abn_Barlow_07_Autism.docx', 'Abn_Barlow_08_Autism.docx', 'Abn_Brown_01_Autism_v2.docx', 'Abn_Brown_02_Autism_v2.docx', 'Abn_Brown_03_Autism_v2.docx', 'Abn_Brown_04_Autism_v2.docx', 'Abn_Brown_05_Autism.docx', 'Abn_Comer_05_Autism_v2.docx', 'Abn_Comer_06_Autism_v2.docx', 'Abn_Comer_07_Autism_v2.docx', 'Abn_Comer_08_Autism_v2.docx', 'Abn_Comer_09_Autism.docx', 'Abn_Hooley_13_Autism_v2.docx', 'Abn_Hooley_14_Autism_v2.docx', 'Abn_Hooley_15_Autism_v2.docx', 'Abn_Hooley_16_Autism_v2.docx', 'Abn_Hooley_17_Autism.docx', 'Abn_Kearney_06_Autism.docx', 'Abn_Mash_02_Autism_v2.docx', 'Abn_Mash_03_Autism_v2.docx', 'Abn_Mash_04_Autism_v2.docx', 'Abn_Mash_05_Autism_v2.docx', 'Abn_Mash_06_Autism.docx', 'Abn_Nevid_06_Autism_v2.docx', 'Abn_Nevid_07_Autism.docx', 'Abn_Nevid_08_Autism_v2.docx', 'Abn_Nevid_09_Autism_v2.docx', 'Abn_Nevid_10_Autism.docx', 'Abn_Nolen-Hoeksema_03_Autism_v2.docx', 'Abn_Nolen-Hoeksem