In [1]:
# Author: Alexander Maksiaev
# Purpose: Sentiment analysis of textbooks 

In [2]:
# Housekeeping

import os
import pandas as pd
import csv 
import docx
from english_words import get_english_words_set
import enchant
import inflect

# Find real English words
# NOTE(review): web2lowerset is not referenced again in the visible cells - confirm it is still needed.
web2lowerset = get_english_words_set(['web2'], lower=True)

# Folder that holds the textbook excerpts (one Word document per book)
textbook_dir = r"C:\Users\maksi\Documents\UVA\Research\DMP\Textbook_Dump_Transgender"

# os.listdir() returns every entry in arbitrary order. Keep only Word documents,
# skip the "~$..." lock files Word creates while a document is open, and sort so
# that every run processes the books in the same order.
textbooks = sorted(
    name for name in os.listdir(textbook_dir)
    if name.lower().endswith(".docx") and not name.startswith("~$")
)

In [3]:
# Clean scored dictionary for use

dictionary_dir = r"C:\Users\maksi\Documents\UVA\Research\DMP\GitHub_DMP"

# Full path of the tab-separated dictionary file. os.chdir() returns None, so
# assigning its result never stored a location; building the path explicitly
# also avoids depending on the current working directory.
dictionary_location = os.path.join(dictionary_dir, "final_dictionary.txt")

# Rows of the dictionary whose score is not "0"; column 1 is converted to float.
dictionary_clean = []
# newline='' lets the csv module do its own line-ending handling.
with open(dictionary_location, newline='') as dictionary:
    dictionary_reader = csv.reader(dictionary, delimiter='\t')
    for row in dictionary_reader:
        # Blank or truncated lines have no score column; skip them instead of
        # failing with IndexError.
        if len(row) < 2:
            continue
        # Only include words that have scores
        if row[1] == "0":
            continue
        row[1] = float(row[1])
        dictionary_clean.append(row)


In [4]:
# Return to the textbook folder so bare file names resolve against it
os.chdir(textbook_dir)


# Function to get full text
def getText(filename):
    """Return the text of each paragraph of the Word document `filename`, in order."""
    return [paragraph.text for paragraph in docx.Document(filename).paragraphs]

# Dictionary filled with text for all the books, minus the titles
TITLE_PARAGRAPH_COUNT = 4  # number of leading paragraphs dropped as the title block

title_text = {}
for book in textbooks:
    # Use the full path so reading does not depend on the current working directory.
    total_text = getText(os.path.join(textbook_dir, book))
    # Build a new list instead of calling .remove() on the list being iterated:
    # removing during iteration skips the element after each removal, which is
    # why runs of empty paragraphs used to survive. Whitespace-only paragraphs
    # carry no words either, so they are dropped as well.
    title_text[book] = [
        piece for piece in total_text[TITLE_PARAGRAPH_COUNT:] if piece.strip()
    ]
    

In [5]:
# Remove stop words from each book

stopwords_dir = r"C:\Users\maksi\Documents\UVA\Research\DMP\GitHub_DMP\Stop_Words" 

# Read one stop word per line. The `with` block closes the file (the handle was
# previously left open), and the explicit path removes the need to change the
# working directory first.
with open(os.path.join(stopwords_dir, "stop_words_english_modified.txt"), "r", encoding="utf-8") as f:
    stopwords = [line.replace('\n', '') for line in f]


# Raw string: "\," is not a valid escape sequence, so the non-raw literal only
# worked by accident and triggers a warning on newer Python versions. The
# characters in the string are unchanged (the backslash and the comma both stay).
punc = r'''!()[]{};:'-"\,<>./?@#$%^&*_~''' # Must include "-" in words... or not?

# Function to clean up text and remove stopwords
def clean(book):
    """Return the cleaned word list for one book.

    Reads the book's paragraphs from the module-level ``title_text`` mapping,
    lower-cases them, splits them into words, deletes every character listed in
    ``punc`` from each word, and drops empty strings and any word found in the
    module-level ``stopwords`` collection.

    Parameters
    ----------
    book : key of ``title_text`` identifying the book to clean.

    Returns
    -------
    list of str
        The remaining words, in their original order.

    Raises
    ------
    KeyError
        If ``book`` is not a key of ``title_text``.
    """
    # One translation table deletes all punctuation characters in a single pass.
    strip_punctuation = str.maketrans('', '', punc)
    # Set membership is constant-time; scanning the stop-word list per word is not.
    stop_set = set(stopwords)

    cleaned_words = []
    for paragraph in title_text[book]:
        # split() with no argument splits on any run of whitespace (tabs,
        # non-breaking spaces, repeated spaces), so words separated by something
        # other than a single space are no longer glued together.
        for token in paragraph.lower().split():
            word = token.translate(strip_punctuation)
            # Stop words are compared after punctuation removal. Tokens that were
            # pure punctuation are now empty and are dropped here; the earlier
            # remove-while-iterating loop left some of them behind.
            if word and word not in stop_set:
                cleaned_words.append(word)
    return cleaned_words

In [6]:
# Map every book to its cleaned, stop-word-free word list

clean_texts = {title: clean(title) for title in title_text}

print(clean_texts["Abn_Nolen-Hoeksema_05_Transgender.docx"])

['417', 'dsmivtr', 'criteria', 'diagnosis', 'gender', 'identity', 'disorder', 'people', 'gender', 'identity', 'disorder', 'born', 'wrong', 'sexs', 'genitals', 'members', 'sex', 'strong', 'persistent', 'identification', 'sex', 'children', 'manifested', '1', 'repeatedly', 'stated', 'desire', 'insistence', 'sex', '2', 'boys', 'preference', 'crossdressing', 'simulating', 'female', 'attire', 'girls', 'insistence', 'wearing', 'stereotypic', 'masculine', 'clothing', '3', 'strong', 'persistent', 'preferences', 'crosssex', 'roles', 'play', 'fantasies', '4', 'intense', 'desire', 'participate', 'stereotypic', 'games', 'pastimes', 'sex', '5', 'strong', 'preference', 'playmates', 'sex', 'adolescents', 'adults', 'identification', 'sex', 'manifested', 'symptoms', 'stated', 'desire', 'sex', 'frequently', 'passing', 'sex', 'desire', 'live', 'treated', 'sex', 'conviction', 'typical', 'feelings', 'reactions', 'sex', 'persistent', 'discomfort', 'sex', 'sense', 'inappropriateness', 'gender', 'role', 'sex',

In [7]:
# Function to find sentiment score of the cleaned textbooks

def get_sentiment(clean_textbook):
    """Return the average dictionary score per word of a cleaned textbook.

    Every word of ``clean_textbook`` that equals the first field of an entry in
    the module-level ``dictionary_clean`` adds that entry's second field to a
    running total (once per matching entry). The total is divided by the number
    of words in ``clean_textbook``.

    Parameters
    ----------
    clean_textbook : list of str
        Cleaned words of one textbook.

    Returns
    -------
    float
        Total score divided by the word count; 0.0 when ``clean_textbook`` is
        empty (this previously raised ZeroDivisionError).
    """
    if not clean_textbook:
        # No words means there is nothing to average.
        return 0.0

    # Index the dictionary once so each word is a single lookup instead of a
    # scan over every entry. Scores are kept in a list per word so that repeated
    # dictionary entries still each contribute, in their original order.
    scores_by_word = {}
    for entry in dictionary_clean:
        scores_by_word.setdefault(entry[0], []).append(entry[1])

    total_score = 0
    for piece in clean_textbook:
        for score in scores_by_word.get(piece, ()):
            total_score += score
    return total_score / len(clean_textbook)

In [8]:
# Spot-check the score of a single book
sample_sentiment = get_sentiment(clean_texts["Abn_Nolen-Hoeksema_05_Transgender.docx"])
print(sample_sentiment)

0.11996583481877626


In [9]:
# Find raw sentiments and see if length matters

def get_raw_sentiment(clean_textbook):
    """Return the summed dictionary score of a cleaned textbook, not length-normalised.

    Every word of ``clean_textbook`` that equals the first field of an entry in
    the module-level ``dictionary_clean`` adds that entry's second field to the
    total (once per matching entry).

    Parameters
    ----------
    clean_textbook : list of str
        Cleaned words of one textbook.

    Returns
    -------
    int or float
        The accumulated score; 0 when no word matches or the list is empty.
    """
    # Index the dictionary once so each word is a single lookup instead of a
    # scan over every entry. Scores are kept in a list per word so that repeated
    # dictionary entries still each contribute, in their original order.
    scores_by_word = {}
    for entry in dictionary_clean:
        scores_by_word.setdefault(entry[0], []).append(entry[1])

    total_score = 0
    for piece in clean_textbook:
        for score in scores_by_word.get(piece, ()):
            total_score += score
    return total_score

# Score every book and save the results

dmp_dir = r"C:\Users\maksi\Documents\UVA\Research\DMP\GitHub_DMP\Results"

# The CSV below is written relative to the results folder
os.chdir(dmp_dir)

# Book file name -> sentiment score returned by get_sentiment
sentiments = {title: get_sentiment(clean_texts[title]) for title in textbooks}

# Single-row table with one column per book
all_sentiments = pd.DataFrame(sentiments, index = [0])

all_sentiments.to_csv('sentiments_02_17_2024_transgender.csv', encoding='utf-8')