In [None]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine as ce
from sqlalchemy import inspect
from pathlib import Path

# Part 0: Loading the files

In [None]:
# Path of the SQLite database file; it is relative, so it resolves against the
# directory the notebook is run from.
DATA = Path("books.db")

In [None]:
# SQLAlchemy engine pointing at the SQLite file named by DATA
engine = ce(f"sqlite:///{DATA}")

In [None]:
# Inspector bound to the engine, used to look at the database schema.
inspector = inspect(engine)

In [None]:
# Print the names of the tables found in the database.
print(inspector.get_table_names())

['authors', 'book_file', 'book_original', 'books', 'text_files']


In [None]:
# Read each table of interest into its own DataFrame, in a single pass over
# the table names (order of names matches order of the targets).
books_df, author_df, book_file_df, text_file_df = (
    pd.read_sql(table_name, con=engine)
    for table_name in ("books", "authors", "book_file", "text_files")
)

# Part 1: Convert book files into sentences

In [None]:
# Lower-case every book and break it into its non-empty lines
def _lowercase_lines(book_text):
    """Return the non-empty, newline-separated pieces of the lower-cased text."""
    return [line for line in book_text.lower().split("\n") if line]


filtered_text = pd.DataFrame({'text': text_file_df['text'].apply(_lowercase_lines)})

In [None]:
# Gendered words, kept per gender; the combined list is derived from the two
# so the three lists cannot drift apart.
woman_lst = ['woman', 'women', 'female', 'she', 'her', 'hers']
man_lst = ['man', 'men', 'male', 'he', 'his', 'him']
gender_lst = woman_lst + man_lst

In [None]:
# Keep only the lines containing at least one gendered word. Matching is on
# whitespace-separated tokens, so a word with punctuation attached ("him,")
# does not count as a match.
gender_vocab = set(gender_lst)


def _mentions_gender(line):
    """True when any whitespace-separated token of the line is a gendered word."""
    return not gender_vocab.isdisjoint(line.split())


filtered_text['text'] = filtered_text['text'].apply(
    lambda lines: [line for line in lines if _mentions_gender(line)]
)

In [None]:
# Total number of retained lines across all books
sentence_count = sum(len(lines) for lines in filtered_text['text'])

In [None]:
# Display the current value of the running total.
sentence_count

2005472

In [None]:
# Fetch the 'wordnet' NLTK data package, then import the lemmatizer class.
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jiemintang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Fetch the 'punkt' NLTK data package.
# NOTE(review): no cell visible here uses it — confirm it is still needed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jiemintang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Lemmatizer instance.
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
wnl = WordNetLemmatizer()

In [None]:
# Vocabulary for each topic category; a line is tagged with a category when
# one of these words appears in it as a token.
home_category = [
    'home', 'homes', 'domestic', 'household', 'households',
    'chore', 'chores', 'family', 'families',
]
work_category = [
    'work', 'works', 'worked', 'labor', 'worker', 'workers',
    'economy', 'trade', 'trades',
    'business', 'businesses', 'job', 'jobs',
    'company', 'companies', 'industry', 'industries',
    'pay', 'pays', 'paid', 'working',
    'salary', 'salaries', 'wage', 'wages',
]
achievement_category = [
    'power', 'authority', 'authorities', 'achievement',
    'control', 'controls', 'controlled',
    'won', 'win', 'wins', 'powerful',
    'success', 'succeed', 'succeeded', 'successful',
    'better', 'effort', 'efforts',
    'plan', 'plans', 'planned',
    'try', 'tries', 'tried', 'leader',
]

In [None]:
# Keep only the lines that contain a word from at least one of the three
# categories (matched on whitespace-separated tokens).
category_vocab = set(home_category) | set(work_category) | set(achievement_category)

filtered_text['text'] = filtered_text['text'].apply(
    lambda lines: [
        line for line in lines
        if not category_vocab.isdisjoint(line.split())
    ]
)

In [None]:
# Recount the retained lines across all books
sentence_count = sum(len(lines) for lines in filtered_text['text'])

In [None]:
# Display the current value of the running total.
sentence_count

67701

In [None]:
# Empty frame with the three output columns: the line of text, its gender
# label, and the categories matched in it.
sentences_df = pd.DataFrame(columns = ['text', 'gender', 'category'])

In [None]:
# Building the sentences with labels
#
# Every retained line becomes one record holding the line itself, a gender
# label ('woman' or 'man') and the list of categories whose words occur in it
# (in the order home, work, achievement). Lines mentioning both genders, or
# neither, are left out.
woman_vocab = set(woman_lst)
man_vocab = set(man_lst)
vocab_by_category = {
    'home': set(home_category),
    'work': set(work_category),
    'achievement': set(achievement_category),
}

labelled_records = []
for lines in filtered_text['text']:
    for line in lines:
        # Tokenise once per line; all membership tests below reuse the set.
        tokens = set(line.split())
        mentions_woman = not woman_vocab.isdisjoint(tokens)
        mentions_man = not man_vocab.isdisjoint(tokens)

        # Ambiguous (both genders) or unlabelled (neither) line: skip this
        # line only. A `break` here would silently discard every remaining
        # line of the same book.
        if mentions_woman == mentions_man:
            continue

        labelled_records.append({
            'text': line,
            'gender': 'woman' if mentions_woman else 'man',
            'category': [
                category
                for category, vocab in vocab_by_category.items()
                if not vocab.isdisjoint(tokens)
            ],
        })

# Build the frame once from the collected records: DataFrame.append copied the
# whole frame on every call and no longer exists in pandas 2.x. Rebuilding from
# scratch also keeps the cell idempotent when it is re-run.
sentences_df = pd.DataFrame(labelled_records, columns=['text', 'gender', 'category'])

In [None]:
# Preview the first ten labelled rows.
sentences_df.head(10)

Unnamed: 0,text,gender,category
0,till with that wealth he pays his soldiery.,man,[work]
1,"better to die than leave his banneret.""",man,[achievement]
2,"yet which might win they knew not, in his thou...",man,[achievement]
3,"then find him men, his company are worth.""",man,[work]
4,"at the sword's point he yet shall pay our meed.""",man,[work]
5,no man on earth has more nor better found.,man,[achievement]
6,better he loves murder and treachery,man,[achievement]
7,"soon as rollant his senses won and knew,",man,[achievement]
8,"i won for him with thee anjou, bretaigne,",man,[achievement]
9,"and won for him with thee peitou, the maine,",man,[achievement]


In [None]:
# Per-gender count of non-null values in each of the other columns.
sentences_df.groupby('gender').count()

Unnamed: 0_level_0,text,category
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
man,36030,36030
woman,10024,10024


In [None]:
# Write the labelled rows to disk without the index column.
# NOTE(review): 'category' holds Python lists, which presumably end up as their
# string form in the CSV — confirm that readers parse them back.
sentences_df.to_csv('cleaned_data.csv', index=False)

# Extra processing - Not Used

In [None]:
# Authors whose recorded death year is earlier than 1980
died_before_1980 = author_df['death'].lt(1980)
pre1980_author_df = author_df[died_before_1980]

In [None]:
# Authors born in 1950 or later.
# Authors such as Aesop, Homer and Plato carry a birth year of 10000 in this
# table — presumably a placeholder for "unknown" (confirm against the data
# source). A plain `born >= 1950` test selects exactly those rows, so the
# placeholder is excluded explicitly.
UNKNOWN_YEAR = 10000
birth_year = author_df['born']
post1980_author_df = author_df[(birth_year >= 1950) & (birth_year != UNKNOWN_YEAR)]

In [None]:
# Display the selected authors.
post1980_author_df

Unnamed: 0,index,author,born,death
3,3,Aesop,10000,10000
13,13,Anatole France,10000,10000
92,92,Frederick Engles,10000,10000
95,95,Friedrich Wieser,10000,10000
96,96,Fyodor Dostoevsky,10000,10000
129,129,Homer,10000,10000
140,140,Jacob Vanderlint,10000,1740
200,200,Nan,10000,10000
212,212,P. Cornelius Tacitus,10000,10000
216,216,Plato,10000,10000
