In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

%matplotlib inline
%config InlineBackend.feature_format = 'retina'

INFO:rdflib:RDFLib Version: 4.2.1


In [2]:
meta_df = pd.read_csv('gutenberg_metadata_en.csv', index_col = 'Unnamed: 0')

In [3]:
meta_df['subjects2'] = meta_df['subjects2'].map(lambda s: s.replace('[', '').replace(']', '').split(','))

In [4]:
meta_df['literature'] = meta_df['subjects2'].map(lambda sub: 1 if 'literature' in str(sub).lower().replace("'", '') else 0)
meta_df['fiction'] = meta_df['subjects2'].map(lambda sub: 1 if 'fiction' in str(sub).lower().replace("'", '') else 0)

In [5]:
lit_fiction = meta_df[(meta_df.literature == 1) | (meta_df.fiction == 1)].reset_index(drop = True)

In [6]:
from textstat.textstat import textstat

In [7]:
def dialogue_count(text):
    quotes = 0.0
    for char in text:
        if char == '"':
            quotes += 1
    quotes /= 2.0
    return quotes

In [8]:
def clean_sub_list(sub_list):
    return [sub.replace('"', '').replace("'", '').replace('(', '').replace(')', '').replace('.', '').strip().lower() 
            for sub in sub_list]

lit_fiction['subjects2'] = lit_fiction['subjects2'].map(clean_sub_list)

In [6]:
book_files = ['book_data_%d' %i for i in range(5, 24)]

In [24]:
%%time

import time
for i, books in zip(range(5, 24), book_files):
    print books
    texts = pd.read_csv(books, encoding = 'utf8', index_col = 'Unnamed: 0')
    texts.dropna(inplace=True)
    lit_fiction_df = pd.merge(lit_fiction, texts, on = 'id')
    lit_fiction_df['PZ'] = lit_fiction_df.LCC.map(lambda x: 1 if 'PZ' in x else 0)
    
    start = time.time()
    lit_fiction_df['dialogue'] = lit_fiction_df['text'].map(dialogue_count)
    print 'dialogue done', time.time() - start
    
    start = time.time()
    lit_fiction_df['sentence_count'] = lit_fiction_df.text.map(textstat.sentence_count)
    print 'sentence count done', time.time() - start
    
    lit_fiction_df['dialogue_per_sentence'] = lit_fiction_df.dialogue / lit_fiction_df.sentence_count
    
    start = time.time()
    lit_fiction_df['word_count'] = lit_fiction_df.text.map(textstat.lexicon_count)
    print 'word count done', time.time() - start
    
    start = time.time()
    lit_fiction_df['syllable_count'] = lit_fiction_df.text.map(textstat.syllable_count)
    print 'syllable count done', time.time() - start
    
    start = time.time()
    lit_fiction_df['character_count'] = lit_fiction_df.text.map(textstat.char_count)
    print 'character count done', time.time() - start
    
    lit_fiction_df['avg_sentence_length'] = lit_fiction_df['word_count'] / lit_fiction_df['sentence_count']
    
    lit_fiction_df['avg_syllables_per_word'] = lit_fiction_df['syllable_count'] / lit_fiction_df['word_count']
    
    lit_fiction_df['flesch_reading_ease'] = 206.835 - (1.015 * lit_fiction_df['avg_sentence_length']) - (84.6 * lit_fiction_df['avg_syllables_per_word'])
    
    lit_fiction_df['flesch_kincain_grade'] = (0.39 * lit_fiction_df['avg_sentence_length']) + (11.8 * lit_fiction_df['avg_syllables_per_word']) - 15.59
    
    lit_fiction_df['lit_LCC'] = lit_fiction_df['LCC'].map(lambda x: 1 if 'P' in x else 0)
    
    start = time.time()
    lit_fiction_df.to_csv('lit_fiction_%d' %i, encoding = 'utf8')
    print 'csv written', time.time() - start
    print '--------------------------------'

book_data_5
dialogue done 65.5569851398
sentence count done 755.665083885
word count done 740.307143927
syllable count done 862.200711012
character count done 2.19053697586
csv written 11.1352858543
--------------------------------
book_data_6
dialogue done 61.7364280224
sentence count done 731.738681793
word count done 711.680989027
syllable count done 836.206528187
character count done 2.11522698402
csv written 11.3417830467
--------------------------------
book_data_7
dialogue done 45.4811999798
sentence count done 556.469434023
word count done 545.373492002
syllable count done 637.062575102
character count done 1.58443689346
csv written 7.65705609322
--------------------------------
book_data_8
dialogue done 55.6863679886
sentence count done 672.79813385
word count done 656.378375053
syllable count done 771.27778101
character count done 1.94704818726
csv written 10.3010189533
--------------------------------
book_data_9
dialogue done 61.5568599701
sentence count done 713.954457045


In [2]:
meta_stats = pd.read_csv('lit_fiction_1', encoding = 'utf8', index_col = 'Unnamed: 0')

In [3]:
meta_stats.head(2)

Unnamed: 0,id,title,author,LCC,downloads,subjects,subjects2,formats,authoryearofbirth,authoryearofdeath,...,dialog_per_sentence,word_count,syllable_count,character_count,avg_sentence_length,avg_syllables_per_word,flesch_reading_ease,flesch_kincain_grade,lit_LCC,childrens_subjects
0,15,Moby Dick,"Melville, Herman",{PS},707,"{Ship captains -- Fiction, Whaling ships -- Fi...","[adventure stories, ahab, captain fictitious c...",{u'text/html': u'http://www.gutenberg.org/eboo...,1819.0,1891.0,...,0.0,1523,2428,8400,9.88961,1.594222,61.92587,7.078767,1,0
1,16,Peter Pan,"Barrie, J. M. (James Matthew)","{PZ, PR}",4778,"{Peter Pan (Fictitious character) -- Fiction, ...","[fairies, fantasy, fiction, juvenile fiction, ...",{u'text/plain; charset=utf-8': u'http://www.gu...,1860.0,1937.0,...,0.480354,47437,67117,213803,15.532744,1.414866,71.371599,7.163189,1,1


In [5]:
meta_stats.drop('text', axis = 1, inplace = True)

In [10]:
meta_stats.rename(columns = {'dialog_per_sentence':'dialogue_per_sentence'}, inplace = True)

In [12]:
meta_stats.columns

Index([                    u'id',                  u'title',
                       u'author',                    u'LCC',
                    u'downloads',               u'subjects',
                    u'subjects2',                u'formats',
            u'authoryearofbirth',      u'authoryearofdeath',
                         u'type',               u'language',
                   u'literature',                u'fiction',
                           u'PZ',            u'title_ascii',
                     u'dialogue',         u'sentence_count',
        u'dialogue_per_sentence',             u'word_count',
               u'syllable_count',        u'character_count',
          u'avg_sentence_length', u'avg_syllables_per_word',
          u'flesch_reading_ease',   u'flesch_kincain_grade',
                      u'lit_LCC',     u'childrens_subjects'],
      dtype='object')

In [172]:
df = pd.read_csv('lit_fiction_23', encoding = 'utf8', index_col = 'Unnamed: 0')

In [173]:
df.drop('text', axis = 1, inplace = True)

In [174]:
df.columns

Index([u'id', u'title', u'author', u'LCC', u'downloads', u'subjects',
       u'subjects2', u'formats', u'authoryearofbirth', u'authoryearofdeath',
       u'type', u'language', u'literature', u'fiction', u'PZ', u'dialogue',
       u'sentence_count', u'dialogue_per_sentence', u'word_count',
       u'syllable_count', u'character_count', u'avg_sentence_length',
       u'avg_syllables_per_word', u'flesch_reading_ease',
       u'flesch_kincain_grade', u'lit_LCC'],
      dtype='object')

In [175]:
for col in meta_stats.columns:
    if col not in df.columns:
        print col

childrens_subjects
title_ascii


In [176]:
def restrict_text(x):
    valid = 'abcdefghijklmnopqrstuvwxyz0123456789'
    return ''.join([ch for ch in x if ch.lower() in valid])

df['title_ascii'] = df.title.map(restrict_text)

In [177]:
meta_stats = meta_stats.append(df)

In [178]:
meta_stats.shape

(16035, 28)

In [180]:
meta_stats.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16035 entries, 0 to 644
Data columns (total 28 columns):
LCC                       16035 non-null object
PZ                        16035 non-null int64
author                    16035 non-null object
authoryearofbirth         13428 non-null float64
authoryearofdeath         13176 non-null float64
avg_sentence_length       16035 non-null float64
avg_syllables_per_word    16035 non-null float64
character_count           16035 non-null int64
childrens_subjects        1396 non-null float64
dialogue                  16035 non-null float64
dialogue_per_sentence     16035 non-null float64
downloads                 16035 non-null int64
fiction                   16035 non-null int64
flesch_kincain_grade      16035 non-null float64
flesch_reading_ease       16035 non-null float64
formats                   16035 non-null object
id                        16035 non-null int64
language                  16035 non-null object
lit_LCC                   

In [3]:
meta_stats['subjects2'] = meta_stats['subjects2'].map(lambda s: s.replace('[', '').replace(']', '').split(',').lower())

In [205]:
meta_stats['childrens_subjects'] = meta_stats['subjects2'].map(lambda x: 1 if 'childrens literature' in [s.lower() for s in x] 
                               or 'childrens periodicals' in x 
                               or 'childrens plays' in x 
                               or 'childrens poetry' in x
                               or 'childrens songs' in x
                               or 'childrens stories' in x
                               or 'christian literature for children' in x
                               or 'juvenile and popular literature' in x
                               or 'juvenile drama' in x
                               or 'juvenile fction' in x
                               or 'juvenile fiction' in x
                               or 'juvenile literature' in x
                               or 'juvenile poetry' in x
                               else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [206]:
meta_stats.head(2)

Unnamed: 0,id,title,title_ascii,author,authoryearofbirth,authoryearofdeath,downloads,subjects,subjects2,LCC,...,word_count,sentence_count,syllable_count,avg_sentence_length,avg_syllables_per_word,dialogue,dialogue_per_sentence,flesch_reading_ease,flesch_kincain_grade,formats
0,15,Moby Dick,MobyDick,"Melville, Herman",1819.0,1891.0,707,"{Ship captains -- Fiction, Whaling ships -- Fi...","[adventure stories, ahab, captain fictitious...",{PS},...,1523,154,2428,9.88961,1.594222,0.0,0.0,61.92587,7.078767,{u'text/html': u'http://www.gutenberg.org/eboo...
1,16,Peter Pan,PeterPan,"Barrie, J. M. (James Matthew)",1860.0,1937.0,4778,"{Peter Pan (Fictitious character) -- Fiction, ...","[fairies, fantasy, fiction, juvenile fictio...","{PZ, PR}",...,47437,3054,67117,15.532744,1.414866,1467.0,0.480354,71.371599,7.163189,{u'text/plain; charset=utf-8': u'http://www.gu...


In [188]:
meta_stats.columns

Index([                   u'LCC',                     u'PZ',
                       u'author',      u'authoryearofbirth',
            u'authoryearofdeath',    u'avg_sentence_length',
       u'avg_syllables_per_word',        u'character_count',
           u'childrens_subjects',               u'dialogue',
        u'dialogue_per_sentence',              u'downloads',
                      u'fiction',   u'flesch_kincain_grade',
          u'flesch_reading_ease',                u'formats',
                           u'id',               u'language',
                      u'lit_LCC',             u'literature',
               u'sentence_count',               u'subjects',
                    u'subjects2',         u'syllable_count',
                        u'title',            u'title_ascii',
                         u'type',             u'word_count'],
      dtype='object')

In [194]:
meta_stats = meta_stats[['id', 'title', 'title_ascii', 'author', 'authoryearofbirth', 'authoryearofdeath', 'downloads', 
 'subjects', 'subjects2', 'LCC', 'PZ', 'lit_LCC', 'childrens_subjects', 'fiction', 'literature',
'character_count', 'word_count', 'sentence_count', 'syllable_count', 'avg_sentence_length', 
'avg_syllables_per_word','dialogue', 'dialogue_per_sentence', 'flesch_reading_ease', 
     'flesch_kincain_grade', 'formats']]

In [208]:
meta_stats.reset_index(inplace=True)

In [209]:
meta_stats.to_csv('meta_stats_all_notext', encoding = 'utf8')