In [237]:
import re, os, sys, shutil
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import warnings
warnings.filterwarnings('ignore')

In [238]:
df = pd.read_csv('clean_all.csv')
df.head(5)

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type
0,PG0,,,,,,,set(),Text
1,PG10000,The Magna Carta,Anonymous,,,['en'],167.0,"{'Magna Carta', 'Constitutional history -- Eng...",Text
2,PG10001,Apocolocyntosis,"Seneca, Lucius Annaeus",,65.0,['en'],212.0,"{'Claudius, Emperor of Rome, 10 B.C.-54 A.D. -...",Text
3,PG10002,The House on the Borderland,"Hodgson, William Hope",1877.0,1918.0,['en'],599.0,{'Science fiction'},Text
4,PG10003,"My First Years as a Frenchwoman, 1876-1879","Waddington, Mary King",1833.0,1923.0,['en'],16.0,"{'France -- History -- Third Republic, 1870-19...",Text


In [239]:
df.shape

(59220, 9)

In [240]:
# remove nulls
df = df.dropna(how='any',axis=0) 
df.shape

(42769, 9)

In [193]:
# remove none english
df = df[df.language == '[\'en\']']
df.shape

(33752, 9)

In [194]:
df.language.value_counts()

['en']    33752
Name: language, dtype: int64

In [195]:
# remove none text
df = df[df.type != 'text']
df.shape

(33752, 9)

In [196]:
# sort by downloads
df.sort_values("downloads", inplace = True, ascending=False) 
df.head(5)

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type
3810,PG1342,Pride and Prejudice,"Austen, Jane",1775.0,1817.0,['en'],47860.0,"{'England -- Fiction', 'Young women -- Fiction...",Text
57551,PG84,"Frankenstein; Or, The Modern Prometheus","Shelley, Mary Wollstonecraft",1797.0,1851.0,['en'],25433.0,"{'Science fiction', 'Monsters -- Fiction', ""Fr...",Text
17142,PG2542,A Doll's House : a play,"Ibsen, Henrik",1828.0,1906.0,['en'],21753.0,"{'Man-woman relationships -- Drama', 'Wives --...",Text
59106,PG98,A Tale of Two Cities,"Dickens, Charles",1812.0,1870.0,['en'],21525.0,{'Paris (France) -- History -- 1789-1799 -- Fi...,Text
899,PG1080,A Modest Proposal: For preventing the children...,"Swift, Jonathan",1667.0,1745.0,['en'],21140.0,"{'Religious satire, English', 'Ireland -- Poli...",Text


In [197]:
# query publication year from wikipedia
import wikipedia, re, string, sys
def wiki(title):
    result = wikipedia.summary(title)
    year = re.search("publish.*([0-9]{4}s*)", result, flags=0)
    if year is not None:
        year = year.group(1)
    return year
wiki("Frankenstein; Or, The Modern Prometheus")

'1823'

In [241]:
df_curr = df[200:400]
df_curr.shape

(200, 9)

In [242]:
df_curr['publicationyear'] = None
for title in df_curr['title']:
    try:
        publicationyear = wiki(title)
    except:
        publicationyear = None
        pass
    df_curr.loc[df_curr['title'] == title, ['publicationyear']] = publicationyear

In [224]:
df_curr.head(10)

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type,timeperiod,publicationyear
29096,PG3618_text.txt,Arms and the Man,"Shaw, Bernard",1856.0,1950.0,['en'],1630.0,"{'Soldiers -- Bulgaria -- Drama', 'Man-woman r...",Text,,1898.0
15887,PG242_text.txt,My Antonia,"Cather, Willa",1873.0,1947.0,['en'],1614.0,"{'Nebraska -- Fiction', 'Friendship -- Fiction...",Text,,1918.0
8442,PG175_text.txt,The Phantom of the Opera,"Leroux, Gaston",1868.0,1927.0,['en'],1607.0,{'French fiction -- Translations into English'...,Text,,1986.0
1145,PG11030_text.txt,"Incidents in the Life of a Slave Girl, Written...","Jacobs, Harriet A. (Harriet Ann)",1813.0,1897.0,['en'],1604.0,"{'Jacobs, Harriet A. (Harriet Ann), 1813-1897'...",Text,,1861.0
57856,PG8775_text.txt,Poems,"Hugo, Victor",1802.0,1885.0,['en'],1596.0,"{'French poetry -- Translations into English',...",Text,,
54565,PG59112_text.txt,R.U.R. (Rossum's Universal Robots): A Fantasti...,"Čapek, Karel",1890.0,1938.0,['en'],1590.0,set(),Text,,
56827,PG7849_text.txt,The Trial,"Kafka, Franz",1883.0,1924.0,['en'],1586.0,{'Social problems -- Fiction'},Text,,1925.0
694,PG10625_text.txt,A Concise Dictionary of Middle English from A....,"Skeat, Walter W. (Walter William)",1835.0,1912.0,['en'],1584.0,"{'English language -- Middle English, 1100-150...",Text,,
5221,PG146_text.txt,A Little Princess: Being the whole story of Sa...,"Burnett, Frances Hodgson",1849.0,1924.0,['en'],1579.0,"{'Orphans -- Fiction', 'Boarding schools -- Fi...",Text,,2012.0
57384,PG834_text.txt,The Memoirs of Sherlock Holmes,"Doyle, Arthur Conan",1859.0,1930.0,['en'],1578.0,"{'Holmes, Sherlock (Fictitious character) -- F...",Text,,1893.0


In [225]:
# get processed data from top of the dataframe
df_curr.shape

(200, 11)

In [226]:
# remove nulls in publicationyear
df_curr = df_curr.dropna(subset=['publicationyear'],axis=0) 
df_curr.shape

(84, 11)

In [227]:
# get time period
# 1751-1800 0
# 1801-1820 1
# 1821-1840 2
# 1841-1860 3
# 1861-1880 4
# 1881-1900 5
# 1901-1920 6
time_period_lower_bounds = [1751, 1801, 1821, 1841, 1861, 1881, 1901, 2000]
def get_time_period_category(year):
    year = int(year)

    if (year < 1751 or year > 1920):
        return None
    else:
        for i in range(0, 6):
            if (year >= time_period_lower_bounds[i] and year < time_period_lower_bounds[i+1]):
                return i

In [228]:
df['timeperiod'] = None
for year in df_curr['publicationyear']:
    try:
        category = get_time_period_category(year)
    except:
        category = None
        pass
    df_curr.loc[df_curr['publicationyear'] == year, 'timeperiod'] = category
df_curr.head(6)

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type,timeperiod,publicationyear
29096,PG3618_text.txt,Arms and the Man,"Shaw, Bernard",1856.0,1950.0,['en'],1630.0,"{'Soldiers -- Bulgaria -- Drama', 'Man-woman r...",Text,5.0,1898
15887,PG242_text.txt,My Antonia,"Cather, Willa",1873.0,1947.0,['en'],1614.0,"{'Nebraska -- Fiction', 'Friendship -- Fiction...",Text,,1918
8442,PG175_text.txt,The Phantom of the Opera,"Leroux, Gaston",1868.0,1927.0,['en'],1607.0,{'French fiction -- Translations into English'...,Text,,1986
1145,PG11030_text.txt,"Incidents in the Life of a Slave Girl, Written...","Jacobs, Harriet A. (Harriet Ann)",1813.0,1897.0,['en'],1604.0,"{'Jacobs, Harriet A. (Harriet Ann), 1813-1897'...",Text,4.0,1861
56827,PG7849_text.txt,The Trial,"Kafka, Franz",1883.0,1924.0,['en'],1586.0,{'Social problems -- Fiction'},Text,,1925
5221,PG146_text.txt,A Little Princess: Being the whole story of Sa...,"Burnett, Frances Hodgson",1849.0,1924.0,['en'],1579.0,"{'Orphans -- Fiction', 'Boarding schools -- Fi...",Text,,2012


In [229]:
# remove nulls in age
df_curr = df_curr.dropna(how='any',axis=0) 
df_curr.shape

(40, 11)

In [230]:
# add age column (age = publicationyear - authoryearofbirth)
df_curr['publicationyear'] = df_curr['publicationyear'].astype('int')
df_curr.head(6)

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type,timeperiod,publicationyear
29096,PG3618_text.txt,Arms and the Man,"Shaw, Bernard",1856.0,1950.0,['en'],1630.0,"{'Soldiers -- Bulgaria -- Drama', 'Man-woman r...",Text,5,1898
1145,PG11030_text.txt,"Incidents in the Life of a Slave Girl, Written...","Jacobs, Harriet A. (Harriet Ann)",1813.0,1897.0,['en'],1604.0,"{'Jacobs, Harriet A. (Harriet Ann), 1813-1897'...",Text,4,1861
57384,PG834_text.txt,The Memoirs of Sherlock Holmes,"Doyle, Arthur Conan",1859.0,1930.0,['en'],1578.0,"{'Holmes, Sherlock (Fictitious character) -- F...",Text,5,1893
53774,PG583_text.txt,The Woman in White,"Collins, Wilkie",1824.0,1889.0,['en'],1531.0,"{'Deception -- Fiction', 'Country homes -- Fic...",Text,3,1859
58562,PG940_text.txt,The Last of the Mohicans; A narrative of 1757,"Cooper, James Fenimore",1789.0,1851.0,['en'],1423.0,"{'Frontier and pioneer life -- Fiction', 'Unit...",Text,0,1757
58939,PG974_text.txt,The Secret Agent: A Simple Tale,"Conrad, Joseph",1857.0,1924.0,['en'],1409.0,"{'Anarchists -- Fiction', 'London (England) --...",Text,5,1886


In [231]:
df_curr['age'] = df_curr['publicationyear'] - df_curr['authoryearofbirth']
df_curr.head(5)

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type,timeperiod,publicationyear,age
29096,PG3618_text.txt,Arms and the Man,"Shaw, Bernard",1856.0,1950.0,['en'],1630.0,"{'Soldiers -- Bulgaria -- Drama', 'Man-woman r...",Text,5,1898,42.0
1145,PG11030_text.txt,"Incidents in the Life of a Slave Girl, Written...","Jacobs, Harriet A. (Harriet Ann)",1813.0,1897.0,['en'],1604.0,"{'Jacobs, Harriet A. (Harriet Ann), 1813-1897'...",Text,4,1861,48.0
57384,PG834_text.txt,The Memoirs of Sherlock Holmes,"Doyle, Arthur Conan",1859.0,1930.0,['en'],1578.0,"{'Holmes, Sherlock (Fictitious character) -- F...",Text,5,1893,34.0
53774,PG583_text.txt,The Woman in White,"Collins, Wilkie",1824.0,1889.0,['en'],1531.0,"{'Deception -- Fiction', 'Country homes -- Fic...",Text,3,1859,35.0
58562,PG940_text.txt,The Last of the Mohicans; A narrative of 1757,"Cooper, James Fenimore",1789.0,1851.0,['en'],1423.0,"{'Frontier and pioneer life -- Fiction', 'Unit...",Text,0,1757,-32.0


In [232]:
# age category
# 18-24    0
# 25-34    1
# 35-49    2
# 50-64    3
# 65+    4
age_lower_bounds = [18, 25, 35, 50, 65, 100]
def get_age_category(year):
    year = int(year)
    if (year < 18):
        return None
    else:
        for i in range(0, 5):
            if (year >= age_lower_bounds[i] and year < age_lower_bounds[i+1]):
                return i

In [233]:
df_curr['agecategory'] = None
i = 0
for year in df_curr['age']:
    try:
        age_category = get_age_category(year)
    except:
        age_category = None
        pass
    df_curr.loc[df_curr['age'] == year, ['agecategory']] = age_category
df_curr.head(10)

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type,timeperiod,publicationyear,age,agecategory
29096,PG3618_text.txt,Arms and the Man,"Shaw, Bernard",1856.0,1950.0,['en'],1630.0,"{'Soldiers -- Bulgaria -- Drama', 'Man-woman r...",Text,5,1898,42.0,2.0
1145,PG11030_text.txt,"Incidents in the Life of a Slave Girl, Written...","Jacobs, Harriet A. (Harriet Ann)",1813.0,1897.0,['en'],1604.0,"{'Jacobs, Harriet A. (Harriet Ann), 1813-1897'...",Text,4,1861,48.0,2.0
57384,PG834_text.txt,The Memoirs of Sherlock Holmes,"Doyle, Arthur Conan",1859.0,1930.0,['en'],1578.0,"{'Holmes, Sherlock (Fictitious character) -- F...",Text,5,1893,34.0,1.0
53774,PG583_text.txt,The Woman in White,"Collins, Wilkie",1824.0,1889.0,['en'],1531.0,"{'Deception -- Fiction', 'Country homes -- Fic...",Text,3,1859,35.0,2.0
58562,PG940_text.txt,The Last of the Mohicans; A narrative of 1757,"Cooper, James Fenimore",1789.0,1851.0,['en'],1423.0,"{'Frontier and pioneer life -- Fiction', 'Unit...",Text,0,1757,-32.0,
58939,PG974_text.txt,The Secret Agent: A Simple Tale,"Conrad, Joseph",1857.0,1924.0,['en'],1409.0,"{'Anarchists -- Fiction', 'London (England) --...",Text,5,1886,29.0,1.0
55720,PG6852_text.txt,Venus in Furs,"Sacher-Masoch, Leopold, Ritter von",1835.0,1895.0,['en'],1389.0,"{'Sadomasochism -- Fiction', 'Erotic stories'}",Text,4,1870,35.0,2.0
34936,PG41445_text.txt,"Frankenstein; Or, The Modern Prometheus","Shelley, Mary Wollstonecraft",1797.0,1851.0,['en'],1359.0,"{'Science fiction', 'Monsters -- Fiction', ""Fr...",Text,2,1823,26.0,1.0
53441,PG580_text.txt,The Pickwick Papers,"Dickens, Charles",1812.0,1870.0,['en'],1350.0,"{'Men -- Societies and clubs -- Fiction', 'Hum...",Text,2,1836,24.0,0.0
32374,PG3913_text.txt,The Confessions of Jean Jacques Rousseau — Com...,"Rousseau, Jean-Jacques",1712.0,1778.0,['en'],1279.0,"{'Authors, French -- 18th century -- Biography...",Text,0,1782,70.0,4.0


In [234]:
# remove nulls in age category
df_curr = df_curr.dropna(how='any',axis=0) 
df_curr.shape

(38, 13)

In [None]:
# remove = []
# for index in remove:
#     index+=str(i)

In [47]:
# # manually remove none fiction
# df_curr = df_curr[~df_curr['txt'].isin(remove)]
# df_curr.shape

(1029, 3)

In [235]:
for txt in df_curr['id']:
    try:
        os.rename(os.path.join("text", str(txt + "_text.txt")), os.path.join("clean", str(txt + "_text.txt")))
    except FileNotFoundError as err:
        df_curr = df_curr[~df_curr['id'].isin([txt])]

In [222]:
df_curr['id'] = df_curr['id'].astype(str) + '_text.txt'
df_curr.head()

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type,timeperiod,publicationyear
29096,PG3618_text.txt,Arms and the Man,"Shaw, Bernard",1856.0,1950.0,['en'],1630.0,"{'Soldiers -- Bulgaria -- Drama', 'Man-woman r...",Text,,1898.0
15887,PG242_text.txt,My Antonia,"Cather, Willa",1873.0,1947.0,['en'],1614.0,"{'Nebraska -- Fiction', 'Friendship -- Fiction...",Text,,1918.0
8442,PG175_text.txt,The Phantom of the Opera,"Leroux, Gaston",1868.0,1927.0,['en'],1607.0,{'French fiction -- Translations into English'...,Text,,1986.0
1145,PG11030_text.txt,"Incidents in the Life of a Slave Girl, Written...","Jacobs, Harriet A. (Harriet Ann)",1813.0,1897.0,['en'],1604.0,"{'Jacobs, Harriet A. (Harriet Ann), 1813-1897'...",Text,,1861.0
57856,PG8775_text.txt,Poems,"Hugo, Victor",1802.0,1885.0,['en'],1596.0,"{'French poetry -- Translations into English',...",Text,,


In [236]:
df_curr.shape

(0, 13)

In [213]:
df_curr.timeperiod.value_counts()

5.0    14
4.0    13
3.0    12
0.0     7
2.0     7
1.0     3
Name: timeperiod, dtype: int64

In [214]:
df_curr.agecategory.value_counts()

2    23
1    21
3     8
4     2
0     2
Name: agecategory, dtype: int64

In [215]:
with open('clean_verbose.csv', 'a') as f:
    df_curr.to_csv(f, header=False)
df_curr.columns.tolist()

['id',
 'title',
 'author',
 'authoryearofbirth',
 'authoryearofdeath',
 'language',
 'downloads',
 'subjects',
 'type',
 'publicationyear',
 'timeperiod',
 'age',
 'agecategory']

In [216]:
df_curr.drop(['title',
 'author',
 'authoryearofbirth',
 'authoryearofdeath',
 'language',
 'downloads',
 'subjects',
 'type',
 'publicationyear',
 'age'], axis = 1, inplace = True)
df_curr.head()

Unnamed: 0,id,timeperiod,agecategory
17142,PG2542,4.0,3
18909,PG2701,3.0,1
7354,PG1661,5.0,1
17275,PG2554,4.0,2
10586,PG1952,5.0,1


In [218]:
with open('clean.csv', 'a') as f:
    df_curr.to_csv(f, header=False)