In [2]:
import numpy as np
import pandas as pd
import glob

import matplotlib.pyplot as plt
%matplotlib inline

# PG Bookshelves
We use PG bookshelves as labels, see here [here](http://www.gutenberg.org/wiki/Category:Bookshelf).
They are better than the 'genre' metadata field. **WHY???**
## Getting the data
We scrap the PG wiki with `wget`,
```bash
wget --random-wait -r -p --no-parent -e robots=off -U mozilla http://www.gutenberg.org/wiki/Category:Bookshelf
```
see `data/bookshelves/README.md` for details. Then we parse the html files with `lxml`

In [3]:
import lxml.html

In [4]:
BS_paths = glob.glob("../data/bookshelves/*Bookshelf*")
BS = [path.split("/")[-1] for path in BS_paths]

In [6]:
BS_dict = {}
for path in BS_paths:
    bs = path.split("/")[-1]
    BS_dict[bs] = []
    with open(path, "r") as foo:
        dom = lxml.html.fromstring(foo.read())
        # select the url in href for all a tags(links)
        for link in dom.xpath('//a/@href'):
            # links to ebooks that are not searches
            if link.find("ebooks")>-1 and link.find("search")==-1:
                PGid = "PG"+link.split("/")[-1]
                BS_dict[bs].append(PGid)

    # delete empty BSs
    if len(BS_dict[bs])==0:
        del BS_dict[bs]
    
# recompose list of BSs
BS = list(BS_dict.keys())

# list of unique PGids
PGids = list(set(np.concatenate(list(BS_dict.values()))))

And save the data into a pickled dataframe

In [7]:
# put in a DataFrame
df = pd.DataFrame(index = PGids, columns = BS)
for k,v in BS_dict.items():
    df.loc[v, k] = True

In [8]:
df.to_pickle("../data/bookshelves_raw.p")

## Cleaning the data
Bookshelves are almost non-overlapping

In [36]:
from collections import Counter

In [78]:
def get_intersections_df(df, min_books=100):
    # Bookshelves with at least 100 books
    sdf = df.loc[:, df.sum()>min_books]
    sdf = sdf.loc[sdf.sum(axis=1).dropna().index]

    # overlaps are small
    ratios = []
    from itertools import combinations
    for s1, s2 in combinations(sdf.columns, 2):
        l1 = len(BS_dict[s1])
        l2 = len(BS_dict[s2])
        l3 = len(np.intersect1d(BS_dict[s1], BS_dict[s2]))
        intratio = l3/min(l1, l2)
        ratios.append([intratio, s1, s2])
    intersetions_df = pd.DataFrame(data=ratios, columns = ["intersection", "bs1", "bs2"])
    return intersetions_df

In [79]:
intersetions_df = get_intersections_df(df)

In [80]:
(intersetions_df.intersection<0.05).mean()

0.9661733615221987

That is, with probability 98% two BS have less than 5% of their books in common

In [81]:
def get_most_overlapping_BS(intersetions_df):
    notempty = intersetions_df.loc[intersetions_df.intersection!=0]
    most_overlaped = Counter(list(notempty.bs1.values)+list(notempty.bs2.values))
    return most_overlaped.most_common()

In [91]:
def drop_most_overlapping(df):
    intersetions_df = get_intersections_df(df)
    todrop = get_most_overlapping_BS(intersetions_df)[0][0]
    print("Dropping %s" %todrop)
    sdf = df.drop(df[todrop].dropna().index)
    sdf = sdf.dropna(how="all", axis=1)
    intersetions_df = get_intersections_df(sdf)
    return sdf, intersetions_df

In [96]:
sdf, intersetions_df = drop_most_overlapping(df)
for _ in range(7):
    sdf, intersetions_df = drop_most_overlapping(sdf)

Dropping Bestsellers,_American,_1895-1923_(Bookshelf)
Dropping Best_Books_Ever_Listings_(Bookshelf)
Dropping Harvard_Classics_(Bookshelf)
Dropping Children's_History_(Bookshelf)
Dropping Children's_Literature_(Bookshelf)
Dropping Banned_Books_(Bookshelf)
Dropping Technology_(Bookshelf)
Dropping Christmas_(Bookshelf)


In [97]:
intersetions_df.sort_values(by="intersection", ascending=False)

Unnamed: 0,intersection,bs1,bs2
234,0.939655,Animal_(Bookshelf),Animals-Wild_(Bookshelf)-Mammals
67,0.076696,Historical_Fiction_(Bookshelf),Children's_Fiction_(Bookshelf)
322,0.058282,Children's_Fiction_(Bookshelf),US_Civil_War_(Bookshelf)
135,0.046154,Australia_(Bookshelf),World_War_I_(Bookshelf)
266,0.035264,World_War_I_(Bookshelf),Children's_Book_Series_(Bookshelf)
30,0.029762,Humor_(Bookshelf),Children's_Picture_Books_(Bookshelf)
354,0.027027,Short_Stories_(Bookshelf)_Authors_A-E,Detective_Fiction_(Bookshelf)
183,0.023438,Mathematics_(Bookshelf),Philosophy_(Bookshelf)
326,0.021739,Children's_Fiction_(Bookshelf),Native_America_(Bookshelf)
355,0.021277,Short_Stories_(Bookshelf)_Authors_A-E,US_Civil_War_(Bookshelf)
