In [20]:
import sys; sys.path.append('../../src/helpers')
from data_manipulation import data
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
import pandas as pd
import re

## Setup

In [2]:
# Load the book-summary dataset through the project helper. Per the helper's own log
# output, it reads the raw file and drops rows with a missing genre or plot summary.
books = data.loadAndClean('../../data/booksummaries/booksummaries.txt')

2019-06-12 22:04:31,687 data_manipulation.data INFO     Called loadAndClean function
2019-06-12 22:04:31,693 data_manipulation.data INFO     Read in booksummary data
2019-06-12 22:04:32,427 data_manipulation.data INFO     Drop any NA's present in dataset under book genre or plot summary
2019-06-12 22:04:33,044 data_manipulation.data INFO     returning booksummary data set


In [4]:
# Keep only the two columns this notebook works with. The explicit .copy() makes
# `books` an independent frame, so writing to its columns later is unambiguous and
# does not trigger pandas' SettingWithCopyWarning.
books = books[['bookGenre', 'plotSum']].copy()

In [5]:
books.head(1)

Unnamed: 0,bookGenre,plotSum
0,"[roman_à_clef, satire, childrens_literature, s...","Old Major, the old boar on the Manor Farm, ca..."


In [6]:
books.shape[0]

12841

There are 12841 books in the data set with many genres and genre combinations. This notebook will attempt to trim down the genres to a small number (5-10) of unique categories, and possibly excise any books which don't fit into these categories.

In [7]:
# Each bookGenre entry is already a list of genre labels, so the column converts
# directly to a list of lists; the former identity .apply(lambda row: row) did nothing.
corpus = books.bookGenre.tolist()
corpus[:1]

[['roman_à_clef',
  'satire',
  'childrens_literature',
  'speculative_fiction',
  'fiction']]

In [8]:
# Number of documents (one genre list per book) in the corpus; the bare name on the
# last line displays it.
N = len(corpus)
N

12841

The corpus is the body of all of my documents; in this case, a document is a list of strings holding the genre labels for one book. As you can see, the corpus has the same length as the number of books I have.

In [9]:
# One-hot encode the genre lists: fit() learns the distinct genre labels and
# transform() yields one row per book with one column per label
# (see the mlxtend TransactionEncoder documentation).
te = TransactionEncoder()
te_ary = te.fit(corpus).transform(corpus)
# te.columns_ supplies the genre label for each encoded column.
te_df = pd.DataFrame(te_ary, columns=te.columns_)

## Genre Consolidation

In [10]:
# Plain Python list of every distinct genre label (the encoded frame's column names).
genres = te_df.columns.tolist()

In [11]:
len(genres)

227

The above cell shows that there are 227 unique genres which I need to pare down. The first step will be to standardize a lot of these, like turning zombies_in_pop_culture to zombie, and then the next step will be rolling things up into a smaller number of categories, like zombie --> horror.

In [12]:
# Re-attach the genre lists as the bookGenre column. .assign returns a new frame
# instead of writing through attribute access on a frame that was derived by column
# selection, which pandas may flag with SettingWithCopyWarning.
books = books.assign(bookGenre=corpus)

In [13]:
books.head(3)

Unnamed: 0,bookGenre,plotSum
0,"[roman_à_clef, satire, childrens_literature, s...","Old Major, the old boar on the Manor Farm, ca..."
1,"[science_fiction, novella, speculative_fiction...","Alex, a teenager living in near-future Englan..."
2,"[existentialism, fiction, absurdist_fiction, n...",The text of The Plague is divided into five p...


In [14]:
# NOTE(review): this repeats the earlier TransactionEncoder cell on the same `corpus`.
# Unless `corpus` was modified in between, te_df is rebuilt unchanged — confirm whether
# this cell is still needed.
te = TransactionEncoder()
te_ary = te.fit(corpus).transform(corpus)
te_df = pd.DataFrame(te_ary, columns=te.columns_)

In [15]:
# Mine every genre combination whose support (share of books containing it) is at
# least min_support. With N = 12841 books, 0.0001 corresponds to ~1.3 books, so a
# combination presumably has to occur in at least 2 books to be kept — TODO confirm
# and consider naming this threshold as a constant.
frequent_itemsets = apriori(te_df, min_support=0.0001, use_colnames=True)
# Scale relative support by the corpus size to get an (approximate, float) book count.
frequent_itemsets['Abs_support'] = frequent_itemsets.support * N

In [16]:
frequent_itemsets.head(2)

Unnamed: 0,support,itemsets,Abs_support
0,0.002258,(absurdist_fiction),29.0
1,0.001791,(adventure),23.0


In [17]:
# Size of each itemset, i.e. how many genres the combination contains. len() is applied
# directly; the former len([i for i in range(len(row))]) built a throwaway list to
# compute the same number.
frequent_itemsets['num_of_genres'] = frequent_itemsets.itemsets.apply(len)

In [18]:
frequent_itemsets.Abs_support.sum()

100241.0

Now we have a dataframe in which we can see every genre combination and how often it occurs. This will help us write the rules that map our multi-genre labels onto fewer genres.

In [19]:
# Widen columns so long itemsets are not truncated in the rendered table.
pd.set_option('display.max_colwidth', 150)
# Sort first, then build the mask from the sorted frame itself so the boolean index
# lines up with the rows it filters. Masking the sorted frame with a Series taken from
# the unsorted frame makes pandas reindex the mask and emit
# "UserWarning: Boolean Series key will be reindexed to match DataFrame index".
by_support = frequent_itemsets.sort_values(by='Abs_support', ascending=False)
# Show the 50 most common combinations that involve three or more genres.
by_support[by_support.num_of_genres > 2].head(50)

  


Unnamed: 0,support,itemsets,Abs_support,num_of_genres
2614,0.077876,"(fiction, speculative_fiction, science_fiction)",1000.0,3
2216,0.073748,"(fiction, speculative_fiction, fantasy)",947.0,3
2514,0.048049,"(mystery, fiction, suspense)",617.0,3
1727,0.043455,"(childrens_literature, fiction, speculative_fiction)",558.0,3
2335,0.043377,"(fantasy, speculative_fiction, science_fiction)",557.0,3
2212,0.039171,"(fiction, fantasy, science_fiction)",503.0,3
4094,0.036835,"(fiction, speculative_fiction, fantasy, science_fiction)",473.0,4
1683,0.036212,"(childrens_literature, fantasy, speculative_fiction)",465.0,3
1664,0.026556,"(childrens_literature, fiction, fantasy)",341.0,3
3485,0.024609,"(childrens_literature, fiction, speculative_fiction, fantasy)",316.0,4
