In [1]:
import sys; sys.path.append('../../src/helpers')
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from data_manipulation import data
import re

## Setup

In [91]:
# Load the book-summary dataset through the project helper. Per the log output
# below, loadAndClean reads the file and drops rows whose book genre or plot
# summary is NA before returning the frame.
books = data.loadAndClean('../../data/booksummaries/booksummaries.txt')

2019-06-04 12:05:38,909 data_manipulation.data INFO     Called loadAndClean function
2019-06-04 12:05:38,914 data_manipulation.data INFO     Read in booksummary data
2019-06-04 12:05:39,611 data_manipulation.data INFO     Drop any NA's present in dataset under book genre or plot summary
2019-06-04 12:05:40,399 data_manipulation.data INFO     returning booksummary data set


In [92]:
# Keep only the columns this notebook uses. The explicit .copy() makes `books`
# an independent frame, so the later column write-back
# (books.bookGenre = corpus) is a plain assignment rather than a write to a
# slice that pandas flags with SettingWithCopyWarning.
books = books[['bookGenre', 'plotSum', 'bookTitle']].copy()

In [93]:
# Peek at the first row to sanity-check the retained columns.
books.head(1)

Unnamed: 0,bookGenre,plotSum,bookTitle
0,"[roman_à_clef, satire, childrens_literature, speculative_fiction, fiction]","Old Major, the old boar on the Manor Farm, calls the animals on the farm for a meeting, where he compares the humans to parasites and teaches the...",Animal Farm


In [97]:
# Row count = number of books that survived loading/cleaning.
len(books)

12841

There are 12841 books in the data set with many genres and genre combinations. This notebook will attempt to trim down the genres to a small number (5-10) of unique categories, and possibly excise any books which don't fit into these categories.

In [95]:
# Each "document" is one book's list of genre labels. tolist() pulls those lists
# straight out of the column; the previous identity apply(lambda row: row)
# made an extra Python-level pass over every row without changing anything.
corpus = books['bookGenre'].tolist()
corpus[:1]

[['roman_à_clef',
  'satire',
  'childrens_literature',
  'speculative_fiction',
  'fiction']]

In [7]:
# N = number of transactions (one genre list per book). It is reused further
# down to convert relative support into absolute book counts.
N = len(corpus)
N

12841

The corpus is the body of all of my documents; in this case, a document is the list of genre-label strings for a single book. As you can see, the corpus has the same length as the number of books I have.

In [8]:
# One-hot encode the genre lists: one row per book, one column per distinct
# genre label learned by the encoder.
te = TransactionEncoder().fit(corpus)
te_ary = te.transform(corpus)
te_df = pd.DataFrame(data=te_ary, columns=te.columns_)

## Genre Consolidation

In [9]:
# The encoder's column labels are exactly the distinct genre labels in the corpus.
genres = te_df.columns.tolist()

In [10]:
# Number of distinct genre labels (one encoder column per label).
len(genres)

227

The above cell shows that there are 227 unique genres, which I need to pare down. The first step will be to standardize many of these labels, e.g. turning `zombies_in_pop_culture` into `zombie`; the next step will be rolling them up into a smaller number of categories, e.g. `zombie` --> `horror`.

In [29]:
# Write the (possibly consolidated) genre lists back onto the frame.
# NOTE(review): in the cells visible here `corpus` is taken unchanged from this
# same column, so this is effectively a no-op unless intermediate
# consolidation cells modify `corpus` first — confirm.
books['bookGenre'] = corpus

In [90]:
# NOTE(review): the saved output for this cell lists a `Drop` column that no
# visible cell creates — it looks like leftover state from a deleted or
# out-of-order cell; confirm with Restart Kernel -> Run All.
books.head(3)

Unnamed: 0,bookGenre,plotSum,bookTitle,Drop


In [22]:
# Re-fit the encoder on the current corpus so the itemset mining below sees the
# latest genre labels.
# NOTE(review): this repeats the earlier encoding cell verbatim; if `corpus` is
# unchanged in between, one of the two cells is redundant — confirm.
te = TransactionEncoder().fit(corpus)
te_ary = te.transform(corpus)
te_df = pd.DataFrame(data=te_ary, columns=te.columns_)

In [23]:
# Mine every genre combination whose relative support is at least 0.0001, then
# add the absolute number of books per itemset (relative support x N books).
frequent_itemsets = apriori(te_df, use_colnames=True, min_support=0.0001)
frequent_itemsets['Abs_support'] = frequent_itemsets['support'] * N

In [24]:
# Preview the mined itemsets.
# NOTE(review): the first itemset in the saved output renders as empty, which
# suggests a blank genre label in the corpus — confirm and clean upstream if so.
frequent_itemsets.head(2)

Unnamed: 0,support,itemsets,Abs_support
0,0.031462,(),404.0
1,0.002258,(absurdist_fiction),29.0


In [25]:
# Size of each itemset = how many genres the combination contains. len() on the
# itemset gives this directly; the previous version built a throwaway list of
# indices just to take its length.
frequent_itemsets['num_of_genres'] = frequent_itemsets['itemsets'].apply(len)

In [26]:
# Total absolute support across all itemsets. This exceeds the number of books
# because a single book counts toward every frequent itemset it contains.
frequent_itemsets['Abs_support'].sum()

70997.0

Now we have a dataframe in which we can see every genre combination and how often it occurs. This will help us write the rules that classify our multi-genre labels into fewer genres.

In [27]:
# Widen columns so long itemsets are not truncated in the rendered table.
pd.set_option('display.max_colwidth', 150)
# Top 50 combinations of three or more genres, most frequent first.
# The mask is built from the *sorted* frame via a callable .loc, so its index is
# already aligned with the rows being filtered. Indexing the sorted frame with a
# mask taken from the unsorted frame makes pandas reindex the mask and emit
# "Boolean Series key will be reindexed to match DataFrame index".
(
    frequent_itemsets
    .sort_values(by='Abs_support', ascending=False)
    .loc[lambda df: df['num_of_genres'] > 2]
    .head(50)
)



  


Unnamed: 0,support,itemsets,Abs_support,num_of_genres
671,0.092205,"(science_fiction, speculative_fiction, fiction)",1184.0,3
605,0.078732,"(speculative_fiction, fiction, fantasy)",1011.0,3
561,0.052878,"(crime_fiction, fiction, suspense/thriller/spy)",679.0,3
629,0.045479,"(science_fiction, speculative_fiction, fantasy)",584.0,3
447,0.044778,"(speculative_fiction, fiction, children)",575.0,3
604,0.04034,"(science_fiction, fiction, fantasy)",518.0,3
1105,0.038315,"(science_fiction, speculative_fiction, fiction, fantasy)",492.0,4
435,0.036757,"(speculative_fiction, fantasy, children)",472.0,3
429,0.027724,"(fiction, fantasy, children)",356.0,3
903,0.025621,"(speculative_fiction, fiction, fantasy, children)",329.0,4
