In [1]:
import sys; sys.path.append('../custom_python_packages')
from data_manipulation import data

# Load the book summaries through the project helper and keep only the
# three columns this notebook works with.
books = data.loadAndClean('../../data/booksummaries/booksummaries.txt')[
    ['bookGenre', 'plotSum', 'bookTitle']
]

In [2]:
# Preview the first three rows of the loaded frame.
books.head(3)

Unnamed: 0,bookGenre,plotSum,bookTitle
0,"[roman_à_clef, satire, childrens_literature, s...","Old Major, the old boar on the Manor Farm, ca...",Animal Farm
1,"[science_fiction, novella, speculative_fiction...","Alex, a teenager living in near-future Englan...",A Clockwork Orange
2,"[existentialism, fiction, absurdist_fiction, n...",The text of The Plague is divided into five p...,The Plague


In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.tokenize import word_tokenize

#Custom Transformer that tokenizes
class Tokenizer(BaseEstimator, TransformerMixin):
    """Stateless transformer that word-tokenizes the ``plotSum`` column.

    Every cell of ``plotSum`` is replaced by the result of NLTK's
    ``word_tokenize`` applied to it. The incoming DataFrame is never
    modified; a deep copy is returned instead.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, DF, y=None):
        """Return a deep copy of ``DF`` with ``plotSum`` tokenized.

        ``y`` is accepted for pipeline compatibility and ignored.
        """
        tokenized = DF.copy(deep=True)
        # word_tokenize is already a one-argument callable, so it can be
        # handed to apply() directly.
        tokenized['plotSum'] = tokenized['plotSum'].apply(word_tokenize)
        return tokenized

In [4]:
# Tokenizer.fit() only returns self, so transform() can be called directly.
tokenizer = Tokenizer()
tokenized_books = tokenizer.transform(books)
# Preview the tokenized summaries.
tokenized_books.head(3)

Unnamed: 0,bookGenre,plotSum,bookTitle
0,"[roman_à_clef, satire, childrens_literature, s...","[Old, Major, ,, the, old, boar, on, the, Manor...",Animal Farm
1,"[science_fiction, novella, speculative_fiction...","[Alex, ,, a, teenager, living, in, near-future...",A Clockwork Orange
2,"[existentialism, fiction, absurdist_fiction, n...","[The, text, of, The, Plague, is, divided, into...",The Plague


In [5]:
# Sanity check: Tokenizer.transform works on a deep copy, so `books`
# should still hold the raw summary strings.
books.head(3)

Unnamed: 0,bookGenre,plotSum,bookTitle
0,"[roman_à_clef, satire, childrens_literature, s...","Old Major, the old boar on the Manor Farm, ca...",Animal Farm
1,"[science_fiction, novella, speculative_fiction...","Alex, a teenager living in near-future Englan...",A Clockwork Orange
2,"[existentialism, fiction, absurdist_fiction, n...",The text of The Plague is divided into five p...,The Plague


In [6]:
from nltk.corpus import stopwords 

#Custom Transformer that filters a sentence
class Filter_sentence( BaseEstimator, TransformerMixin ):
    """Transformer that drops stop words and punctuation tokens from ``plotSum``.

    Each cell of the ``plotSum`` column is expected to be an iterable of
    string tokens. Tokens are lower-cased, English stop words and
    punctuation tokens are removed, hyphens are stripped from what remains,
    and tokens left empty by that stripping are discarded.

    Parameters
    ----------
    names : bool, default False
        Stored on the instance; nothing in this class reads it.
    """

    # Whole tokens that count as punctuation. A set gives an exact-match
    # test; testing membership against a single string would instead match
    # any substring of it (and let through nothing predictable).
    PUNCTUATION_TOKENS = frozenset([
        "?", ":", "!", ".", ",", ";", "-", "`", "''", "(", ")", "'s", "'", "``",
    ])

    #Class Constructor
    def __init__( self, names=False ):
        self.names = names

    def filter_sentence(self, tokenized_sentence, stop_words=None):
        """Return the lower-cased, filtered tokens of one sentence.

        Parameters
        ----------
        tokenized_sentence : iterable of str
            Tokens to filter.
        stop_words : collection of str, optional
            Stop words to remove. When omitted, NLTK's English stop-word
            list is loaded for this call.

        Returns
        -------
        list of str
            Surviving tokens, lower-cased and with hyphens removed. Never
            contains empty strings.
        """
        if stop_words is None:
            stop_words = set(stopwords.words('english'))

        filtered_sentence = []
        for token in tokenized_sentence:
            token = token.lower()
            if token in stop_words or token in self.PUNCTUATION_TOKENS:
                continue
            cleaned = token.replace('-', '').strip()
            # Tokens made only of hyphens/whitespace (e.g. "--") collapse to
            # "" here; skip them instead of emitting empty tokens.
            if cleaned:
                filtered_sentence.append(cleaned)

        return filtered_sentence

    #Return self nothing else to do here
    def fit( self, X, y = None ):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    #Method that describes what we need this transformer to do
    def transform( self, DF, y = None ):
        """Return a deep copy of ``DF`` with every ``plotSum`` cell filtered.

        ``y`` is accepted for pipeline compatibility and ignored.
        """
        # Build the stop-word set once per transform rather than once per row.
        stop_words = set(stopwords.words('english'))
        DF_2 = DF.copy(deep=True)
        DF_2['plotSum'] = DF_2['plotSum'].apply(
            lambda tokens: self.filter_sentence(tokens, stop_words)
        )
        return DF_2

In [7]:
# `books` is still the raw frame; the filtering step below runs on
# `tokenized_books`, not on this one.
books.head(3)

Unnamed: 0,bookGenre,plotSum,bookTitle
0,"[roman_à_clef, satire, childrens_literature, s...","Old Major, the old boar on the Manor Farm, ca...",Animal Farm
1,"[science_fiction, novella, speculative_fiction...","Alex, a teenager living in near-future Englan...",A Clockwork Orange
2,"[existentialism, fiction, absurdist_fiction, n...",The text of The Plague is divided into five p...,The Plague


In [8]:
# Remove stop words and punctuation tokens from the tokenized summaries,
# using the transformer's default constructor arguments.
filterer_of_sentences = Filter_sentence()
tokened_and_filtered_summaries = filterer_of_sentences.transform(tokenized_books)
# Preview the filtered token lists.
tokened_and_filtered_summaries.head(3)

Unnamed: 0,bookGenre,plotSum,bookTitle
0,"[roman_à_clef, satire, childrens_literature, s...","[old, major, old, boar, manor, farm, calls, an...",Animal Farm
1,"[science_fiction, novella, speculative_fiction...","[alex, teenager, living, nearfuture, england, ...",A Clockwork Orange
2,"[existentialism, fiction, absurdist_fiction, n...","[text, plague, divided, five, parts, town, ora...",The Plague


In [9]:
from nltk import pos_tag

from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

#Custom Transformer that stems or lemmatizes
class StemmingLemming(BaseEstimator, TransformerMixin):
    """Transformer that stems or lemmatizes the token lists in ``plotSum``.

    Parameters
    ----------
    prune_type : str, default "Porter"
        Which reduction to apply to every token:
        ``"Porter"`` (PorterStemmer), ``"Lancaster"`` (LancasterStemmer) or
        ``"Lemmatization"`` (WordNetLemmatizer with a per-word POS tag).
        Any other value raises ``ValueError`` when a sentence is processed.
    """

    # Accepted values of ``prune_type``.
    PRUNE_TYPES = ("Porter", "Lancaster", "Lemmatization")

    #Class Constructor
    def __init__(self, prune_type="Porter"):
        # Stored as-is; validation is deferred until the value is used.
        self.prune_type = prune_type

    def _check_prune_type(self):
        """Raise ``ValueError`` if ``prune_type`` is not a supported option."""
        if self.prune_type not in self.PRUNE_TYPES:
            raise ValueError(
                "prune_type must be one of %r, got %r"
                % (self.PRUNE_TYPES, self.prune_type)
            )

    def _tag_to_wordnet_pos(self, tag):
        """Map a POS tag string to the WordNet POS constant for lemmatize().

        Only the first character of the tag is considered; anything other
        than J/N/V/R falls back to noun.
        """
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}
        return tag_dict.get(tag[0].upper(), wordnet.NOUN)

    def get_wordnet_pos(self, word):
        """Map POS tag to first character lemmatize() accepts"""
        # The word is tagged on its own, without sentence context.
        return self._tag_to_wordnet_pos(pos_tag([word])[0][1])

    def stemLem_sentence(self, sentence):
        """Stem or lemmatize every non-empty token of one sentence.

        Parameters
        ----------
        sentence : iterable of str
            Tokens to reduce. Falsy tokens (e.g. "") are skipped.

        Returns
        -------
        list of str
            The reduced tokens, in input order.

        Raises
        ------
        ValueError
            If ``prune_type`` is not one of ``PRUNE_TYPES``.
        """
        self._check_prune_type()
        words = [word for word in sentence if word]
        if not words:
            return []

        # One stemmer/lemmatizer per sentence instead of one per word.
        if self.prune_type == "Porter":
            stem = PorterStemmer().stem
            return [stem(word) for word in words]
        if self.prune_type == "Lancaster":
            stem = LancasterStemmer().stem
            return [stem(word) for word in words]

        # Lemmatization: each word is still tagged in isolation (a one-word
        # "sentence"), but all of them go through a single pos_tag_sents
        # call rather than one pos_tag call per token.
        from nltk.tag import pos_tag_sents
        lemmatize = WordNetLemmatizer().lemmatize
        tagged = pos_tag_sents([[word] for word in words])
        return [
            lemmatize(word, self._tag_to_wordnet_pos(word_tags[0][1]))
            for word, word_tags in zip(words, tagged)
        ]

    #Return self nothing else to do here
    def fit(self, X, y = None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    #Method that describes what we need this transformer to do
    def transform(self, DF, y = None):
        """Return a deep copy of ``DF`` with every ``plotSum`` cell reduced.

        ``y`` is accepted for pipeline compatibility and ignored.

        Raises
        ------
        ValueError
            If ``prune_type`` is not one of ``PRUNE_TYPES``.
        """
        # Fail fast, even on an empty frame, instead of returning a frame
        # of empty token lists.
        self._check_prune_type()
        DF_2 = DF.copy(deep=True)
        DF_2['plotSum'] = DF_2['plotSum'].apply(self.stemLem_sentence)
        return DF_2

In [10]:
# The filtered token lists: this frame is the shared input for the
# Porter / Lancaster / lemmatization runs below.
tokened_and_filtered_summaries.head(3)

Unnamed: 0,bookGenre,plotSum,bookTitle
0,"[roman_à_clef, satire, childrens_literature, s...","[old, major, old, boar, manor, farm, calls, an...",Animal Farm
1,"[science_fiction, novella, speculative_fiction...","[alex, teenager, living, nearfuture, england, ...",A Clockwork Orange
2,"[existentialism, fiction, absurdist_fiction, n...","[text, plague, divided, five, parts, town, ora...",The Plague


In [11]:
# Variant 1: Porter stemming of the filtered tokens.
porter = StemmingLemming(prune_type="Porter")
portered_books = porter.transform(tokened_and_filtered_summaries)

In [12]:
# Preview the Porter-stemmed summaries.
portered_books.head(3)

Unnamed: 0,bookGenre,plotSum,bookTitle
0,"[roman_à_clef, satire, childrens_literature, s...","[old, major, old, boar, manor, farm, call, ani...",Animal Farm
1,"[science_fiction, novella, speculative_fiction...","[alex, teenag, live, nearfutur, england, lead,...",A Clockwork Orange
2,"[existentialism, fiction, absurdist_fiction, n...","[text, plagu, divid, five, part, town, oran, t...",The Plague


In [13]:
# Sanity check: StemmingLemming.transform works on a deep copy, so the
# shared input frame should be unchanged by the Porter run.
tokened_and_filtered_summaries.head(3)

Unnamed: 0,bookGenre,plotSum,bookTitle
0,"[roman_à_clef, satire, childrens_literature, s...","[old, major, old, boar, manor, farm, calls, an...",Animal Farm
1,"[science_fiction, novella, speculative_fiction...","[alex, teenager, living, nearfuture, england, ...",A Clockwork Orange
2,"[existentialism, fiction, absurdist_fiction, n...","[text, plague, divided, five, parts, town, ora...",The Plague


In [14]:
# Variant 2: Lancaster stemming of the same filtered tokens.
lancaster = StemmingLemming(prune_type="Lancaster")
lancasted_books = lancaster.transform(tokened_and_filtered_summaries)

In [15]:
# Preview the Lancaster-stemmed summaries.
lancasted_books.head(3)

Unnamed: 0,bookGenre,plotSum,bookTitle
0,"[roman_à_clef, satire, childrens_literature, s...","[old, maj, old, boar, man, farm, cal, anim, fa...",Animal Farm
1,"[science_fiction, novella, speculative_fiction...","[alex, teen, liv, nearfut, england, lead, gang...",A Clockwork Orange
2,"[existentialism, fiction, absurdist_fiction, n...","[text, plagu, divid, fiv, part, town, or, thou...",The Plague


In [16]:
# Variant 3: WordNet lemmatization of the same filtered tokens.
lemmer = StemmingLemming(prune_type="Lemmatization")
lemmatized_books = lemmer.transform(tokened_and_filtered_summaries)

In [17]:
# Preview the lemmatized summaries.
lemmatized_books.head(3)

Unnamed: 0,bookGenre,plotSum,bookTitle
0,"[roman_à_clef, satire, childrens_literature, s...","[old, major, old, boar, manor, farm, call, ani...",Animal Farm
1,"[science_fiction, novella, speculative_fiction...","[alex, teenager, living, nearfuture, england, ...",A Clockwork Orange
2,"[existentialism, fiction, absurdist_fiction, n...","[text, plague, divide, five, part, town, oran,...",The Plague


In [18]:
# Every transformer above returns a copy, so `books` should still be the
# raw frame it was after loading.
books.head(3)

Unnamed: 0,bookGenre,plotSum,bookTitle
0,"[roman_à_clef, satire, childrens_literature, s...","Old Major, the old boar on the Manor Farm, ca...",Animal Farm
1,"[science_fiction, novella, speculative_fiction...","Alex, a teenager living in near-future Englan...",A Clockwork Orange
2,"[existentialism, fiction, absurdist_fiction, n...",The text of The Plague is divided into five p...,The Plague


In [22]:
#Custom Transformer that joins
class joiner(BaseEstimator, TransformerMixin):
    """Stateless transformer that glues ``plotSum`` token lists back together.

    Every cell of ``plotSum`` is replaced by its items joined with a single
    space. The incoming DataFrame is never modified; a deep copy is
    returned instead.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, DF, y=None):
        """Return a deep copy of ``DF`` with ``plotSum`` joined into strings.

        ``y`` is accepted for pipeline compatibility and ignored.
        """
        joined = DF.copy(deep=True)
        # The bound method ' '.join is itself the per-cell function.
        joined['plotSum'] = joined['plotSum'].apply(' '.join)
        return joined

In [24]:
# Join the lemmatized tokens back into one space-separated string per summary.
joiner_transformer = joiner()
joined_books = joiner_transformer.transform(lemmatized_books)
# Preview the re-joined summaries.
joined_books.head(3)

Unnamed: 0,bookGenre,plotSum,bookTitle
0,"[roman_à_clef, satire, childrens_literature, s...",old major old boar manor farm call animal farm...,Animal Farm
1,"[science_fiction, novella, speculative_fiction...",alex teenager living nearfuture england lead g...,A Clockwork Orange
2,"[existentialism, fiction, absurdist_fiction, n...",text plague divide five part town oran thousan...,The Plague
