# Feature Engineering

In [7]:
import pickle
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import re
import json


### Text Cleaning

In [8]:
books = pd.read_csv('../data/booksummaries/data.csv')
books.head()
genres = []
for i in books['genre']:
    genres.append(list(json.loads(i).values()))
books['genre_new'] = genres

In [9]:
def clean_summary(text):
    text = re.sub("\'", "", text)
    text = re.sub("[^a-zA-Z]"," ",text)
    text = ' '.join(text.split())
    text = text.lower()
    return text

In [10]:
books['clean_summary'] = books['summary'].apply(lambda x: clean_summary(x))
books[['summary','clean_summary']].head(2)

Unnamed: 0,summary,clean_summary
0,"Old Major, the old boar on the Manor Farm, ca...",old major the old boar on the manor farm calls...
1,"Alex, a teenager living in near-future Englan...",alex a teenager living in near future england ...


In [11]:
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)

books['clean_summary'] = books['clean_summary'].apply(lambda x: remove_stopwords(x))

In [12]:
books['clean_summary'][0]

'old major old boar manor farm calls animals farm meeting compares humans parasites teaches animals revolutionary song beasts england major dies two young pigs snowball napoleon assume command turn dream philosophy animals revolt drive drunken irresponsible mr jones farm renaming animal farm adopt seven commandments animal ism important animals equal snowball attempts teach animals reading writing food plentiful farm runs smoothly pigs elevate positions leadership set aside special food items ostensibly personal health napoleon takes pups farm dogs trains privately napoleon snowball struggle leadership snowball announces plans build windmill napoleon dogs chase snowball away declares leader napoleon enacts changes governance structure farm replacing meetings committee pigs run farm using young pig named squealer mouthpiece napoleon claims credit windmill idea animals work harder promise easier lives windmill violent storm animals find windmill annihilated napoleon squealer convince ani

### Remove all labels that you can find less than 100 times

In [13]:
dict = {}
for index, row in books.iterrows():
    for x in row['genre_new']:
        if x in dict:
            dict[x] = dict[x] +1
        else:
            dict[x] = 1

dict

{'Roman à clef': 29,
 'Satire': 123,
 "Children's literature": 2122,
 'Speculative fiction': 4314,
 'Fiction': 4747,
 'Science Fiction': 2870,
 'Novella': 87,
 'Utopian and dystopian fiction': 67,
 'Existentialism': 8,
 'Absurdist fiction': 29,
 'Novel': 2463,
 'Hard science fiction': 25,
 'Fantasy': 2413,
 'War novel': 87,
 'Bildungsroman': 42,
 'Religious text': 4,
 'Picaresque novel': 18,
 'Gothic fiction': 112,
 'Horror': 511,
 'Invasion literature': 4,
 'Mystery': 1396,
 'Epistolary novel': 30,
 'Parody': 15,
 'Psychological novel': 26,
 'Farce': 4,
 'Philosophy': 47,
 'Science': 22,
 'Dystopia': 127,
 'Detective fiction': 341,
 'Suspense': 765,
 'Historical fiction': 388,
 'Adventure novel': 330,
 'Humour': 62,
 'Historical novel': 654,
 'Sea story': 4,
 'Cyberpunk': 16,
 'Business': 16,
 'Non-fiction': 230,
 'Economics': 13,
 'Anthropology': 4,
 'Sociology': 31,
 'Romance novel': 435,
 'Poetry': 28,
 'Chivalric romance': 15,
 'High fantasy': 66,
 'Time travel': 20,
 'Scientific 

In [14]:
def getGenresInMoreThan100Books(row):
    genres = []
    for x in row:
        if dict[x] > 100:
            genres.append(x)
    return genres

In [15]:
books['genre_new'] = books['genre_new'].apply(lambda x: getGenresInMoreThan100Books(x))
books[['genre_new']].head(10)

Unnamed: 0,genre_new
0,"[Satire, Children's literature, Speculative fi..."
1,"[Science Fiction, Speculative fiction, Satire,..."
2,"[Fiction, Novel]"
3,"[Science Fiction, Speculative fiction, Fantasy..."
4,[]
5,"[Children's literature, Fantasy, Speculative f..."
6,"[Science Fiction, Speculative fiction]"
7,"[Science Fiction, Speculative fiction]"
8,[]
9,"[Speculative fiction, Fiction, Novel]"


In [16]:
books.shape

(12841, 6)

In [17]:
books = books[books['genre_new'].map(lambda d: len(d)) > 0]
books.shape

(12243, 6)

### Label Encoding

In [18]:
books

Unnamed: 0,book_id,book_name,genre,summary,genre_new,clean_summary
0,620,Animal Farm,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca...","[Satire, Children's literature, Speculative fi...",old major old boar manor farm calls animals fa...
1,843,A Clockwork Orange,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan...","[Science Fiction, Speculative fiction, Satire,...",alex teenager living near future england leads...
2,986,The Plague,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...,"[Fiction, Novel]",text plague divided five parts town oran thous...
3,2080,A Fire Upon the Deep,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...,"[Science Fiction, Speculative fiction, Fantasy...",novel posits space around milky way divided co...
5,2890,A Wizard of Earthsea,"{""/m/0dwly"": ""Children's literature"", ""/m/01hm...","Ged is a young boy on Gont, one of the larger...","[Children's literature, Fantasy, Speculative f...",ged young boy gont one larger islands north ar...
...,...,...,...,...,...,...
12836,36372465,The Third Lynx,"{""/m/06n90"": ""Science Fiction""}",The story starts with former government agent...,[Science Fiction],story starts former government agent frank com...
12837,36534061,Remote Control,"{""/m/01jfsb"": ""Thriller"", ""/m/02xlf"": ""Fiction...",The series follows the character of Nick Ston...,"[Thriller, Fiction, Suspense]",series follows character nick stone ex militar...
12838,37054020,Transfer of Power,"{""/m/01jfsb"": ""Thriller"", ""/m/02xlf"": ""Fiction""}",The reader first meets Rapp while he is doing...,"[Thriller, Fiction]",reader first meets rapp covert operation iran ...
12839,37122323,Decoded,"{""/m/0xdf"": ""Autobiography""}",The book follows very rough chronological ord...,[Autobiography],book follows rough chronological order switchi...


### Split data

In [19]:
y = np.asarray(books['genre_new'])
X = books["clean_summary"]


In [20]:
with open('../Pickles/X.pickle', 'wb') as output:
    pickle.dump(X, output)
    
with open('../Pickles/y.pickle', 'wb') as output:
    pickle.dump(y, output)
    

In [21]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(y)

# transform target variable
y = multilabel_binarizer.transform(y)

In [22]:
with open('../Pickles/multilabel_binarizer.pickle', 'wb') as output:
    pickle.dump(multilabel_binarizer, output)  

In [23]:
y[0]

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0])

In [24]:
# splitting the data to training and testing data set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [25]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(8570,)
(3673,)
(8570, 27)
(3673, 27)


In [26]:
ngram_range = (1,2)
min_df = 10
max_df = 1.
max_features = 1000

tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)
                        
features_train = tfidf.fit_transform(X_train).toarray()
labels_train = y_train
print(features_train.shape)

features_test = tfidf.transform(X_test).toarray()
labels_test = y_test
print(features_test.shape)

(8570, 1000)
(3673, 1000)


In [27]:
# FEATURES
with open('../Pickles/features_train.pickle', 'wb') as output:
    pickle.dump(features_train, output)

with open('../Pickles/features_test.pickle', 'wb') as output:
    pickle.dump(features_test, output)


# LABELS
with open('../Pickles/labels_train.pickle', 'wb') as output:
    pickle.dump(labels_train, output)

with open('../Pickles/labels_test.pickle', 'wb') as output:
    pickle.dump(labels_test, output)


### ELMO :)

In [57]:
import tensorflow as tf
import tensorflow_hub as hub    


elmo = hub.load("https://tfhub.dev/google/elmo/3")
elmo.signatures["default"](tf.constant(["my sentence"]))

{'lstm_outputs2': <tf.Tensor 'StatefulPartitionedCall_4:3' shape=(1, 2, 1024) dtype=float32>,
 'lstm_outputs1': <tf.Tensor 'StatefulPartitionedCall_4:2' shape=(1, 2, 1024) dtype=float32>,
 'elmo': <tf.Tensor 'StatefulPartitionedCall_4:1' shape=(1, 2, 1024) dtype=float32>,
 'default': <tf.Tensor 'StatefulPartitionedCall_4:0' shape=(1, 1024) dtype=float32>,
 'sequence_len': <tf.Tensor 'StatefulPartitionedCall_4:4' shape=(1,) dtype=int32>,
 'word_emb': <tf.Tensor 'StatefulPartitionedCall_4:5' shape=(1, 2, 512) dtype=float32>}

In [67]:
def elmo_vectors(x):
  #embeddings = elmo.signatures["default"](x)
  embeddings = elmo(x, training=is_training)
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    # return average of ELMo features
    return sess.run(tf.reduce_mean(embeddings,1))

In [68]:
list_train = [X_train[i:i+100] for i in range(0,X_train.shape[0],100)]
list_test = [X_test[i:i+100] for i in range(0,X_test.shape[0],100)]

In [66]:
elmo_train = [elmo_vectors(X_train) for x in list_train]
elmo_test = [elmo_vectors(X_test) for x in list_test]


elmo_train_new = np.concatenate(elmo_train, axis = 0)
elmo_test_new = np.concatenate(elmo_test, axis = 0)

TypeError: pruned(text): expected argument #0(zero-based) to be a Tensor; got Series (5514     doctor benny europe aftermath world war childr...
1086     book opens november hannay friend sandy conval...
2367     story told theseus looking back life vantage p...
5467     sir geoffrey peveril major bridgenorth boys to...
4869     pete garden protagonist one several residents ...
                               ...                        
12522    wither describes future genetic engineering cu...
5423     garden shadows starts tall plain olivia rescue...
5623     back story presented novel describes first con...
901      novel begins text letter dated july crimean wa...
7555     serge therapy coping fact turned first goes re...
Name: clean_summary, Length: 8570, dtype: object).