# Feature Engineering

In [142]:
import pickle
import pandas as pd
from stop_words import safe_get_stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
import numpy as np
from nltk.corpus import stopwords
import re
import json
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score

from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

### Text Cleaning

In [111]:
books = pd.read_csv('../data/booksummaries/data.csv')
books.head()
genres = []
for i in books['genre']:
    genres.append(list(json.loads(i).values()))
books['genre_new'] = genres

In [112]:
def clean_summary(text):
    text = re.sub("\'", "", text)
    text = re.sub("[^a-zA-Z]"," ",text)
    text = ' '.join(text.split())
    text = text.lower()
    return text

In [113]:
books['clean_summary'] = books['summary'].apply(lambda x: clean_summary(x))
books.head(2)

Unnamed: 0,book_id,book_name,genre,summary,genre_new,clean_summary
0,620,Animal Farm,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca...","[Roman à clef, Satire, Children's literature, ...",old major the old boar on the manor farm calls...
1,843,A Clockwork Orange,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan...","[Science Fiction, Novella, Speculative fiction...",alex a teenager living in near future england ...


In [114]:
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)

books['clean_summary'] = books['clean_summary'].apply(lambda x: remove_stopwords(x))

In [115]:
books['clean_summary'][0]

'old major old boar manor farm calls animals farm meeting compares humans parasites teaches animals revolutionary song beasts england major dies two young pigs snowball napoleon assume command turn dream philosophy animals revolt drive drunken irresponsible mr jones farm renaming animal farm adopt seven commandments animal ism important animals equal snowball attempts teach animals reading writing food plentiful farm runs smoothly pigs elevate positions leadership set aside special food items ostensibly personal health napoleon takes pups farm dogs trains privately napoleon snowball struggle leadership snowball announces plans build windmill napoleon dogs chase snowball away declares leader napoleon enacts changes governance structure farm replacing meetings committee pigs run farm using young pig named squealer mouthpiece napoleon claims credit windmill idea animals work harder promise easier lives windmill violent storm animals find windmill annihilated napoleon squealer convince ani

### Remove all labels that you can find less than 100 times

In [116]:
dict = {}
for index, row in books.iterrows():
    for x in row['genre_new']:
        if x in dict:
            dict[x] = dict[x] +1
        else:
            dict[x] = 1

dict.get("Fiction")
books.head()

Unnamed: 0,book_id,book_name,genre,summary,genre_new,clean_summary
0,620,Animal Farm,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca...","[Roman à clef, Satire, Children's literature, ...",old major old boar manor farm calls animals fa...
1,843,A Clockwork Orange,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan...","[Science Fiction, Novella, Speculative fiction...",alex teenager living near future england leads...
2,986,The Plague,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...,"[Existentialism, Fiction, Absurdist fiction, N...",text plague divided five parts town oran thous...
3,2080,A Fire Upon the Deep,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...,"[Hard science fiction, Science Fiction, Specul...",novel posits space around milky way divided co...
4,2152,All Quiet on the Western Front,"{""/m/098tmk"": ""War novel"", ""/m/016lj8"": ""Roman...","The book tells the story of Paul Bäumer, a Ge...","[War novel, Roman à clef]",book tells story paul b umer german soldier ur...


In [59]:
def getGenresInMoreThan100Books(row):
    genres = []
    for x in row:
        if dict[x] > 100:
            genres.append(x)
    return genres

In [117]:
books['genre_new'] = books['genre_new'].apply(lambda x: getGenresInMoreThan100Books(x))
books.head()

Roman à clef
Novella
Utopian and dystopian fiction
Existentialism
Absurdist fiction
Hard science fiction
War novel
Roman à clef
Bildungsroman
Religious text
Bildungsroman
Picaresque novel
Invasion literature
Epistolary novel
Parody
Psychological novel
Farce
Picaresque novel
Philosophy
Science
Religious text
Absurdist fiction
Absurdist fiction
Novella
Utopian and dystopian fiction
Religious text
Novella
Roman à clef
Humour
Epistolary novel
Sea story
Cyberpunk
Business
Economics
Anthropology
Sociology
Psychological novel
Roman à clef
Absurdist fiction
Hard science fiction
Poetry
Chivalric romance
High fantasy
Philosophy
Time travel
Scientific romance
Juvenile fantasy
Religion
Epistolary novel
Inspirational
Humour
Short story
Techno-thriller
Science
Poetry
Drama
Absurdist fiction
Play
Utopian and dystopian fiction
Poetry
Poetry
Albino bias
Science
Scientific romance
Scientific romance
Computer Science
Existentialism
Reference
Poetry
High fantasy
Drama
Supernatural
Epistolary novel
Literar

Unnamed: 0,book_id,book_name,genre,summary,genre_new,clean_summary
0,620,Animal Farm,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca...","[Satire, Children's literature, Speculative fi...",old major old boar manor farm calls animals fa...
1,843,A Clockwork Orange,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan...","[Science Fiction, Speculative fiction, Satire,...",alex teenager living near future england leads...
2,986,The Plague,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...,"[Fiction, Novel]",text plague divided five parts town oran thous...
3,2080,A Fire Upon the Deep,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...,"[Science Fiction, Speculative fiction, Fantasy...",novel posits space around milky way divided co...
4,2152,All Quiet on the Western Front,"{""/m/098tmk"": ""War novel"", ""/m/016lj8"": ""Roman...","The book tells the story of Paul Bäumer, a Ge...",[],book tells story paul b umer german soldier ur...


In [134]:
books['most_common_genres'] = books['genre_new'].apply(lambda x: json.dumps(x))
books.head()

Unnamed: 0,book_id,book_name,genre,summary,genre_new,clean_summary,most_common_genres
0,620,Animal Farm,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca...","[Satire, Children's literature, Speculative fi...",old major old boar manor farm calls animals fa...,"[""Satire"", ""Children's literature"", ""Speculati..."
1,843,A Clockwork Orange,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan...","[Science Fiction, Speculative fiction, Satire,...",alex teenager living near future england leads...,"[""Science Fiction"", ""Speculative fiction"", ""Sa..."
2,986,The Plague,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...,"[Fiction, Novel]",text plague divided five parts town oran thous...,"[""Fiction"", ""Novel""]"
3,2080,A Fire Upon the Deep,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...,"[Science Fiction, Speculative fiction, Fantasy...",novel posits space around milky way divided co...,"[""Science Fiction"", ""Speculative fiction"", ""Fa..."
4,2152,All Quiet on the Western Front,"{""/m/098tmk"": ""War novel"", ""/m/016lj8"": ""Roman...","The book tells the story of Paul Bäumer, a Ge...",[],book tells story paul b umer german soldier ur...,[]


### Label Encoding

### Split data

In [165]:
books.head()
#books.head()
#df2 = (books.drop(columns='genre_new').join(books['genre_new'].str.join('|').str.get_dummies()))
#xtrain, xval, ytrain, yval = train_test_split(books['clean_summary'], np.array(books['most_common_genres']), test_size=0.2, random_state=557)
#type(yval[0])
#df2.head()

Unnamed: 0,book_id,book_name,genre,summary,genre_new,clean_summary,most_common_genres,genre_new_test
0,620,Animal Farm,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca...","[Satire, Children's literature, Speculative fi...",old major old boar manor farm calls animals fa...,"[[, "", S, a, t, i, r, e, "", ,, , "", C, h, i, ...","[Satire, Children's literature, Speculative fi..."
1,843,A Clockwork Orange,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan...","[Science Fiction, Speculative fiction, Satire,...",alex teenager living near future england leads...,"[[, "", S, c, i, e, n, c, e, , F, i, c, t, i, ...","[Science Fiction, Speculative fiction, Satire,..."
2,986,The Plague,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...,"[Fiction, Novel]",text plague divided five parts town oran thous...,"[[, "", F, i, c, t, i, o, n, "", ,, , "", N, o, ...","[Fiction, Novel]"
3,2080,A Fire Upon the Deep,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...,"[Science Fiction, Speculative fiction, Fantasy...",novel posits space around milky way divided co...,"[[, "", S, c, i, e, n, c, e, , F, i, c, t, i, ...","[Science Fiction, Speculative fiction, Fantasy..."
4,2152,All Quiet on the Western Front,"{""/m/098tmk"": ""War novel"", ""/m/016lj8"": ""Roman...","The book tells the story of Paul Bäumer, a Ge...",[],book tells story paul b umer german soldier ur...,"[[, ]]",[]


In [146]:
y = np.asarray(df2[df2.columns[7:]])
X = df2["clean_summary"]
# initializing TfidfVectorizer 
  
# splitting the data to training and testing data set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
  
# transforming the data
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000)

xtrain_tfidf = tfidf_vectorizer.fit_transform(X_train.values.astype('U'))

xval_tfidf = tfidf_vectorizer.transform(X_test.values.astype('U'))




In [158]:
y_test[11]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0])

In [147]:
lr = LogisticRegression()
clf = OneVsRestClassifier(lr)

# fit model on train data.
clf.fit(xtrain_tfidf, y_train)

# make predictions for validation set.
y_pred_lr = clf.predict(xval_tfidf)


from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
  

#Calculating the accuracy.
print( 'Accuracy Score :',accuracy_score(y_test,y_pred_lr) )

#Printing the classification report.
print ('Report : ')
print(classification_report(y_test,y_pred_lr))


Accuracy Score : 0.13080716324941605
Report : 
              precision    recall  f1-score   support

           0       0.50      0.02      0.03        63
           1       0.00      0.00      0.00        37
           2       0.00      0.00      0.00        34
           3       0.82      0.18      0.30       624
           4       0.00      0.00      0.00        45
           5       0.00      0.00      0.00        25
           6       0.88      0.16      0.27       225
           7       0.89      0.07      0.13       112
           8       0.00      0.00      0.00        38
           9       0.90      0.44      0.59       713
          10       0.55      0.26      0.36      1418
          11       0.00      0.00      0.00        30
          12       0.00      0.00      0.00       120
          13       1.00      0.01      0.03       209
          14       1.00      0.08      0.15       137
          15       0.86      0.23      0.37       432
          16       0.00      0.00 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
