In [112]:
import numpy as np
import sys; sys.path.append('../../src/helpers')
import pandas as pd
from data_manipulation import custom_transformers as ct
from sklearn.preprocessing import MultiLabelBinarizer as MLB
import ast
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score
import csv

In [2]:
books = pd.read_csv('../../data/booksummaries/booksummaries_ready2train.csv')

In [3]:
books = books[['bookGenre', 'plotSum2vec', 'plotSumLDA']]

In [4]:
books.head(2)

Unnamed: 0,bookGenre,plotSum2vec,plotSumLDA
0,"['comedy', 'children', 'speculative_fiction', ...","[0.200577126220817, -0.07933575619857114, -0.1...","[[0.10461494554710699, 0.008884378461627089, 0..."
1,"['science_fiction', 'fiction', 'speculative_fi...","[0.18350660521234127, 0.06064006807343155, -0....","[[0.28507211845525116, 0.02644368505441909, 0...."


In [5]:
genres = books['bookGenre'].to_list()

In [6]:
genres_list = [ast.literal_eval(x) for x in genres]

In [67]:
genres_list[0]

['comedy', 'children', 'speculative_fiction', 'fiction']

In [8]:
mlb = MLB()

In [9]:
mlb_Genres = mlb.fit_transform(y=genres_list)

In [10]:
Y = pd.DataFrame(mlb_Genres, columns=mlb.classes_)

In [11]:
Y.head(3)

Unnamed: 0,absurdist_fiction,adventure,anthol_biog_autobiog,anthropology,anti-nuclear,anti-war,chick_lit,children,comedy,conspiracy,...,science_fiction,speculative_fiction,sports,steampunk,suspense/thriller/spy,tragicomedy,true_crime,war,western,young_adult
0,0,0,0,0,0,0,0,1,1,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,1,1,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [110]:
category_names = list(Y.columns)

In [119]:
with open('category_names.csv', 'w') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(category_names)

In [126]:
with open('category_names.csv', 'r') as csvFile:
    reader = csv.reader(csvFile)
    category_names_list = list(reader)
category_names_list = category_names_list[0]
category_names_list

['absurdist_fiction',
 'adventure',
 'anthol_biog_autobiog',
 'anthropology',
 'anti-nuclear',
 'anti-war',
 'chick_lit',
 'children',
 'comedy',
 'conspiracy',
 'conspiracy_fiction',
 'cookbook',
 'creative_nonfiction',
 'crime_fiction',
 'drama',
 'existential_philosophy',
 'fantasy',
 'fiction',
 'gay_themed',
 'historical_fiction',
 'horror',
 'indian_chick_lit',
 'informational',
 'lgbt',
 'non_fiction_lit',
 'popular_culture',
 'pornography',
 'realistic_fiction',
 'religious',
 'romance',
 'science_fiction',
 'speculative_fiction',
 'sports',
 'steampunk',
 'suspense/thriller/spy',
 'tragicomedy',
 'true_crime',
 'war',
 'western',
 'young_adult']

In [115]:
pd.read_csv('category_names.csv')

Unnamed: 0,absurdist_fiction,adventure,anthol_biog_autobiog,anthropology,anti-nuclear,anti-war,chick_lit,children,comedy,conspiracy,...,science_fiction,speculative_fiction,sports,steampunk,suspense/thriller/spy,tragicomedy,true_crime,war,western,young_adult


In [12]:
books.head(2)

Unnamed: 0,bookGenre,plotSum2vec,plotSumLDA
0,"['comedy', 'children', 'speculative_fiction', ...","[0.200577126220817, -0.07933575619857114, -0.1...","[[0.10461494554710699, 0.008884378461627089, 0..."
1,"['science_fiction', 'fiction', 'speculative_fi...","[0.18350660521234127, 0.06064006807343155, -0....","[[0.28507211845525116, 0.02644368505441909, 0...."


In [13]:
plotSumVecs = books['plotSum2vec']

In [14]:
plotSumVecs_list = [ast.literal_eval(x) for x in plotSumVecs]

In [15]:
plotSumVecs_DF = pd.DataFrame(data=plotSumVecs_list)

In [16]:
plotSumVecs_DF.shape

(12671, 50)

In [17]:
plotSumLDA_vecs = books['plotSumLDA']

In [18]:
plotSumLDA_list = [ast.literal_eval(x) for x in plotSumLDA_vecs]

In [19]:
plotSumLDA_list = [innerList[0] for innerList in plotSumLDA_list]

In [20]:
plotSumLDA_DF = pd.DataFrame(data=plotSumLDA_list)

In [35]:
newColsWord2vec = []
for columnIndex in range(len(plotSumVecs_DF.columns)):
    newColsWord2vec.append('word2vec_'+str(columnIndex+1))
plotSumVecs_DF.columns = newColsWord2vec

plotSumVecs_DF.head(3)

Unnamed: 0,word2vec_1,word2vec_2,word2vec_3,word2vec_4,word2vec_5,word2vec_6,word2vec_7,word2vec_8,word2vec_9,word2vec_10,...,word2vec_41,word2vec_42,word2vec_43,word2vec_44,word2vec_45,word2vec_46,word2vec_47,word2vec_48,word2vec_49,word2vec_50
0,0.200577,-0.079336,-0.147509,-0.320832,0.305592,0.252995,-0.167397,-0.163471,0.100141,-0.014162,...,-0.211084,-0.093839,0.060648,0.248629,0.178152,0.060955,-0.150804,-0.203166,-0.022193,-0.015207
1,0.183507,0.06064,-0.132822,-0.262741,0.23078,0.184257,-0.19481,0.113546,-0.122146,0.155339,...,-0.155676,0.094734,-0.066658,0.097976,0.218821,-0.048189,-0.050164,-0.193672,-0.010194,-0.013911
2,0.437469,0.004273,-0.097058,-0.315284,0.176915,0.101175,-0.154972,0.181951,0.03247,-0.025964,...,-0.280644,0.048311,0.112386,0.077253,0.133229,0.058216,-0.131414,-0.136893,0.014812,-0.055796


In [72]:
newColsLDA = []
for columnIndex in range(len(plotSumLDA_DF.columns)):
    newColsLDA.append('LDA_'+str(columnIndex+1))
plotSumLDA_DF.columns = newColsLDA

plotSumLDA_DF.head(3)

Unnamed: 0,LDA_1,LDA_2,LDA_3,LDA_4,LDA_5,LDA_6,LDA_7,LDA_8,LDA_9,LDA_10,LDA_11,LDA_12,LDA_13,LDA_14,LDA_15,LDA_16,LDA_17,LDA_18,LDA_19,LDA_20
0,0.104615,0.008884,0.000105,0.039394,0.000105,0.000105,0.000105,0.000105,0.222072,0.000105,0.028772,0.000105,0.000105,0.153764,0.000105,0.105744,0.06467,0.270936,0.000105,0.000105
1,0.285072,0.026444,0.03129,0.000119,0.000119,0.000119,0.025189,0.000119,0.280084,0.000119,0.229332,0.000119,0.000119,0.000119,0.000119,0.121041,0.000119,0.000119,0.000119,0.000119
2,9.4e-05,0.211537,0.059044,0.011153,9.4e-05,9.4e-05,9.4e-05,0.005053,0.179821,9.4e-05,0.223022,9.4e-05,9.4e-05,9.4e-05,9.4e-05,0.097433,0.211813,9.4e-05,9.4e-05,9.4e-05


In [40]:
X = pd.concat([plotSumLDA_DF,plotSumVecs_DF], axis=1)

In [41]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

In [103]:
rfc = OneVsRestClassifier(RandomForestClassifier(bootstrap=True, random_state=541, n_estimators=100, max_features='sqrt'))
#rfc.fit(X_train, Y_train)
#Y_pred = rfc.predict(X_test)

In [104]:
categories = Y.columns
for category in categories:
    print('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    rfc.fit(X_train, Y_train[category])
    
    # calculating test accuracy
    prediction = rfc.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(Y_test[category], prediction)))
    print('F1 score is {}'.format(f1_score(Y_test[category], prediction)))
    print('Recall score is {}'.format(recall_score(Y_test[category], prediction)))
    print("\n")

**Processing absurdist_fiction comments...**
Test accuracy is 0.9977904040404041
F1 score is 0.0
Recall score is 0.0


**Processing adventure comments...**


  'precision', 'predicted', average, warn_for)


Test accuracy is 0.9750631313131313
F1 score is 0.0
Recall score is 0.0


**Processing anthol_biog_autobiog comments...**


  'precision', 'predicted', average, warn_for)


Test accuracy is 0.9775883838383839
F1 score is 0.0273972602739726
Recall score is 0.013888888888888888


**Processing anthropology comments...**
Test accuracy is 1.0
F1 score is 0.0
Recall score is 0.0


**Processing anti-nuclear comments...**
Test accuracy is 1.0
F1 score is 0.0
Recall score is 0.0


**Processing anti-war comments...**
Test accuracy is 1.0
F1 score is 0.0
Recall score is 0.0


**Processing chick_lit comments...**
Test accuracy is 0.9990530303030303
F1 score is 0.0
Recall score is 0.0


**Processing children comments...**


  'precision', 'predicted', average, warn_for)


Test accuracy is 0.8519570707070707
F1 score is 0.2883156297420334
Recall score is 0.17790262172284643


**Processing comedy comments...**
Test accuracy is 0.9756944444444444
F1 score is 0.02531645569620253
Recall score is 0.01282051282051282


**Processing conspiracy comments...**
Test accuracy is 1.0
F1 score is 0.0
Recall score is 0.0


**Processing conspiracy_fiction comments...**
Test accuracy is 0.9993686868686869
F1 score is 0.0
Recall score is 0.0


**Processing cookbook comments...**


  'precision', 'predicted', average, warn_for)


Test accuracy is 0.9996843434343434
F1 score is 0.0
Recall score is 0.0


**Processing creative_nonfiction comments...**


  'precision', 'predicted', average, warn_for)


Test accuracy is 1.0
F1 score is 0.0
Recall score is 0.0


**Processing crime_fiction comments...**
Test accuracy is 0.9037247474747475
F1 score is 0.5906040268456375
Recall score is 0.4772234273318872


**Processing drama comments...**
Test accuracy is 0.9971590909090909
F1 score is 0.0
Recall score is 0.0


**Processing existential_philosophy comments...**


  'precision', 'predicted', average, warn_for)


Test accuracy is 0.9965277777777778
F1 score is 0.0
Recall score is 0.0


**Processing fantasy comments...**


  'precision', 'predicted', average, warn_for)


Test accuracy is 0.8753156565656566
F1 score is 0.6153846153846154
Recall score is 0.48244274809160304


**Processing fiction comments...**
Test accuracy is 0.6521464646464646
F1 score is 0.6862186788154897
Recall score is 0.7034442498540572


**Processing gay_themed comments...**
Test accuracy is 1.0
F1 score is 0.0
Recall score is 0.0


**Processing historical_fiction comments...**
Test accuracy is 0.9715909090909091
F1 score is 0.0
Recall score is 0.0


**Processing horror comments...**


  'precision', 'predicted', average, warn_for)


Test accuracy is 0.9548611111111112
F1 score is 0.14371257485029942
Recall score is 0.07741935483870968


**Processing indian_chick_lit comments...**
Test accuracy is 1.0
F1 score is 0.0
Recall score is 0.0


**Processing informational comments...**
Test accuracy is 0.9564393939393939
F1 score is 0.028169014084507043
Recall score is 0.014285714285714285


**Processing lgbt comments...**
Test accuracy is 1.0
F1 score is 0.0
Recall score is 0.0


**Processing non_fiction_lit comments...**
Test accuracy is 0.9861111111111112
F1 score is 0.2903225806451613
Recall score is 0.17307692307692307


**Processing popular_culture comments...**
Test accuracy is 1.0
F1 score is 0.0
Recall score is 0.0


**Processing pornography comments...**
Test accuracy is 1.0
F1 score is 0.0
Recall score is 0.0


**Processing realistic_fiction comments...**
Test accuracy is 0.9990530303030303
F1 score is 0.0
Recall score is 0.0


**Processing religious comments...**


  'precision', 'predicted', average, warn_for)


Test accuracy is 0.9981060606060606
F1 score is 0.0
Recall score is 0.0


**Processing romance comments...**


  'precision', 'predicted', average, warn_for)


Test accuracy is 0.9917929292929293
F1 score is 0.0
Recall score is 0.0


**Processing science_fiction comments...**


  'precision', 'predicted', average, warn_for)


Test accuracy is 0.865530303030303
F1 score is 0.6586538461538461
Recall score is 0.5561569688768606


**Processing speculative_fiction comments...**
Test accuracy is 0.7575757575757576
F1 score is 0.5862068965517242
Recall score is 0.5088868101028999


**Processing sports comments...**
Test accuracy is 0.9987373737373737
F1 score is 0.0
Recall score is 0.0


**Processing steampunk comments...**


  'precision', 'predicted', average, warn_for)


Test accuracy is 0.9946338383838383
F1 score is 0.0
Recall score is 0.0


**Processing suspense/thriller/spy comments...**


  'precision', 'predicted', average, warn_for)


Test accuracy is 0.8977272727272727
F1 score is 0.1336898395721925
Recall score is 0.0748502994011976


**Processing tragicomedy comments...**
Test accuracy is 0.9990530303030303
F1 score is 0.0
Recall score is 0.0


**Processing true_crime comments...**


  'precision', 'predicted', average, warn_for)


Test accuracy is 0.9974747474747475
F1 score is 0.0
Recall score is 0.0


**Processing war comments...**


  'precision', 'predicted', average, warn_for)


Test accuracy is 0.9993686868686869
F1 score is 0.0
Recall score is 0.0


**Processing western comments...**


  'precision', 'predicted', average, warn_for)


Test accuracy is 0.9949494949494949
F1 score is 0.0
Recall score is 0.0


**Processing young_adult comments...**


  'precision', 'predicted', average, warn_for)


Test accuracy is 0.9232954545454546
F1 score is 0.00816326530612245
Recall score is 0.00411522633744856




In [80]:
np.set_printoptions(threshold=sys.maxsize)

In [83]:
Y_pred.shape

(3168, 40)

In [69]:
genres = []
for genreList in genres_list:
    for genre in genreList:
        genres.append(genre)
genres = set(genres)
genres

{'absurdist_fiction',
 'adventure',
 'anthol_biog_autobiog',
 'anthropology',
 'anti-nuclear',
 'anti-war',
 'chick_lit',
 'children',
 'comedy',
 'conspiracy',
 'conspiracy_fiction',
 'cookbook',
 'creative_nonfiction',
 'crime_fiction',
 'drama',
 'existential_philosophy',
 'fantasy',
 'fiction',
 'gay_themed',
 'historical_fiction',
 'horror',
 'indian_chick_lit',
 'informational',
 'lgbt',
 'non_fiction_lit',
 'popular_culture',
 'pornography',
 'realistic_fiction',
 'religious',
 'romance',
 'science_fiction',
 'speculative_fiction',
 'sports',
 'steampunk',
 'suspense/thriller/spy',
 'tragicomedy',
 'true_crime',
 'war',
 'western',
 'young_adult'}