In [1]:
from network import CNetwork

In [2]:
import pandas as pd
import numpy as np

In [3]:
from utils import verifyDir

In [4]:
name_dataset = "dataset_1"
dataset_path = "datasetsv2/"
length_cut = 30000
random_flag = True
measures = ["sp"]
feature_selection = 'common_words'

In [5]:
auxiliar_path = 'auxiliar_folder/' + name_dataset   + '/'
verifyDir(auxiliar_path)

In [6]:
df = pd.read_csv(dataset_path + name_dataset + ".csv")

In [7]:
df.head(5)

Unnamed: 0,label,text,book
0,Pelham Grenville,"\n\n\n\n\n\n\n\nProduced by Christine Gehring,...",Right Ho Jeeves
1,Pelham Grenville,"\n\n\n\n\n\n\n\nProduced by Suzanne L. Shell, ...",Tales Of St Austin
2,Joseph Conrad,And that last\n\nword was the single word of ...,Victory
3,Joseph Conrad,\n\n\n\nIf I have ever had these gifts in any ...,Under Western Eyes
4,Bram Stoker,\n\n\n\n Lond...,The Lady Of The Shroud


In [8]:
from utils.text_processing import get_min_len_corpus

In [9]:
print("Min Length:", get_min_len_corpus(list(df["text"])))

Min Length: 55024


In [10]:
from utils.text_processing import get_corpus, get_random_corpus

In [11]:
texts = list(df['text'])

In [12]:
corpus, segmented_corpus = get_corpus(texts, length_cut)

In [13]:
selected_corpus, words_features, word_index, index_word = get_random_corpus(segmented_corpus, remove_punctuation=True, lemmatization_flag=True, feature_selection = feature_selection)

In [14]:
len(words_features)

157

In [15]:
words_features

{'a': 0,
 'go': 1,
 'together': 2,
 'from': 3,
 'very': 4,
 'as': 5,
 'is': 6,
 'might': 7,
 'last': 8,
 'course': 9,
 'over': 10,
 'work': 11,
 'off': 12,
 'two': 13,
 'between': 14,
 'look': 15,
 'man': 16,
 'then': 17,
 'said': 18,
 'brought': 19,
 'both': 20,
 'but': 21,
 'the': 22,
 'like': 23,
 'did': 24,
 'when': 25,
 'open': 26,
 'down': 27,
 'always': 28,
 'well': 29,
 'here': 30,
 'we': 31,
 'they': 32,
 'on': 33,
 'one': 34,
 'more': 35,
 'that': 36,
 'time': 37,
 'many': 38,
 'see': 39,
 'if': 40,
 'out': 41,
 'my': 42,
 'place': 43,
 'it': 44,
 'long': 45,
 'say': 46,
 'even': 47,
 'after': 48,
 'have': 49,
 'of': 50,
 'may': 51,
 'too': 52,
 'head': 53,
 'these': 54,
 'i': 55,
 'made': 56,
 'just': 57,
 'hand': 58,
 'can': 59,
 'up': 60,
 'some': 61,
 'could': 62,
 'much': 63,
 'there': 64,
 'their': 65,
 'he': 66,
 'most': 67,
 'by': 68,
 'in': 69,
 'still': 70,
 'never': 71,
 'close': 72,
 'little': 73,
 'such': 74,
 'know': 75,
 'hope': 76,
 'every': 77,
 'end': 78,
 '

In [16]:
labels = list(df['label'])

In [17]:
total_classes = list(set(labels))  ## or author
print("Total classes: {}".format(len(total_classes)))
number_books = (df[df['label'] == total_classes[0]]).shape[0]
print("Total entities for each class in train: {}".format(number_books))
dict_categories = {cat: index for index, cat in enumerate(total_classes)}

Total classes: 13
Total entities for each class in train: 6


In [18]:
y = [dict_categories[y] for y in labels]

In [19]:
total_classes

['Charles Dickens',
 'Allan Poe',
 'Arthur Conan Doyle',
 'Pelham Grenville',
 'Daniel Defoe',
 'Hector Hugh',
 'George Eliot',
 'Thomas Hardy',
 'Bram Stoker',
 'Mark Twain',
 'Charles Darwin',
 'Jane Austen',
 'Joseph Conrad']

In [20]:
def get_local_features(sequences, word_features, measures):
    all_features_container = []
    for text in sequences:
        obj = CNetwork(text, model=None, index_word=index_word, percentages=None, path="")
        network = obj.create_network()
        local_measure = obj.get_network_measures(network, word_features, word_index, measures)
        all_features_container.append(local_measure)
    return np.array(all_features_container)

In [21]:
X = get_local_features(selected_corpus, words_features, measures)

Nodes: 4031 - Edges: 15460
Len features: 157
[1.88610422 2.43920596 2.72382134 2.35111663 2.41116625 2.63151365
 2.12133995 2.60124069 2.49851117 2.33895782 2.38759305 2.49032258
 2.24689826 2.4439206  2.72605459 2.30545906 2.2662531  2.4751861
 2.21166253 2.92977667 2.95533499 2.32580645 1.91836228 2.29875931
 2.57866005 2.39255583 2.89081886 2.45980149 2.53126551 2.43052109
 2.47741935 2.57270471 2.65657568 2.20818859 2.18188586 2.30124069
 1.98610422 2.23970223 3.05756824 2.43920596 2.29751861 2.22258065
 2.32555831 2.55359801 2.04739454 2.49677419 2.17915633 2.7
 2.46426799 2.25657568 2.05905707 2.62406948 2.71588089 2.36004963
 2.68808933 0.         2.41290323 2.41836228 2.34292804 2.64565757
 2.23399504 2.54739454 2.41513648 2.30248139 2.37245658 2.53176179
 2.35558313 2.52233251 2.40322581 2.06377171 2.54292804 2.5528536
 2.9853598  2.56004963 2.60868486 2.35533499 2.74143921 2.88238213
 2.59950372 2.65186104 2.49131514 2.16674938 2.3528536  2.70744417
 2.88436725 2.2630273  2.4

Nodes: 4101 - Edges: 14931
Len features: 157
[1.92560976 2.42146341 2.71292683 2.38682927 2.54560976 3.74268293
 2.29804878 2.82731707 2.50780488 2.52268293 2.38414634 2.64658537
 2.41365854 2.45       2.79658537 2.42219512 2.31560976 2.35341463
 2.36268293 2.72097561 2.81292683 2.38731707 1.91292683 2.42219512
 2.58292683 2.33902439 2.46682927 2.51682927 2.60878049 2.44390244
 2.57731707 2.8595122  2.53902439 2.2095122  2.26463415 2.44341463
 2.07292683 2.28585366 3.0202439  2.61268293 2.32121951 2.25926829
 2.54292683 2.41926829 2.1302439  2.68170732 2.6404878  2.41512195
 2.40365854 2.42195122 2.07170732 2.79292683 2.83487805 2.66536585
 2.74878049 0.         2.39243902 2.44073171 2.31146341 2.60097561
 2.29609756 2.54317073 2.54682927 2.54804878 2.41341463 2.58853659
 2.2604878  2.69829268 2.28268293 2.1004878  2.67463415 2.71390244
 2.7504878  2.54829268 2.62317073 2.45292683 2.6997561  2.7997561
 2.50560976 2.76341463 2.49536585 2.37170732 2.65756098 2.53609756
 3.0395122  2.5434

Len features: 157
[1.89693654 2.53369803 2.82100656 2.37155361 2.36301969 3.002407
 2.1380744  2.67746171 2.45295405 2.58949672 2.44945295 2.48730853
 2.6595186  2.51487965 2.63544858 2.4868709  2.1809628  2.42603939
 2.3297593  2.72560175 2.71816193 2.24682713 1.90656455 2.43873085
 2.66148796 2.4380744  2.44485777 2.2833698  2.55229759 2.61291028
 2.49562363 2.4297593  2.57045952 2.26805252 2.18971554 2.30765864
 2.05733042 2.35032823 2.58118162 2.33916849 2.37002188 2.32582057
 2.24310722 2.46389497 2.10525164 2.54989059 2.39584245 2.57592998
 2.45820569 2.3238512  2.01466083 2.73129103 2.75908096 2.36936543
 2.62035011 0.         2.46586433 2.52319475 2.21794311 2.55295405
 2.25820569 2.49781182 2.67680525 2.6536105  2.30722101 2.46673961
 2.20262582 2.50656455 2.35820569 2.10897155 2.60328228 2.59343545
 2.78512035 2.53107221 2.49387309 2.37002188 2.63938731 2.63522976
 2.44376368 2.42538293 2.77636761 2.27133479 2.57002188 2.41422319
 2.84026258 2.3547046  2.30634573 2.24945295 2

Nodes: 4875 - Edges: 17300
Len features: 157
[1.91670086 2.43865408 2.71399261 2.28251949 2.41382848 0.
 2.15572425 2.69634797 2.57468199 2.40315962 2.41998359 2.68116537
 2.34366024 2.34407058 2.66270004 2.42039393 2.41875256 2.55211325
 2.35720148 2.68260156 2.62535905 2.21665983 1.8756668  2.46901929
 2.4620435  2.26466968 2.59150595 2.34345507 2.51661879 2.43311449
 2.52379975 2.61181781 2.42983176 2.2133771  2.12310217 2.40151826
 2.03528929 2.2113254  2.65141568 2.39556832 2.32704144 2.20537546
 2.35514977 2.46224867 2.06729586 2.46983997 2.6243332  2.28744358
 2.48440706 2.37771851 2.00369307 2.89659417 2.87094789 2.45055396
 2.58329914 0.         2.38838736 2.34263439 2.29503488 2.50041034
 2.24948707 2.36828067 2.63890029 2.30406237 2.26713172 2.50410341
 2.34263439 2.4848174  2.3073451  2.03569963 2.54390644 2.58658186
 3.12617973 2.43393517 2.40151826 2.34448092 2.73081658 2.76343865
 2.41382848 2.26015593 2.36233073 2.22425113 2.5379565  2.50266721
 2.88572015 2.39146492 2.

Len features: 157
[1.98521607 3.1853677  2.52198635 2.21569371 2.59097801 0.
 2.13874147 2.94313874 2.79150872 2.71948446 2.73237301 3.38968916
 3.09666414 2.29871114 2.56103108 2.78316907 3.23388931 2.62509477
 2.77786202 2.62812737 2.42949204 2.20090978 1.89651251 2.65163002
 2.56254738 2.46626232 2.69825625 2.62433662 2.86050038 2.73351024
 2.64177407 2.60386657 2.49886277 2.31993935 2.21114481 2.40788476
 2.16451857 2.42911296 2.39916603 2.52084913 2.49431387 2.59059894
 2.53146323 2.92115239 2.25966641 2.41736164 2.89499621 2.44541319
 2.53980288 2.32941622 2.01781653 2.55458681 3.05155421 2.74791509
 2.37793783 0.         2.53222138 2.89499621 2.78771797 2.72062168
 2.61144807 2.36959818 2.8510235  2.48862775 2.72365428 2.31387415
 2.64480667 2.44162244 2.23502654 2.06368461 3.11220622 2.67134193
 2.48597422 2.58339651 2.55307051 2.77824109 0.         2.86201668
 2.6535254  2.47308567 2.59438969 2.34116755 2.86580743 2.45716452
 2.8570887  2.60007582 3.01933283 2.32145565 2.47877

[1.95597281 3.29394626 2.94205244 2.1764325  2.30592425 0.
 2.10229848 2.82680479 2.80317255 2.7122046  2.72677242 2.7122046
 2.71026222 2.34703788 2.48850761 3.09550016 2.68695371 2.79087083
 2.61379087 3.12107478 2.46519909 2.26092587 1.8533506  2.61929427
 3.02913564 2.56943995 2.73065717 2.84202007 3.16477825 2.84363872
 2.75623179 2.57785691 2.52088054 2.26772418 2.28196827 2.65231466
 2.14988669 2.59889932 2.42764649 2.68857235 2.61994173 2.64098414
 3.07186792 2.35642603 2.08352218 2.68145031 2.56167044 2.407899
 2.77662674 2.30042085 1.94140499 2.72062156 3.23438006 2.63774684
 2.36516672 2.98025251 2.67594691 2.73292328 2.77500809 2.81158951
 2.66494011 2.31919715 2.89057948 2.83522175 2.34606669 2.49401101
 2.59307219 2.42084817 2.31693105 2.08643574 2.98802201 3.43962447
 2.67335707 2.52962124 2.78569116 2.80317255 2.92165749 2.81158951
 2.87406928 2.51893817 2.80867595 2.30689544 3.07445775 2.22434445
 3.27549369 2.66494011 2.64972483 2.40854646 2.81191324 2.91226934
 2.953

Len features: 157
[1.85229415 2.3566939  2.4512885  2.1920176  2.15964802 0.
 2.22690132 2.5779384  2.31112508 2.52388435 2.34286612 2.44594595
 2.31018228 2.3548083  2.52294155 2.23067253 2.24261471 2.50534255
 2.2027027  2.48240101 2.26492772 2.05939661 1.80672533 2.28252671
 2.43400377 2.21181647 2.64707731 2.34318039 2.53456945 2.34632307
 2.65713388 2.41797612 2.28001257 2.14896292 2.1433061  2.17630421
 2.01099937 2.16341923 2.27592709 2.3752357  2.31552483 2.20364551
 2.37115022 2.43054683 1.98020113 2.22438718 2.31238215 2.32118165
 2.43840352 2.19264613 1.83438089 2.58045255 2.287555   2.33092395
 2.44626021 0.         2.22438718 2.4230044  2.42143306 2.54902577
 2.24104337 2.34160905 2.34758014 2.19264613 2.16247643 2.28032684
 2.18541798 2.28881207 2.09616593 1.92991829 2.24230044 2.39974859
 2.76052797 2.29384035 2.22721559 2.33815211 2.2787555  2.5301697
 2.4522313  2.31301069 2.30138278 2.36800754 2.46134507 2.45851666
 2.25298554 2.33752357 2.24638592 2.10465116 2.315839

Len features: 157
[1.93066615 2.49116333 2.50864245 2.3239464  2.40512721 3.18858031
 2.18955137 2.54165857 2.41619732 2.45892406 2.34103709 2.44066809
 2.57428627 2.46785784 2.79685376 2.54534861 2.30005826 2.28646339
 2.38570596 2.73956108 2.79161002 2.2196543  1.88094776 2.45814721
 2.60167023 2.33715285 2.56282773 2.32472325 2.36880948 2.40745776
 2.38337541 2.36725578 2.33676442 2.2041173  2.19227034 2.4765974
 2.04253253 2.22936493 2.43231695 2.30297145 2.41056516 2.25752573
 2.38395805 2.2819965  2.10351525 2.39735871 2.45348611 2.42144106
 2.52631579 2.28296757 1.9906778  2.59312488 2.7461643  2.30743834
 2.41134201 0.         2.31501262 2.51796465 2.32841328 2.77820936
 2.23169547 2.31442999 2.63371528 2.66653719 2.33054962 2.33657021
 2.38376384 2.41775102 2.27908332 2.05573898 2.61604195 2.3831812
 2.89978637 2.34045446 2.34239658 2.42144106 2.53486114 2.47096524
 2.36531365 2.47290736 2.47737425 2.27539328 2.58166634 2.55098077
 2.51602253 2.51815887 2.42882113 2.34861138 2

Nodes: 3387 - Edges: 14584
Len features: 157
[1.88570585 2.51004135 2.47578263 2.20998228 2.46839929 0.
 2.22179563 2.64500886 2.34524513 2.97932664 2.30714708 2.49113999
 2.36444182 2.31305375 2.48139398 2.42232723 2.22031896 2.3021264
 2.33933845 2.81925576 2.47991731 2.20850561 1.79119905 2.34170112
 2.46692262 2.27082103 2.44063792 2.20023627 2.77288836 2.46042528
 2.58033077 2.36355582 2.21529829 2.17897224 2.18281158 2.25103367
 2.05995275 2.49970467 2.76668636 2.25162434 2.29710573 2.16804489
 2.37093916 2.2714117  2.05197874 2.34819846 2.51240402 2.54164206
 2.45126994 2.33195511 2.01388069 2.38157118 2.54932073 2.29238039
 2.47844064 0.         2.36030715 2.92025989 2.2223863  2.56408742
 2.21145895 2.49113999 2.46160662 2.50679268 2.25280567 2.2250443
 2.18428825 2.58357944 2.27347903 2.03544005 2.46574129 2.52864737
 2.70821028 2.22297696 2.56733609 2.56024808 2.46603662 2.48907265
 2.45776728 2.68340224 2.36355582 2.37625517 2.6024808  2.81866509
 2.52096869 2.48316598 2.36

Len features: 157
[1.87391182 2.33754563 2.48076383 2.28980623 2.21145746 0.
 2.09182814 2.56529065 2.30103903 2.62650941 2.45071609 2.46953103
 2.43414771 2.41814097 2.67031733 2.44369559 2.34568941 2.41561359
 2.21258074 2.46559955 2.37910699 2.15080034 1.87896658 2.36647009
 2.39399045 2.29008705 2.63998877 2.31844987 2.43358607 2.33277169
 2.38612749 2.41561359 2.31002527 2.15332772 2.20471778 2.28418983
 2.02780118 2.22942994 2.44818871 2.31957315 2.31451839 2.27801179
 2.33894973 2.35383319 2.03875316 2.38725077 2.41055883 2.40887391
 2.46559955 2.30665543 1.91126088 2.58522887 2.35832631 2.30244313
 2.68351587 0.         2.35186745 2.42993541 2.41645605 2.58045493
 2.23223814 2.30553215 2.44397641 2.25835439 2.28447065 2.28025835
 2.25526537 2.39623701 2.10053356 2.0322943  2.65065993 2.58157821
 2.54787981 2.36394271 2.32912103 2.28559393 2.32350463 2.43583263
 2.68463915 2.36141533 2.42740803 2.25217635 2.74052233 2.48048301
 2.35888795 2.34372367 2.27351867 2.10418422 2.45183

Len features: 157
[1.95888889 2.54592593 2.69925926 2.43388889 2.28240741 0.
 2.15074074 2.74851852 2.77222222 2.46648148 2.50314815 2.49092593
 2.67740741 2.44333333 2.71851852 2.61351852 2.41611111 2.33907407
 2.40111111 2.70592593 2.91462963 2.31555556 1.89981481 2.47666667
 2.67685185 2.36814815 2.99648148 2.54148148 2.64092593 2.57333333
 2.47092593 2.62833333 2.57555556 2.34148148 2.26222222 2.33592593
 2.11018519 2.39037037 2.6437037  2.54462963 2.47574074 2.31
 2.30018519 2.71962963 2.10148148 2.49592593 2.32907407 2.37074074
 2.48851852 2.30407407 2.0187037  2.64611111 2.54166667 2.39166667
 2.42851852 3.41333333 2.38185185 2.59037037 2.34166667 2.73166667
 2.34259259 2.47925926 2.56351852 2.67018519 2.50148148 2.50462963
 2.41962963 2.48277778 2.22814815 2.10351852 2.63611111 2.60296296
 2.78351852 2.37777778 2.55462963 2.52222222 2.75462963 2.55277778
 2.41722222 2.51148148 2.70796296 2.31148148 2.68185185 2.61407407
 2.71259259 2.51055556 2.48759259 2.26037037 2.71462963 2.

Len features: 157
[1.88075244 2.27141418 2.38293584 2.3191132  2.26268055 0.
 2.3228082  2.38931811 2.34800134 2.37252267 2.37319449 2.32079274
 2.43701713 2.23715149 2.59791737 2.53443063 2.32515956 2.18743702
 2.23144105 2.35136043 2.54887471 2.02015452 1.83909976 2.36345314
 2.41215989 2.20725563 2.87033927 2.33993954 2.58011421 2.28653006
 2.42626806 2.15619751 2.24655694 2.17568021 2.1555257  2.29425596
 1.99798455 2.15854887 2.36278132 2.35505542 2.22707424 2.25394693
 2.25092375 2.25629829 2.00167954 2.39166947 2.33993954 2.36614041
 2.27510917 2.2099429  1.87604971 2.83003023 2.4084649  2.31441048
 2.36882768 0.         2.19751428 2.41585489 2.24386967 2.77628485
 2.23681559 2.21632516 2.29727914 2.25461874 2.23177696 2.21430971
 2.17400067 2.34464226 2.15854887 2.04803493 2.62781323 2.51864293
 2.72657037 2.1827343  2.25260329 2.26872691 2.69062815 2.48303661
 2.58548875 2.27981189 2.24689285 2.19348337 2.7819953  2.36244541
 2.64091367 2.21598925 2.25730601 1.98387639 2.28921

Len features: 157
[1.89386085 2.50286494 2.43055935 2.23819918 2.38417462 0.
 2.20491132 2.5154161  2.39099591 3.19208731 2.28540246 2.26957708
 2.52087312 2.2526603  2.53587995 2.37953615 2.0952251  2.30122783
 2.28240109 2.6100955  2.78663029 2.17462483 1.80463847 2.38690314
 2.54215553 2.24502046 2.42755798 2.25402456 2.34652115 2.36425648
 2.36371078 2.37571623 2.3781719  2.20218281 2.15443383 2.24474761
 1.98444748 2.23765348 2.44529332 2.32032742 2.3675307  2.23601637
 2.27175989 2.29686221 2.00109141 2.31705321 2.4207367  2.36671214
 2.58062756 2.23901774 1.95934516 2.45593452 2.55252387 2.32933151
 2.44229195 0.         2.24420191 2.41500682 2.12332879 2.69085948
 2.23219645 2.39536153 2.50204638 2.44802183 2.26493861 2.36343793
 2.19536153 2.45811733 2.22783083 1.99809004 2.53833561 2.37380628
 2.84747613 2.35634379 2.47039563 2.24447476 2.80354707 2.49222374
 2.43028649 2.66657572 2.33424284 2.36698499 2.4521146  2.40900409
 2.63028649 2.24938608 2.31814461 2.07175989 2.39208

Len features: 157
[1.9242743  2.3087926  2.59991586 2.30185107 2.4623475  0.
 2.18742112 2.49621371 2.30227177 2.33382415 2.35717291 2.36895246
 2.38430795 2.35738326 2.66891039 2.2951199  2.26651241 2.3083719
 2.26882625 2.76419857 2.60580564 2.22738746 1.86390408 2.42974337
 2.42785023 2.26398822 3.04985276 2.2681952  2.49936895 2.53702146
 2.49200673 2.47812369 2.41522928 2.23348759 2.16070677 2.33887253
 2.03681111 2.18636937 2.5778292  2.34896929 2.36159024 2.21266302
 2.27324358 2.22191838 2.02208666 2.40008414 2.40849811 2.49453092
 2.40534287 2.28838873 2.03975599 2.75410181 2.57425326 2.38893563
 2.5765671  0.         2.26356752 2.27850231 2.32751367 2.3207825
 2.22044594 2.45877156 2.49474127 2.53996634 2.18826252 2.39419436
 2.35506942 2.49705511 2.20782499 2.08119478 2.42175011 2.58245688
 2.56037021 2.40197728 2.43815734 2.38241481 3.03050063 2.6659655
 2.48317207 2.38010097 2.37421119 2.24758098 2.64745477 2.55847707
 2.56163231 2.34791754 2.52881784 2.16891039 2.54480438

In [22]:
print("Lenght of features:", len(X[0]))

Lenght of features: 157


# Normalize data

In [23]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(with_mean=True, with_std=True)

In [24]:
X = scaler.fit_transform(X)

# Classification

In [25]:
import classifierv2

In [26]:
obj = classifierv2.Classification(X, y)
scores = obj.classification()

Score 0.375
Score 0.8125
Score 0.4375
Score 1.0


# sin lemmatizar
Score 0.4375
Score 0.6875
Score 0.5
Score 0.75


# lematizando
Score 0.375
Score 0.8125
Score 0.4375
Score 1.0