In [1]:
from network import CNetwork

In [2]:
import pandas as pd
import numpy as np

In [3]:
import os

In [4]:
name_dataset = "dataset_1"
dataset_path = "datasetsv2/"
length_cut = 200
random_flag = 15
dimension = 124
feature_selection = 'common_words'

In [5]:
df = pd.read_csv(dataset_path + name_dataset + ".csv")

In [6]:
df.head(5)

Unnamed: 0,label,text,book
0,Pelham Grenville,"\n\n\n\n\n\n\n\nProduced by Christine Gehring,...",Right Ho Jeeves
1,Pelham Grenville,"\n\n\n\n\n\n\n\nProduced by Suzanne L. Shell, ...",Tales Of St Austin
2,Joseph Conrad,And that last\n\nword was the single word of ...,Victory
3,Joseph Conrad,\n\n\n\nIf I have ever had these gifts in any ...,Under Western Eyes
4,Bram Stoker,\n\n\n\n Lond...,The Lady Of The Shroud


In [7]:
from utils.text_processing import get_min_len_corpus

In [8]:
min_len_book = get_min_len_corpus(list(df["text"]))

In [9]:
print("Min Length:", min_len_book)

Min Length: 55024


In [10]:
from utils.text_processing import partition_text, get_process_corpus

In [11]:
texts = list(df['text'])

In [12]:
labels = list(df['label'])

In [13]:
corpus, segmented_corpus, labels = partition_text(texts, labels, length_cut, min_len_book, random_flag)

Max partitions:  275


In [14]:
selected_corpus, words_features, word_index, index_word = get_process_corpus(segmented_corpus, remove_punctuation=True, feature_selection = feature_selection)

In [15]:
wor = [(index_word[w],w) for w in selected_corpus[1]]

In [16]:
wor

 ('me', '96'),
 ('in', '42'),
 ('a', '51'),
 ('motherly', '175'),
 ('voice', '191'),
 ('not', '38'),
 ('to', '120'),
 ('take', '62'),
 ('the', '48'),
 ('stairs', '181'),
 ('too', '182'),
 ('quickly', '178'),
 ('After', '187'),
 ('seven', '162'),
 ('or', '154'),
 ('eight', '130'),
 ('solid', '174'),
 ('meals', '158'),
 ('she', '102'),
 ('said', '161'),
 ('a', '51'),
 ('man', '152'),
 ('of', '100'),
 ('my', '113'),
 ('build', '123'),
 ('ought', '163'),
 ('to', '120'),
 ('be', '190'),
 ('very', '193'),
 ('careful', '189'),
 ('because', '164'),
 ('of', '100'),
 ('the', '48'),
 ('danger', '188'),
 ('of', '100'),
 ('apoplectic', '143'),
 ('fits', '159'),
 ('She', '133'),
 ('said', '161'),
 ('it', '39'),
 ('was', '105'),
 ('the', '48'),
 ('same', '155'),
 ('with', '4'),
 ('dogs', '127'),
 ('When', '126'),
 ('they', '186'),
 ('became', '171'),
 ('very', '193'),
 ('fat', '197'),
 ('and', '115'),
 ('overfed', '185'),
 ('you', '116'),
 ('had', '19'),
 ('to', '120'),
 ('see', '194'),
 ('that', '11

In [17]:
len(words_features)

1

In [18]:
words_features

{'the': 0}

In [19]:
word_index["the"]

'48'

In [20]:
len(selected_corpus)

1170

In [21]:
total_classes = list(set(labels))  ## or author
print("Total classes: {}".format(len(total_classes)))
number_books = (df[df['label'] == total_classes[0]]).shape[0]
print("Total entities for each class in train: {}".format(number_books))
dict_categories = {cat: index for index, cat in enumerate(total_classes)}

Total classes: 13
Total entities for each class in train: 6


In [22]:
y = [dict_categories[y] for y in labels]

In [23]:
total_classes

['Mark Twain',
 'Thomas Hardy',
 'Charles Dickens',
 'Jane Austen',
 'Bram Stoker',
 'Arthur Conan Doyle',
 'Daniel Defoe',
 'Pelham Grenville',
 'Allan Poe',
 'Joseph Conrad',
 'George Eliot',
 'Charles Darwin',
 'Hector Hugh']

In [24]:
input_path = "struc2vec/graph/"+str(length_cut)+"_"+str(dimension)+"/"
output_path = "struc2vec/emb/"+str(length_cut)+"_"+str(dimension)+"/"    

In [25]:
def read_struc2vec(networks, words_features):
    all_network_features = []
    for i, netw in enumerate(networks):
        network_features = [[] for _ in range(len(words_features))]
        with open(output_path+str(i)+".emb", "r") as f:
            lines = f.readlines()
            num_token, dim = lines[0].split()
            for l in lines[1:]:
                row = l.strip().split(" ")
                id_node = row[0]
                if index_word[id_node].lower() in words_features:
                    emb = [float(v) for v in row[1:]]
                    network_features[words_features[index_word[id_node].lower()]] = emb
            network_features = np.array(network_features).flatten()
            print(str(i), "len",len(network_features))
            all_network_features.append(network_features)   
    return np.array(all_network_features)

In [26]:
def struc2vec(operating_system, networks):
    try: 
        os.mkdir(input_path)
        os.mkdir(output_path)
    except:
        print("Existe")
    for i, netw in enumerate(networks):
        with open((input_path+str(i)+".edgelist"), "w") as f:
            v_names = netw.vs["name"]
            for edge in netw.get_edgelist():
                (u,v) = edge
                f.write(v_names[u] +" "+ v_names[v]+'\n')
    for i, netw in enumerate(networks):
        if operating_system == 'linux':
            path_command = "python struc2vec/src/main.py --dimensions "+str(dimension)+" --input "+input_path+str(i)+".edgelist --output "+output_path+str(i)+".emb"
            #path_command = 'python struc2vec/src/graph2vec.py --input-path ' + in_network + ' --output-path ' + extra_file + ' --dimensions ' + str(512)
            os.system(path_command)
            print("struc2vec", str(i),len(netw.vs["name"]))

In [27]:
def get_struc2vec_features(sequences, words_features, index_word):
    nets = []
    for text in sequences:
        obj = CNetwork(text, model=None, index_word=index_word, percentages=None, path="")
        network = obj.create_network()
        nets.append(network)
    #struc2vec("linux", nets)
    X = read_struc2vec(nets, words_features)
    return X

In [28]:
X = get_struc2vec_features(selected_corpus, words_features, index_word)

Nodes: 122 - Edges: 178
Nodes: 118 - Edges: 164
Nodes: 129 - Edges: 169
Nodes: 127 - Edges: 176
Nodes: 107 - Edges: 156
Nodes: 112 - Edges: 169
Nodes: 109 - Edges: 164
Nodes: 101 - Edges: 150
Nodes: 130 - Edges: 174
Nodes: 120 - Edges: 168
Nodes: 114 - Edges: 156
Nodes: 128 - Edges: 172
Nodes: 102 - Edges: 154
Nodes: 107 - Edges: 153
Nodes: 110 - Edges: 155
Nodes: 113 - Edges: 158
Nodes: 111 - Edges: 165
Nodes: 116 - Edges: 175
Nodes: 101 - Edges: 158
Nodes: 104 - Edges: 142
Nodes: 101 - Edges: 142
Nodes: 116 - Edges: 175
Nodes: 106 - Edges: 137
Nodes: 106 - Edges: 142
Nodes: 110 - Edges: 159
Nodes: 125 - Edges: 174
Nodes: 107 - Edges: 157
Nodes: 118 - Edges: 163
Nodes: 112 - Edges: 160
Nodes: 110 - Edges: 159
Nodes: 116 - Edges: 170
Nodes: 122 - Edges: 168
Nodes: 118 - Edges: 166
Nodes: 128 - Edges: 176
Nodes: 116 - Edges: 170
Nodes: 119 - Edges: 170
Nodes: 120 - Edges: 170
Nodes: 119 - Edges: 169
Nodes: 123 - Edges: 170
Nodes: 127 - Edges: 178
Nodes: 118 - Edges: 163
Nodes: 116 - Edg

Nodes: 117 - Edges: 166
Nodes: 105 - Edges: 169
Nodes: 123 - Edges: 171
Nodes: 121 - Edges: 172
Nodes: 115 - Edges: 163
Nodes: 118 - Edges: 174
Nodes: 125 - Edges: 176
Nodes: 117 - Edges: 168
Nodes: 115 - Edges: 175
Nodes: 120 - Edges: 170
Nodes: 118 - Edges: 175
Nodes: 98 - Edges: 148
Nodes: 93 - Edges: 147
Nodes: 95 - Edges: 149
Nodes: 117 - Edges: 164
Nodes: 121 - Edges: 165
Nodes: 118 - Edges: 168
Nodes: 107 - Edges: 154
Nodes: 119 - Edges: 165
Nodes: 110 - Edges: 161
Nodes: 73 - Edges: 117
Nodes: 73 - Edges: 117
Nodes: 117 - Edges: 163
Nodes: 116 - Edges: 162
Nodes: 106 - Edges: 161
Nodes: 114 - Edges: 161
Nodes: 127 - Edges: 171
Nodes: 130 - Edges: 179
Nodes: 126 - Edges: 176
Nodes: 120 - Edges: 174
Nodes: 117 - Edges: 176
Nodes: 130 - Edges: 179
Nodes: 117 - Edges: 168
Nodes: 120 - Edges: 173
Nodes: 119 - Edges: 170
Nodes: 113 - Edges: 164
Nodes: 106 - Edges: 154
Nodes: 117 - Edges: 172
Nodes: 113 - Edges: 162
Nodes: 115 - Edges: 164
Nodes: 115 - Edges: 170
Nodes: 115 - Edges: 1

Nodes: 115 - Edges: 164
Nodes: 117 - Edges: 171
Nodes: 108 - Edges: 166
Nodes: 120 - Edges: 170
Nodes: 99 - Edges: 152
Nodes: 110 - Edges: 163
Nodes: 111 - Edges: 160
Nodes: 113 - Edges: 168
Nodes: 119 - Edges: 165
Nodes: 118 - Edges: 174
Nodes: 118 - Edges: 168
Nodes: 112 - Edges: 173
Nodes: 109 - Edges: 162
Nodes: 122 - Edges: 168
Nodes: 118 - Edges: 170
Nodes: 110 - Edges: 165
Nodes: 105 - Edges: 162
Nodes: 128 - Edges: 174
Nodes: 107 - Edges: 169
Nodes: 112 - Edges: 171
Nodes: 117 - Edges: 168
Nodes: 117 - Edges: 169
Nodes: 125 - Edges: 178
Nodes: 121 - Edges: 169
Nodes: 123 - Edges: 167
Nodes: 125 - Edges: 172
Nodes: 124 - Edges: 172
Nodes: 119 - Edges: 164
Nodes: 119 - Edges: 171
Nodes: 113 - Edges: 169
Nodes: 129 - Edges: 177
Nodes: 117 - Edges: 171
Nodes: 124 - Edges: 177
Nodes: 126 - Edges: 168
Nodes: 114 - Edges: 168
Nodes: 128 - Edges: 170
Nodes: 114 - Edges: 170
Nodes: 111 - Edges: 160
Nodes: 118 - Edges: 160
Nodes: 125 - Edges: 172
Nodes: 108 - Edges: 167
Nodes: 109 - Edge

432 len 0
433 len 124
434 len 0
435 len 124
436 len 124
437 len 0
438 len 0
439 len 0
440 len 0
441 len 0
442 len 0
443 len 0
444 len 0
445 len 0
446 len 0
447 len 124
448 len 124
449 len 0
450 len 0
451 len 0
452 len 0
453 len 0
454 len 0
455 len 0
456 len 0
457 len 0
458 len 0
459 len 0
460 len 0
461 len 0
462 len 0
463 len 0
464 len 0
465 len 0
466 len 0
467 len 124
468 len 124
469 len 0
470 len 0
471 len 0
472 len 0
473 len 0
474 len 0
475 len 0
476 len 0
477 len 0
478 len 0
479 len 0
480 len 0
481 len 0
482 len 0
483 len 0
484 len 0
485 len 0
486 len 0
487 len 0
488 len 0
489 len 0
490 len 0
491 len 0
492 len 0
493 len 124
494 len 0
495 len 0
496 len 0
497 len 0
498 len 0
499 len 0
500 len 124
501 len 0
502 len 0
503 len 0
504 len 0
505 len 0
506 len 0
507 len 0
508 len 0
509 len 0
510 len 0
511 len 0
512 len 0
513 len 0
514 len 0
515 len 0
516 len 0
517 len 0
518 len 0
519 len 0
520 len 0
521 len 0
522 len 0
523 len 0
524 len 124
525 len 0
526 len 0
527 len 0
528 len 124
529 len 

  return np.array(all_network_features)


In [29]:
print("Lenght of features:", X.shape)

Lenght of features: (1170,)


In [30]:
X

array([array([ 0.41279846, -0.0608338 , -0.12076033,  0.17052706, -0.11605065,
       -0.05274131,  0.12131938,  0.21349064, -0.39235717,  0.20845792,
       -0.19599994, -0.00659409,  0.18439856, -0.3809996 ,  0.43798998,
        0.01143217,  0.28755802, -0.06284866, -0.4504198 ,  0.13468446,
        0.03476137,  0.09723938, -0.10732662, -0.18325613,  0.15379958,
        0.11923994,  0.08530959, -0.06752159, -0.33032832, -0.07179426,
        0.25808403,  0.10187919,  0.10099983, -0.20667213,  0.02208275,
        0.09544599,  0.1235816 ,  0.11590873, -0.11671177,  0.14877038,
       -0.11625184, -0.02539472,  0.14873852,  0.20287444, -0.04906128,
       -0.02836693, -0.10774559,  0.2857499 , -0.10352693, -0.16993089,
       -0.05885651, -0.13693398, -0.00411682, -0.3314352 ,  0.08595452,
       -0.01290096,  0.00970929,  0.09846822,  0.01947102,  0.03785102,
        0.02129123, -0.00838204, -0.01792106,  0.18820788, -0.05063286,
        0.1994746 ,  0.18627153, -0.04987981, -0.31966007

In [31]:
len(X[0])

124

# Training with SVM

In [32]:
import classifierv2

In [33]:
obj = classifierv2.Classification(X, y)
scores = obj.classification()

ValueError: setting an array element with a sequence.