In [1]:
from network import CNetwork

In [2]:
import pandas as pd
import numpy as np

In [3]:
import os

In [4]:
name_dataset = "dataset_1"
dataset_path = "datasetsv2/"
length_cut = 2000
random_flag = True
dimension = 124
feature_selection = 'common_words'

In [5]:
df = pd.read_csv(dataset_path + name_dataset + ".csv")

In [6]:
df.head(5)

Unnamed: 0,label,text,book
0,Pelham Grenville,"\n\n\n\n\n\n\n\nProduced by Christine Gehring,...",Right Ho Jeeves
1,Pelham Grenville,"\n\n\n\n\n\n\n\nProduced by Suzanne L. Shell, ...",Tales Of St Austin
2,Joseph Conrad,And that last\n\nword was the single word of ...,Victory
3,Joseph Conrad,\n\n\n\nIf I have ever had these gifts in any ...,Under Western Eyes
4,Bram Stoker,\n\n\n\n Lond...,The Lady Of The Shroud


In [7]:
from utils.text_processing import get_min_len_corpus

In [8]:
print("Min Length:", get_min_len_corpus(list(df["text"])))

Min Length: 55024


In [9]:
from utils.text_processing import get_corpus, get_random_corpus

In [10]:
texts = list(df['text'])

In [11]:
corpus, segmented_corpus = get_corpus(texts, length_cut)

In [12]:
selected_corpus, words_features, word_index, index_word = get_random_corpus(segmented_corpus, remove_punctuation=True, feature_selection = feature_selection)

In [13]:
wor = [(index_word[w],w) for w in selected_corpus[42]]

In [14]:
wor

[('in', '350'),
 ('particular', '3015'),
 ('espousing', '10358'),
 ('her', '348'),
 ('own', '1274'),
 ('unsufferable', '10346'),
 ('levity', '10347'),
 ('that', '113'),
 ('there', '565'),
 ('was', '77'),
 ('indeed', '1703'),
 ('no', '223'),
 ('possibility', '7945'),
 ('of', '210'),
 ('our', '1260'),
 ('coming', '831'),
 ('afterwards', '2905'),
 ('to', '401'),
 ('any', '571'),
 ('terms', '1600'),
 ('again', '299'),
 ('However', '178'),
 ('I', '400'),
 ('kept', '1198'),
 ('a', '630'),
 ('couple', '3549'),
 ('of', '210'),
 ('trusty', '10360'),
 ('agents', '8249'),
 ('so', '552'),
 ('near', '2573'),
 ('her', '348'),
 ('that', '113'),
 ('I', '400'),
 ('failed', '693'),
 ('not', '132'),
 ('to', '401'),
 ('have', '108'),
 ('a', '630'),
 ('full', '752'),
 ('account', '1137'),
 ('of', '210'),
 ('her', '348'),
 ('conduct', '890'),
 ('though', '1500'),
 ('I', '400'),
 ('never', '580'),
 ('let', '1638'),
 ('her', '348'),
 ('know', '237'),
 ('anything', '163'),
 ('of', '210'),
 ('me', '114'),
 ('bu

In [15]:
len(words_features)

22

In [16]:
words_features

{'of': 0,
 'one': 1,
 'an': 2,
 'this': 3,
 'from': 4,
 'more': 5,
 'and': 6,
 'for': 7,
 'the': 8,
 'as': 9,
 'a': 10,
 'but': 11,
 'be': 12,
 'with': 13,
 'so': 14,
 'in': 15,
 'at': 16,
 'have': 17,
 'that': 18,
 'it': 19,
 'to': 20,
 'all': 21}

In [17]:
len(selected_corpus)

78

In [18]:
labels = list(df['label'])

In [19]:
total_classes = list(set(labels))  ## or author
print("Total classes: {}".format(len(total_classes)))
number_books = (df[df['label'] == total_classes[0]]).shape[0]
print("Total entities for each class in train: {}".format(number_books))
dict_categories = {cat: index for index, cat in enumerate(total_classes)}

Total classes: 13
Total entities for each class in train: 6


In [20]:
y = [dict_categories[y] for y in labels]

In [21]:
total_classes

['Bram Stoker',
 'Charles Dickens',
 'Thomas Hardy',
 'Mark Twain',
 'George Eliot',
 'Daniel Defoe',
 'Allan Poe',
 'Joseph Conrad',
 'Jane Austen',
 'Arthur Conan Doyle',
 'Charles Darwin',
 'Hector Hugh',
 'Pelham Grenville']

In [22]:
def struc2vec(operating_system, networks):
    input_path = "struc2vec/graph/"
    output_path = 'struc2vec/emb/'
    try: 
        os.mkdir(input_path)
        os.mkdir(output_path)
    except:
        print("Existe")
    for i, netw in enumerate(networks):
        with open((input_path+str(length_cut)+"_"+str(dimension)+"_"+str(i)+".edgelist"), "w") as f:
            v_names = netw.vs["name"]
            for edge in netw.get_edgelist():
                (u,v) = edge
                f.write(v_names[u] +" "+ v_names[v]+'\n')
    for i, netw in enumerate(networks):
        if operating_system == 'linux':
            path_command = "python struc2vec/src/main.py --dimensions "+str(dimension)+" --input "+input_path+str(length_cut)+"_"+str(dimension)+"_"+str(i)+".edgelist --output "+output_path+str(length_cut)+"_"+str(dimension)+"_"+str(i)+".emb"
            #path_command = 'python struc2vec/src/graph2vec.py --input-path ' + in_network + ' --output-path ' + extra_file + ' --dimensions ' + str(512)
            os.system(path_command)
            print("struc2vec", str(i))

In [23]:
def read_struc2vec(output_path, networks, words_features):
    all_network_features = [] 
    for i, netw in enumerate(networks):
        network_features = [[] for _ in range(len(words_features))]
        with open(output_path+str(length_cut)+"_"+str(dimension)+"_"+str(i)+".emb", "r") as f:
            lines = f.readlines()
            num_token, dim = lines[0].split()
            for l in lines[1:]:
                row = l.strip().split(" ")
                id_node = row[0]
                if index_word[id_node].lower() in words_features:
                    emb = [float(v) for v in row[1:]]
                    network_features[words_features[index_word[id_node].lower()]] = emb
        network_features = np.array(network_features).flatten()
        print(str(i), "len",len(network_features))
        all_network_features.append(network_features)
    return np.array(all_network_features)

In [24]:
def get_struc2vec_features(sequences, words_features, index_word):
    nets = []
    for text in sequences:
        obj = CNetwork(text, model=None, index_word=index_word, percentages=None, path="")
        network = obj.create_network()
        nets.append(network)
    struc2vec("linux", nets)
    X = read_struc2vec('struc2vec/emb/', nets, words_features)
    return X

In [None]:
X = get_struc2vec_features(selected_corpus, words_features, index_word)

Nodes: 648 - Edges: 1458
Nodes: 702 - Edges: 1500
Nodes: 630 - Edges: 1468
Nodes: 655 - Edges: 1480
Nodes: 683 - Edges: 1482
Nodes: 714 - Edges: 1489
Nodes: 662 - Edges: 1446
Nodes: 700 - Edges: 1551
Nodes: 613 - Edges: 1501
Nodes: 701 - Edges: 1534
Nodes: 747 - Edges: 1553
Nodes: 536 - Edges: 1411
Nodes: 743 - Edges: 1562
Nodes: 582 - Edges: 1446
Nodes: 678 - Edges: 1476
Nodes: 517 - Edges: 1379
Nodes: 741 - Edges: 1525
Nodes: 665 - Edges: 1483
Nodes: 644 - Edges: 1523
Nodes: 739 - Edges: 1550
Nodes: 767 - Edges: 1548
Nodes: 629 - Edges: 1489
Nodes: 579 - Edges: 1438
Nodes: 688 - Edges: 1486
Nodes: 540 - Edges: 1341
Nodes: 704 - Edges: 1517
Nodes: 748 - Edges: 1522
Nodes: 663 - Edges: 1469
Nodes: 660 - Edges: 1499
Nodes: 643 - Edges: 1466
Nodes: 638 - Edges: 1431
Nodes: 801 - Edges: 1566
Nodes: 643 - Edges: 1358
Nodes: 705 - Edges: 1585
Nodes: 564 - Edges: 1441
Nodes: 649 - Edges: 1549
Nodes: 646 - Edges: 1500
Nodes: 763 - Edges: 1587
Nodes: 775 - Edges: 1537
Nodes: 752 - Edges: 1583


  e_list = [x / sum_w for x in e_list]


struc2vec 6
struc2vec 7
struc2vec 8
struc2vec 9
struc2vec 10
struc2vec 11
struc2vec 12
struc2vec 13
struc2vec 14
struc2vec 15
struc2vec 16
struc2vec 17
struc2vec 18
struc2vec 19
struc2vec 20
struc2vec 21
struc2vec 22
struc2vec 23
struc2vec 24
struc2vec 25
struc2vec 26
struc2vec 27
struc2vec 28
struc2vec 29
struc2vec 30
struc2vec 31
struc2vec 32
struc2vec 33
struc2vec 34


In [None]:
print("Lenght of features:", X.shape)

In [None]:
X

In [None]:
len(X[0])

# Training with SVM

In [None]:
import classifierv2

In [None]:
obj = classifierv2.Classification(X, y)
scores = obj.classification()