In [1]:
from network import CNetwork

In [2]:
import pandas as pd
import numpy as np

In [3]:
import os

In [4]:
name_dataset = "dataset_1"
dataset_path = "datasetsv2/"
length_cut = 5000
random_flag = True
dimension = 124
feature_selection = 'common_words'

In [5]:
df = pd.read_csv(dataset_path + name_dataset + ".csv")

In [6]:
df.head(5)

Unnamed: 0,label,text,book
0,Pelham Grenville,"\n\n\n\n\n\n\n\nProduced by Christine Gehring,...",Right Ho Jeeves
1,Pelham Grenville,"\n\n\n\n\n\n\n\nProduced by Suzanne L. Shell, ...",Tales Of St Austin
2,Joseph Conrad,And that last\n\nword was the single word of ...,Victory
3,Joseph Conrad,\n\n\n\nIf I have ever had these gifts in any ...,Under Western Eyes
4,Bram Stoker,\n\n\n\n Lond...,The Lady Of The Shroud


In [7]:
from utils.text_processing import get_min_len_corpus

In [8]:
print("Min Length:", get_min_len_corpus(list(df["text"])))

Min Length: 55024


In [9]:
from utils.text_processing import get_corpus, get_random_corpus

In [10]:
texts = list(df['text'])

In [11]:
corpus, segmented_corpus = get_corpus(texts, length_cut)

In [12]:
selected_corpus, words_features, word_index, index_word = get_random_corpus(segmented_corpus, remove_punctuation=True, feature_selection = feature_selection)

In [13]:
wor = [(index_word[w],w) for w in selected_corpus[42]]

In [14]:
wor

[('those', '341'),
 ('things', '662'),
 ('to', '676'),
 ('a', '347'),
 ('diligent', '17370'),
 ('servant', '4356'),
 ('and', '558'),
 ('I', '617'),
 ('mention', '383'),
 ('it', '321'),
 ('because', '1427'),
 ('people', '176'),
 ('who', '707'),
 ('are', '166'),
 ('either', '3535'),
 ('transported', '10267'),
 ('or', '364'),
 ('otherwise', '6417'),
 ('trepanned', '17297'),
 ('into', '259'),
 ('those', '341'),
 ('places', '5827'),
 ('are', '166'),
 ('generally', '22'),
 ('thought', '200'),
 ('to', '676'),
 ('be', '863'),
 ('rendered', '4573'),
 ('miserable', '2155'),
 ('and', '558'),
 ('undone', '14979'),
 ('whereas', '12192'),
 ('on', '298'),
 ('the', '82'),
 ('contrary', '5000'),
 ('I', '617'),
 ('would', '154'),
 ('encourage', '8206'),
 ('them', '868'),
 ('upon', '789'),
 ('my', '785'),
 ('own', '659'),
 ('experience', '3461'),
 ('to', '676'),
 ('depend', '5071'),
 ('upon', '789'),
 ('it', '321'),
 ('that', '643'),
 ('if', '85'),
 ('their', '229'),
 ('own', '659'),
 ('diligence', '1737

In [15]:
len(words_features)

46

In [16]:
words_features

{'about': 0,
 'be': 1,
 'them': 2,
 'this': 3,
 'now': 4,
 'all': 5,
 'that': 6,
 'one': 7,
 'they': 8,
 'by': 9,
 'so': 10,
 'would': 11,
 'more': 12,
 'but': 13,
 'been': 14,
 'when': 15,
 'was': 16,
 'from': 17,
 'a': 18,
 'no': 19,
 'an': 20,
 'not': 21,
 'do': 22,
 'with': 23,
 'had': 24,
 'on': 25,
 'he': 26,
 'which': 27,
 'of': 28,
 'at': 29,
 'up': 30,
 'or': 31,
 'the': 32,
 'out': 33,
 'to': 34,
 'if': 35,
 'into': 36,
 'in': 37,
 'have': 38,
 'could': 39,
 'for': 40,
 'and': 41,
 'any': 42,
 'as': 43,
 'some': 44,
 'it': 45}

In [17]:
len(selected_corpus)

78

In [18]:
labels = list(df['label'])

In [19]:
total_classes = list(set(labels))  ## or author
print("Total classes: {}".format(len(total_classes)))
number_books = (df[df['label'] == total_classes[0]]).shape[0]
print("Total entities for each class in train: {}".format(number_books))
dict_categories = {cat: index for index, cat in enumerate(total_classes)}

Total classes: 13
Total entities for each class in train: 6


In [20]:
y = [dict_categories[y] for y in labels]

In [21]:
total_classes

['Charles Darwin',
 'Pelham Grenville',
 'Mark Twain',
 'Allan Poe',
 'Arthur Conan Doyle',
 'Hector Hugh',
 'Jane Austen',
 'Joseph Conrad',
 'Bram Stoker',
 'Daniel Defoe',
 'George Eliot',
 'Thomas Hardy',
 'Charles Dickens']

In [22]:
def struc2vec(operating_system, networks):
    input_path = "struc2vec/graph/"
    output_path = 'struc2vec/emb/'
    try: 
        os.mkdir(input_path)
        os.mkdir(output_path)
    except:
        print("Existe")
    for i, netw in enumerate(networks):
        with open((input_path+str(length_cut)+"_"+str(dimension)+"_"+str(i)+".edgelist"), "w") as f:
            v_names = netw.vs["name"]
            for edge in netw.get_edgelist():
                (u,v) = edge
                f.write(v_names[u] +" "+ v_names[v]+'\n')
    for i, netw in enumerate(networks):
        if operating_system == 'linux':
            path_command = "python struc2vec/src/main.py --dimensions "+str(dimension)+" --input "+input_path+str(length_cut)+"_"+str(dimension)+"_"+str(i)+".edgelist --output "+output_path+str(length_cut)+"_"+str(dimension)+"_"+str(i)+".emb"
            #path_command = 'python struc2vec/src/graph2vec.py --input-path ' + in_network + ' --output-path ' + extra_file + ' --dimensions ' + str(512)
            os.system(path_command)
            print("struc2vec", str(i))

In [23]:
def read_struc2vec(output_path, networks, words_features):
    all_network_features = [] 
    for i, netw in enumerate(networks):
        network_features = [[] for _ in range(len(words_features))]
        with open(output_path+str(length_cut)+"_"+str(dimension)+"_"+str(i)+".emb", "r") as f:
            lines = f.readlines()
            num_token, dim = lines[0].split()
            for l in lines[1:]:
                row = l.strip().split(" ")
                id_node = row[0]
                if index_word[id_node].lower() in words_features:
                    emb = [float(v) for v in row[1:]]
                    network_features[words_features[index_word[id_node].lower()]] = emb
        network_features = np.array(network_features).flatten()
        print(str(i), "len",len(network_features))
        all_network_features.append(network_features)
    return np.array(all_network_features)

In [24]:
def get_struc2vec_features(sequences, words_features, index_word):
    nets = []
    for text in sequences:
        obj = CNetwork(text, model=None, index_word=index_word, percentages=None, path="")
        network = obj.create_network()
        nets.append(network)
    struc2vec("linux", nets)
    X = read_struc2vec('struc2vec/emb/', nets, words_features)
    return X

In [None]:
X = get_struc2vec_features(selected_corpus, words_features, index_word)

Nodes: 908 - Edges: 2197
Nodes: 1347 - Edges: 3407
Nodes: 1263 - Edges: 3406
Nodes: 1373 - Edges: 3511
Nodes: 1151 - Edges: 3495
Nodes: 1413 - Edges: 3495
Nodes: 1302 - Edges: 3402
Nodes: 1204 - Edges: 3461
Nodes: 1183 - Edges: 3435
Nodes: 1258 - Edges: 3357
Nodes: 1503 - Edges: 3707
Nodes: 958 - Edges: 3099
Nodes: 1423 - Edges: 3668
Nodes: 1121 - Edges: 3346
Nodes: 1456 - Edges: 3605
Nodes: 869 - Edges: 2912
Nodes: 1363 - Edges: 3507
Nodes: 1419 - Edges: 3547
Nodes: 1292 - Edges: 3435
Nodes: 1358 - Edges: 3589
Nodes: 1644 - Edges: 3680
Nodes: 1209 - Edges: 3488
Nodes: 1141 - Edges: 3398
Nodes: 1319 - Edges: 3522
Nodes: 848 - Edges: 2419
Nodes: 1323 - Edges: 3554
Nodes: 1361 - Edges: 3541
Nodes: 1237 - Edges: 3367
Nodes: 1227 - Edges: 3460
Nodes: 1318 - Edges: 3501
Nodes: 1155 - Edges: 3185
Nodes: 1371 - Edges: 3583
Nodes: 1043 - Edges: 2945
Nodes: 1277 - Edges: 3493
Nodes: 911 - Edges: 2942
Nodes: 1249 - Edges: 3525
Nodes: 1160 - Edges: 3318
Nodes: 1518 - Edges: 3682
Nodes: 1489 - Edg

In [None]:
print("Lenght of features:", X.shape)

In [None]:
X

In [None]:
len(X[0])

# Training with SVM

In [None]:
import classifierv2

In [None]:
obj = classifierv2.Classification(X, y)
scores = obj.classification()