In [1]:
from network import CNetwork

In [2]:
import pandas as pd
import numpy as np

In [3]:
import os

In [4]:
name_dataset = "dataset_1"
dataset_path = "datasetsv2/"
length_cut = 500
random_flag = True
dimension = 2
feature_selection = 'common_words'

In [5]:
df = pd.read_csv(dataset_path + name_dataset + ".csv")

In [6]:
df.head(5)

Unnamed: 0,label,text,book
0,Pelham Grenville,"\n\n\n\n\n\n\n\nProduced by Christine Gehring,...",Right Ho Jeeves
1,Pelham Grenville,"\n\n\n\n\n\n\n\nProduced by Suzanne L. Shell, ...",Tales Of St Austin
2,Joseph Conrad,And that last\n\nword was the single word of ...,Victory
3,Joseph Conrad,\n\n\n\nIf I have ever had these gifts in any ...,Under Western Eyes
4,Bram Stoker,\n\n\n\n Lond...,The Lady Of The Shroud


In [7]:
from utils.text_processing import get_min_len_corpus

In [8]:
print("Min Length:", get_min_len_corpus(list(df["text"])))

Min Length: 47826


In [9]:
from utils.text_processing import get_corpus, get_random_corpus

In [10]:
texts = list(df['text'])

In [11]:
corpus, segmented_corpus = get_corpus(texts, length_cut)

In [12]:
from utils.text_processing import remove_puntuaction

In [13]:
segmented_corpus = [[[remove_puntuaction(w) for w in t] for t in p] for p in segmented_corpus ]

In [14]:
segmented_corpus[0]

[['Produced',
  'by',
  'Christine',
  'Gehring',
  'Richard',
  'Prairie',
  'and',
  'the',
  'Project',
  'Gutenberg',
  'Online',
  'Distributed',
  'Proofreading',
  'Team',
  'RIGHT',
  'HO',
  'JEEVES',
  'By',
  'P',
  'G',
  'WODEHOUSE',
  '1922',
  'To',
  'RAYMOND',
  'NEEDHAM',
  'KC',
  'WITH',
  'AFFECTION',
  'AND',
  'ADMIRATION',
  '1',
  'Jeeves',
  'I',
  'said',
  'may',
  'I',
  'speak',
  'frankly',
  'Certainly',
  'sir',
  'What',
  'I',
  'have',
  'to',
  'say',
  'may',
  'wound',
  'you',
  'Not',
  'at',
  'all',
  'sir',
  'Well',
  'then',
  'Nowait',
  'Hold',
  'the',
  'line',
  'a',
  'minute',
  'Ive',
  'gone',
  'off',
  'the',
  'rails',
  'I',
  'dont',
  'know',
  'if',
  'you',
  'have',
  'had',
  'the',
  'same',
  'experience',
  'but',
  'the',
  'snag',
  'I',
  'always',
  'come',
  'up',
  'against',
  'when',
  'Im',
  'telling',
  'a',
  'story',
  'is',
  'this',
  'dashed',
  'difficult',
  'problem',
  'of',
  'where',
  'to',
  'be

In [15]:
selected_corpus, words_features, word_index, index_word = get_random_corpus(segmented_corpus,tokenized=True,feature_selection = feature_selection)

In [16]:
len(words_features)

6

In [17]:
words_features

{'a': 0, 'to': 1, 'of': 2, 'in': 3, 'and': 4, 'the': 5}

In [18]:
index_word["133"]

'night'

In [19]:
len(selected_corpus)

78

In [20]:
labels = list(df['label'])

In [21]:
total_classes = list(set(labels))  ## or author
print("Total classes: {}".format(len(total_classes)))
number_books = (df[df['label'] == total_classes[0]]).shape[0]
print("Total entities for each class in train: {}".format(number_books))
dict_categories = {cat: index for index, cat in enumerate(total_classes)}

Total classes: 13
Total entities for each class in train: 6


In [22]:
y = [dict_categories[y] for y in labels]

In [23]:
total_classes

['Charles Darwin',
 'Daniel Defoe',
 'Charles Dickens',
 'Joseph Conrad',
 'Bram Stoker',
 'Mark Twain',
 'George Eliot',
 'Allan Poe',
 'Pelham Grenville',
 'Arthur Conan Doyle',
 'Hector Hugh',
 'Jane Austen',
 'Thomas Hardy']

In [24]:
def struc2vec(operating_system, networks):
    input_path = "struc2vec/graph/"
    output_path = 'struc2vec/emb/'
    try: 
        os.mkdir(input_path)
        os.mkdir(output_path)
    except:
        print("Existe")
    for i, netw in enumerate(networks):
        netw.write_edgelist(input_path+str(i)+".edgelist")
    for i, netw in enumerate(networks):
        if operating_system == 'linux':
            path_command = "python struc2vec/src/main.py --dimensions "+str(dimension)+" --input "+input_path+str(i)+".edgelist --output "+output_path+str(i)+".emb"
            #path_command = 'python struc2vec/src/graph2vec.py --input-path ' + in_network + ' --output-path ' + extra_file + ' --dimensions ' + str(512)
            os.system(path_command)
            print("struc2vec", str(i))

In [25]:
def read_struc2vec(output_path, networks, words_features):
    all_network_features = [] 
    for i, netw in enumerate(networks):
        network_features = [[] for _ in range(len(words_features))]
        with open(output_path+str(i)+".emb", "r") as f:
            lines = f.readlines()
            num_token, dim = lines[0].split()
            for l in lines[1:]:
                row = l.strip().split(" ")
                id_node = row[0]
                if index_word[id_node].lower() in words_features:
                    emb = [float(v) for v in row[1:]]
                    #print(id_node)
                    network_features[words_features[index_word[id_node].lower()]] = emb
        #print(network_features)
        network_features = np.array(network_features).flatten()
        print(len(network_features))
        #print(network_features)
        all_network_features.append(network_features)
    return np.array(all_network_features)

In [26]:
def get_struc2vec_features(sequences, words_features, index_word):
    nets = []
    for text in sequences:
        obj = CNetwork(text, model=None, index_word=index_word, percentages=None, path="")
        network = obj.create_network()
        nets.append(network)
    struc2vec("linux", nets)
    X = read_struc2vec('struc2vec/emb/', nets, words_features)
    return X

In [None]:
X = get_struc2vec_features(selected_corpus, words_features, index_word)

Nodes: 288 - Edges: 470
Nodes: 266 - Edges: 464
Nodes: 276 - Edges: 463
Nodes: 304 - Edges: 469
Nodes: 258 - Edges: 454
Nodes: 242 - Edges: 448
Nodes: 236 - Edges: 413
Nodes: 246 - Edges: 440
Nodes: 260 - Edges: 453
Nodes: 247 - Edges: 441
Nodes: 288 - Edges: 464
Nodes: 257 - Edges: 456
Nodes: 259 - Edges: 467
Nodes: 250 - Edges: 462
Nodes: 271 - Edges: 466
Nodes: 218 - Edges: 448
Nodes: 295 - Edges: 470
Nodes: 289 - Edges: 470
Nodes: 288 - Edges: 467
Nodes: 281 - Edges: 474
Nodes: 301 - Edges: 476
Nodes: 242 - Edges: 452
Nodes: 271 - Edges: 477
Nodes: 284 - Edges: 469
Nodes: 266 - Edges: 442
Nodes: 267 - Edges: 465
Nodes: 276 - Edges: 463
Nodes: 283 - Edges: 466
Nodes: 266 - Edges: 464
Nodes: 298 - Edges: 466
Nodes: 241 - Edges: 441
Nodes: 279 - Edges: 457
Nodes: 258 - Edges: 439
Nodes: 264 - Edges: 467
Nodes: 237 - Edges: 455
Nodes: 233 - Edges: 444
Nodes: 253 - Edges: 469
Nodes: 268 - Edges: 465
Nodes: 283 - Edges: 469
Nodes: 288 - Edges: 462
Nodes: 307 - Edges: 472
Nodes: 295 - Edg

concurrent.futures.process._RemoteTraceback: 
"""
concurrent.futures.process._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/usr/lib/python3.8/concurrent/futures/process.py", line 239, in _process_worker
    r = call_item.fn(*call_item.args, **call_item.kwargs)
  File "/home/mailaucq/Documents/book_classification/struc2vec/src/algorithms_distances.py", line 520, in generate_distances_network_part3
    weights_distances = restoreVariableFromDisk('weights_distances-layer-'+str(layer))
  File "/home/mailaucq/Documents/book_classification/struc2vec/src/utils.py", line 30, in restoreVariableFromDisk
    with open(folder_pickles + name + '.pickle', 'rb') as handle:
FileNotFoundError: [Errno 2] No such file or directory: '/home/mailaucq/Documents/book_classification/struc2vec/src/../pickles/weights_distances-layer-6.pickle'
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/lib/python3.8/concurrent/futu

struc2vec 16


concurrent.futures.process._RemoteTraceback: 
"""
concurrent.futures.process._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/usr/lib/python3.8/concurrent/futures/process.py", line 239, in _process_worker
    r = call_item.fn(*call_item.args, **call_item.kwargs)
  File "/home/mailaucq/Documents/book_classification/struc2vec/src/algorithms_distances.py", line 520, in generate_distances_network_part3
    weights_distances = restoreVariableFromDisk('weights_distances-layer-'+str(layer))
  File "/home/mailaucq/Documents/book_classification/struc2vec/src/utils.py", line 30, in restoreVariableFromDisk
    with open(folder_pickles + name + '.pickle', 'rb') as handle:
FileNotFoundError: [Errno 2] No such file or directory: '/home/mailaucq/Documents/book_classification/struc2vec/src/../pickles/weights_distances-layer-4.pickle'
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/lib/python3.8/concurrent/futu

struc2vec 17
struc2vec 18
struc2vec 19
struc2vec 20
struc2vec 21
struc2vec 22
struc2vec 23
struc2vec 24
struc2vec 25
struc2vec 26
struc2vec 27


concurrent.futures.process._RemoteTraceback: 
"""
concurrent.futures.process._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/usr/lib/python3.8/concurrent/futures/process.py", line 239, in _process_worker
    r = call_item.fn(*call_item.args, **call_item.kwargs)
  File "/home/mailaucq/Documents/book_classification/struc2vec/src/algorithms_distances.py", line 520, in generate_distances_network_part3
    weights_distances = restoreVariableFromDisk('weights_distances-layer-'+str(layer))
  File "/home/mailaucq/Documents/book_classification/struc2vec/src/utils.py", line 30, in restoreVariableFromDisk
    with open(folder_pickles + name + '.pickle', 'rb') as handle:
FileNotFoundError: [Errno 2] No such file or directory: '/home/mailaucq/Documents/book_classification/struc2vec/src/../pickles/weights_distances-layer-0.pickle'
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/lib/python3.8/concurrent/futu

struc2vec 28
struc2vec 29
struc2vec 30
struc2vec 31
struc2vec 32
struc2vec 33
struc2vec 34
struc2vec 35
struc2vec 36
struc2vec 37
struc2vec 38


In [None]:
print("Lenght of features:", X.shape)

In [None]:
len(X[0])

# Training with SVM

In [None]:
from classifierv2 import getClassifier, getClassMetrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.model_selection import train_test_split

In [None]:
RANDOM_SEED = 20
FRAC_TRAIN = 0.8

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=(1.0 - FRAC_TRAIN), random_state=RANDOM_SEED)

In [None]:
X_train.shape

In [None]:
X_val.shape

In [None]:
from sklearn.svm import SVC

In [None]:
svc = SVC(kernel='linear', probability=True)
svc.fit(X_train, y_train)

predicted = svc.predict(X_val)
print(y_val)
print(predicted)
#print accuracy_score
print("Accuracy : " + str(accuracy_score(y_val, predicted)))

print("Micro f-measure " + str(f1_score(y_val, predicted, average='micro')))
