In [1]:
from network import CNetwork

In [2]:
import pandas as pd
import numpy as np

In [3]:
import os

In [4]:
name_dataset = "dataset_1"
dataset_path = "datasetsv2/"
length_cut = 30000
random_flag = True
remove_punctuation = True
lemmatization_flag = True
dimension = 8
feature_selection = 'common_words'

In [5]:
df = pd.read_csv(dataset_path + name_dataset + ".csv")

In [6]:
df.head(5)

Unnamed: 0,label,text,book
0,Pelham Grenville,"\n\n\n\n\n\n\n\nProduced by Christine Gehring,...",Right Ho Jeeves
1,Pelham Grenville,"\n\n\n\n\n\n\n\nProduced by Suzanne L. Shell, ...",Tales Of St Austin
2,Joseph Conrad,And that last\n\nword was the single word of ...,Victory
3,Joseph Conrad,\n\n\n\nIf I have ever had these gifts in any ...,Under Western Eyes
4,Bram Stoker,\n\n\n\n Lond...,The Lady Of The Shroud


In [7]:
from utils.text_processing import get_min_len_corpus

In [8]:
print("Min Length:", get_min_len_corpus(list(df["text"])))

Min Length: 55024


In [9]:
from utils.text_processing import get_corpus, get_random_corpus

In [10]:
texts = list(df['text'])

In [11]:
corpus, segmented_corpus = get_corpus(texts, length_cut)

In [12]:
selected_corpus, words_features, word_index, index_word = get_random_corpus(segmented_corpus, remove_punctuation=remove_punctuation, lemmatization_flag=lemmatization_flag, feature_selection = feature_selection)

In [13]:
len(selected_corpus)

78

In [14]:
labels = list(df['label'])

In [15]:
total_classes = list(set(labels))  ## or author
print("Total classes: {}".format(len(total_classes)))
number_books = (df[df['label'] == total_classes[0]]).shape[0]
print("Total entities for each class in train: {}".format(number_books))
dict_categories = {cat: index for index, cat in enumerate(total_classes)}

Total classes: 13
Total entities for each class in train: 6


In [16]:
y = [dict_categories[y] for y in labels]

In [17]:
total_classes

['Hector Hugh',
 'Bram Stoker',
 'Charles Dickens',
 'Daniel Defoe',
 'Pelham Grenville',
 'Arthur Conan Doyle',
 'Joseph Conrad',
 'Thomas Hardy',
 'George Eliot',
 'Charles Darwin',
 'Allan Poe',
 'Jane Austen',
 'Mark Twain']

In [18]:
import json
def graph2vec(operating_system, networks):
    input_path = "graph2vec/dataset/"+str(length_cut)+"_"+str(dimension)+"/"
    output_path = 'graph2vec/features/'
    try: 
        os.mkdir(input_path)
        os.mkdir(output_path)
    except:
        print("Existe")
    for i, netw in enumerate(networks):
        with open((input_path+str(i)+".json"), "w") as f:
            edges = []
            v_names = netw.vs["name"]
            for edge in netw.get_edgelist():
                (u,v) = edge
                edges.append([int(v_names[u]), int(v_names[v])])
            json.dump({"edges":edges}, f)
    if operating_system == 'linux':
        #path_command = "python struc2vec/src/main.py --dimensions "+str(dimension)+" --input "+input_path+str(length_cut)+"_"+str(dimension)+"_"+str(i)+".edgelist --output "+output_path+str(length_cut)+"_"+str(dimension)+"_"+str(i)+".emb"
        path_command = 'python graph2vec/src/graph2vec.py --input-path ' + input_path + ' --output-path ' + output_path+str(length_cut)+'_'+str(dimension)+'.csv' + ' --dimensions ' + str(dimension)
        os.system(path_command)
        print("graph2vec", str(i))

In [19]:
def read_graph2vec(output_path, networks):
    all_network_features = [[] for _ in range(len(networks))]
    with open(output_path+str(length_cut)+"_"+str(dimension)+".csv", "r") as f:
        lines = f.readlines()
        for l in lines[1:]:
            row = l.strip().split(",")
            id_graph = row[0]
            print("id graph", id_graph)
            network_features = np.array([float(v) for v in row[1:]])
            print(str(id_graph), "len",len(network_features))
            all_network_features[int(id_graph)] = network_features
    return np.array(all_network_features)

In [20]:
def get_graph2vec_features(sequences, index_word):
    nets = []
    for text in sequences:
        obj = CNetwork(text, model=None, index_word=index_word, percentages=None, path="")
        network = obj.create_network()
        nets.append(network)
    graph2vec("linux", nets)
    X = read_graph2vec('graph2vec/features/', nets)
    return X

In [21]:
X = get_graph2vec_features(selected_corpus, index_word)

Nodes: 4046 - Edges: 15526
Nodes: 4065 - Edges: 16233
Nodes: 3859 - Edges: 16324
Nodes: 4245 - Edges: 17093
Nodes: 3780 - Edges: 15951
Nodes: 3807 - Edges: 15686
Nodes: 3525 - Edges: 15416
Nodes: 3794 - Edges: 16344
Nodes: 3499 - Edges: 15768
Nodes: 4767 - Edges: 17413
Nodes: 4138 - Edges: 16691
Nodes: 2986 - Edges: 14597
Nodes: 5233 - Edges: 18293
Nodes: 2946 - Edges: 14694
Nodes: 4530 - Edges: 16855
Nodes: 2520 - Edges: 13121
Nodes: 4174 - Edges: 16217
Nodes: 4875 - Edges: 17300
Nodes: 4077 - Edges: 15956
Nodes: 4031 - Edges: 16776
Nodes: 5258 - Edges: 17610
Nodes: 3683 - Edges: 16053
Nodes: 3585 - Edges: 16033
Nodes: 4050 - Edges: 16512
Nodes: 3075 - Edges: 13800
Nodes: 4254 - Edges: 16489
Nodes: 4587 - Edges: 17522
Nodes: 3858 - Edges: 16189
Nodes: 4244 - Edges: 17261
Nodes: 2947 - Edges: 9434
Nodes: 3119 - Edges: 14200
Nodes: 4425 - Edges: 16980
Nodes: 4073 - Edges: 15055
Nodes: 3293 - Edges: 15963
Nodes: 2472 - Edges: 13107
Nodes: 3277 - Edges: 15844
Nodes: 3720 - Edges: 15937
No

  0%|          | 0/78 [00:00<?, ?it/s]


Feature extraction started.



100%|██████████| 78/78 [01:31<00:00,  1.17s/it]



Optimization started.

GRAPHHHHH Graph with 3807 nodes and 15686 edges
GRAPHHHHH Graph with 3683 nodes and 16053 edges
GRAPHHHHH Graph with 3720 nodes and 15937 edges
GRAPHHHHH Graph with 3781 nodes and 15927 edges
GRAPHHHHH Graph with 4875 nodes and 17300 edges
GRAPHHHHH Graph with 3000 nodes and 14426 edges
GRAPHHHHH Graph with 4852 nodes and 17856 edges
GRAPHHHHH Graph with 4244 nodes and 17261 edges
GRAPHHHHH Graph with 4229 nodes and 16670 edges
GRAPHHHHH Graph with 5055 nodes and 17800 edges
GRAPHHHHH Graph with 4671 nodes and 17959 edges
GRAPHHHHH Graph with 3859 nodes and 16324 edges
GRAPHHHHH Graph with 3726 nodes and 15329 edges
GRAPHHHHH Graph with 3119 nodes and 14200 edges
GRAPHHHHH Graph with 4425 nodes and 16980 edges
GRAPHHHHH Graph with 2472 nodes and 13107 edges
GRAPHHHHH Graph with 5121 nodes and 17229 edges
GRAPHHHHH Graph with 3188 nodes and 15373 edges
GRAPHHHHH Graph with 3293 nodes and 15963 edges
GRAPHHHHH Graph with 2986 nodes and 14597 edges
GRAPHHHHH Graph 

In [22]:
print("Lenght of features:", X.shape)

Lenght of features: (78, 8)


In [23]:
X

array([[-4.11698961e+00, -1.04746723e+00,  2.53941011e+00,
        -1.67708051e+00,  5.60944825e-02,  5.17310858e-01,
         6.69491005e+00,  4.72008610e+00],
       [-4.89869976e+00, -1.63956177e+00, -2.37164307e+00,
        -1.07490039e+00, -2.18172431e+00, -4.82287979e+00,
         1.07222688e+00,  7.62050152e-02],
       [-4.78849077e+00,  1.39343953e+00,  2.05366588e+00,
        -6.29728079e-01, -3.10257769e+00, -1.64375019e+00,
         1.51932406e+00,  3.62211132e+00],
       [ 1.44875181e+00, -3.23300719e+00,  1.66709495e+00,
         3.92010975e+00,  1.19563270e+00, -5.15232754e+00,
         3.38269520e+00,  1.05949950e+00],
       [ 4.61086631e-01, -1.73655510e+00,  1.82776880e+00,
        -1.50175643e+00,  2.32726619e-01, -3.94096041e+00,
         5.75821352e+00, -1.65222347e+00],
       [ 2.20628715e+00, -4.85130692e+00,  5.04198790e+00,
        -2.24335551e+00,  3.98797402e-03, -4.58134085e-01,
         3.98741412e+00,  1.42331719e+00],
       [ 6.89999819e-01, -2.426469

In [24]:
len(X[0])

8

In [25]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(with_mean=True, with_std=True)

In [26]:
#X = scaler.fit_transform(X)

# Classification

In [27]:
import classifierv2

In [28]:
#obj = classifierv2.Classification(X, y)
#scores = obj.classification()

# Score 0.0625
Score 0.125
Score 0.0625
Score 0.0625

1000
Score 0.0625
Score 0.0625
Score 0.25
Score 0.0

10000
Score 0.0625
Score 0.1875
Score 0.125
Score 0.0625

Score 0.0
Score 0.0
Score 0.0
Score 0.0625

In [29]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation
from tensorflow.keras.optimizers import SGD

2021-09-17 19:53:42.067977: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-09-17 19:53:42.068211: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [31]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

In [32]:
y_train = keras.utils.np_utils.to_categorical(y_train, num_classes = 13)
y_test = keras.utils.np_utils.to_categorical(y_test, num_classes = 13)


In [33]:


print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)



(62, 8)
(16, 8)
(62, 13)
(16, 13)


In [85]:
model = Sequential()
model.add(Dense(50, activation='relu', input_dim=8))
model.add(Dense(13, activation='softmax'))
sgd = SGD(lr = 0.01, decay = 1e-6, momentum = 0.789, nesterov=True)
model.compile(optimizer=sgd,
              loss='categorical_crossentropy',
              metrics=['accuracy'])


In [86]:
history = model.fit(X_train, y_train, epochs = 10, batch_size = 5)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [87]:
score = model.evaluate(X_test, y_test, verbose = 0)

In [88]:


print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 3.3343286514282227
Test accuracy: 0.125
