In [1]:
from network import CNetwork

In [2]:
import pandas as pd
import numpy as np

In [3]:
from utils import verifyDir

In [4]:
dataset_path = "datasetsv2/"
name_dataset = "dataset_1"
length_cut = 30000
random_flag = True
remove_stop_words = True
only_stop_words = False
remove_puntuaction = True

In [5]:
auxiliar_path = 'auxiliar_folder/' + name_dataset   + '/'
verifyDir(auxiliar_path)

In [6]:
df_train = pd.read_csv(dataset_path + "df_train_" + name_dataset + "_" + str(length_cut) + "_" + str(random_flag) + ".csv")

In [7]:
df_val = pd.read_csv(dataset_path + "df_val_" + name_dataset + "_" + str(length_cut) + "_" + str(random_flag) + ".csv")

In [8]:
df_test = pd.read_csv(dataset_path + "df_test_" + name_dataset + "_" + str(length_cut) + "_" + str(random_flag) + ".csv")

In [9]:
df_train.head(5)

Unnamed: 0,label,text,book
0,Bram Stoker,to have anything to do with this . '' `` What ...,The Mystery Of The Sea
1,Allan Poe,"great civility , took me into his study , and ...",The Works Of Edgar Allan Poe4
2,Allan Poe,"impossible , is possible in the organic . _P._...",The Works Of Edgar Allan Poe2
3,Hector Hugh,"was unfortunate , but she reckoned it a lesser...",When William Came
4,Charles Dickens,"the light. -- YOU can be cheerful , Miggs , at...",Barnaby Rudge


In [10]:
from utils.text_processing import get_word_index, get_sequences, get_common_words, pre_process_text

In [11]:
df_train['text'] = df_train['text'].apply(lambda t: pre_process_text(t, remove_stop_words=remove_stop_words, only_stop_words=only_stop_words, remove_puntuaction_flag=remove_puntuaction))

In [12]:
df_test['text'] = df_test['text'].apply(lambda t: pre_process_text(t, remove_stop_words=remove_stop_words, only_stop_words=only_stop_words, remove_puntuaction_flag=remove_puntuaction))

In [13]:
df_val['text'] = df_val['text'].apply(lambda t: pre_process_text(t, remove_stop_words=remove_stop_words, only_stop_words=only_stop_words, remove_puntuaction_flag=remove_puntuaction))

In [14]:
df_train.head(5)

Unnamed: 0,label,text,book
0,Bram Stoker,"[anything, mean, something, tone, query, set, ...",The Mystery Of The Sea
1,Allan Poe,"[great, civility, took, study, gave, clear, ex...",The Works Of Edgar Allan Poe4
2,Allan Poe,"[impossible, possible, organic, P, good, end, ...",The Works Of Edgar Allan Poe2
3,Hector Hugh,"[unfortunate, reckoned, lesser, evil, tearing,...",When William Came
4,Charles Dickens,"[light, cheerful, Miggs, least, ’, Miggs, mome...",Barnaby Rudge


In [15]:
classes = list(df_train['label'])  ## or 'author'
total_classes = list(set(df_train['label']))  ## or author
print("Classes: {}".format(total_classes))
print("Total classes: {}".format(len(total_classes)))
number_books = (df_train[df_train['label'] == total_classes[0]]).shape[0]
print("Total entities for each class in train: {}".format(number_books))
dict_categories = list(set(classes))
dict_categories = {cat: index for index, cat in enumerate(dict_categories)}

Classes: ['George Eliot', 'Thomas Hardy', 'Charles Dickens', 'Hector Hugh', 'Joseph Conrad', 'Pelham Grenville', 'Allan Poe', 'Charles Darwin', 'Jane Austen', 'Daniel Defoe', 'Bram Stoker', 'Mark Twain', 'Arthur Conan Doyle']
Total classes: 13
Total entities for each class in train: 4


In [16]:
y_train = [dict_categories[y] for y in df_train["label"]]

In [17]:
y_val = [dict_categories[y] for y in df_val["label"]]

In [18]:
len(y_train)

46

In [19]:
!pip install networkx



In [21]:
word_index, index_word = get_word_index(df_train["text"], tokenized=True)

In [22]:
len(word_index)

49260

In [23]:
def read_embeddings(fname):
    with open(fname) as f:
        f.readline()
        names = []
        data = []
        for line in f:
            line = line.strip().split(",")
            names.append(line[0])
            data.append([float(v) for v in line[1:]])
        return names, data

In [24]:
def organize_data(graph2label, embed_vnames, embed_vecs):
    N = len(graph2label)
    data = [0 for _ in range(N)]
    labels = [0 for _ in range(N)]
    for i in range(len(graph2label)):
        idx = embed_vnames[i]
        data[int(idx)] = embed_vecs[i]
        labels[int(idx)] = graph2label[idx]
    return np.array(data), np.array(labels)

In [25]:
from utils.igraph2json import igraph2json1
import os

In [33]:
def graph2vec(path, operating_system, networks, labels):
    networks_dir = path + 'dataset/'
    emb_file = path + 'features/nci1.csv'
    graph2label = {}
    try: 
        os.mkdir(in_network)
        os.mkdir(path + 'features/')
    except:
        print("Existe")
    for (i, netw), label in zip(enumerate(networks), labels):
        graph2label[str(i)] = label
        igraph2json1(netw, networks_dir + str(i) + ".json")
    if operating_system == 'linux':
        path_command = 'python ./graph2vec/src/graph2vec.py --input-path ' + networks_dir + ' --output-path ' + emb_file + ' --dimensions ' + str(512)
    os.system(path_command)
    embed_vnames, embed_vecs = read_embeddings(emb_file)
    X, y = organize_data(graph2label, embed_vnames, embed_vecs)
    return X, y


In [34]:
def get_graph2vec_features(sequences, labels):
    network_features = pd.DataFrame()
    nets = []
    for text in sequences:
        obj = CNetwork(text, model=None, index_word=index_word, percentages=None, path=auxiliar_path)
        network = obj.create_network()
        nets.append(network)
    X, y = graph2vec("", "linux", nets, labels)
    print("Shapes {} {}".format(X.shape, y.shape))
    return X, y

In [35]:
X_train, y_train = get_graph2vec_features(get_sequences(df_train["text"], word_index), y_train)

Nodes: 4249 - Edges: 12431
Nodes: 6014 - Edges: 14068
Nodes: 5561 - Edges: 13826
Nodes: 5747 - Edges: 13467
Nodes: 4734 - Edges: 12670
Nodes: 4778 - Edges: 12988
Nodes: 4084 - Edges: 12132
Nodes: 3817 - Edges: 9884
Nodes: 3348 - Edges: 11322
Nodes: 5597 - Edges: 14176
Nodes: 2997 - Edges: 9809
Nodes: 4750 - Edges: 12950
Nodes: 4859 - Edges: 11686
Nodes: 4803 - Edges: 13000
Nodes: 4416 - Edges: 13764
Nodes: 4807 - Edges: 12092
Nodes: 5310 - Edges: 13533
Nodes: 4634 - Edges: 13074
Nodes: 3253 - Edges: 10756
Nodes: 4530 - Edges: 11067
Nodes: 3290 - Edges: 10654
Nodes: 4400 - Edges: 11514
Nodes: 4455 - Edges: 13098
Nodes: 5181 - Edges: 13534
Nodes: 4993 - Edges: 12082
Nodes: 3773 - Edges: 10565
Nodes: 5481 - Edges: 12223
Nodes: 4039 - Edges: 10671
Nodes: 6246 - Edges: 15379
Nodes: 5028 - Edges: 13172
Nodes: 4216 - Edges: 10776
Nodes: 4021 - Edges: 11503
Nodes: 2977 - Edges: 10300
Nodes: 3177 - Edges: 10404
Nodes: 4303 - Edges: 13814
Nodes: 3231 - Edges: 10715
Nodes: 4423 - Edges: 11584
Nod

  0%|          | 0/46 [00:00<?, ?it/s]


Feature extraction started.



100%|██████████| 46/46 [00:01<00:00, 42.67it/s]



Optimization started.

Shapes (46, 512) (46,)


In [36]:
X_train.shape

(46, 512)

In [37]:
X_val, y_val = get_graph2vec_features(get_sequences(df_val["text"], word_index), y_val)

Nodes: 2910 - Edges: 11405
Nodes: 4251 - Edges: 13487
Nodes: 3283 - Edges: 9711
Nodes: 4026 - Edges: 10217
Nodes: 2903 - Edges: 10356
Nodes: 4609 - Edges: 12662
Nodes: 4129 - Edges: 10839
Nodes: 4292 - Edges: 11318
Nodes: 2754 - Edges: 10556
Nodes: 4080 - Edges: 10849
Nodes: 4579 - Edges: 12003
Nodes: 3796 - Edges: 12248
Nodes: 3712 - Edges: 10204
Nodes: 2498 - Edges: 8944
Nodes: 3305 - Edges: 9206
Nodes: 4674 - Edges: 11715
Existe


  0%|          | 0/46 [00:00<?, ?it/s]


Feature extraction started.



100%|██████████| 46/46 [00:01<00:00, 32.22it/s]


Optimization started.






Shapes (16, 512) (16,)


# Training with SVM

In [38]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import SVC, LinearSVC

In [39]:
params = {'C':[0.01,0.1,1,10,100,1000]}

classifier = GridSearchCV(LinearSVC(), params, cv=3, scoring='f1')
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_val)
score = accuracy_score(y_val, y_pred)
print(y_val)
print(y_pred)
print("Score {}".format(score))
print("Best params {}".format(classifier.best_params_))
print(classification_report(y_val, y_pred))

Traceback (most recent call last):
  File "/home/mailaucq/Documents/book_classification/venv/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 674, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/mailaucq/Documents/book_classification/venv/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "/home/mailaucq/Documents/book_classification/venv/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "/home/mailaucq/Documents/book_classification/venv/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/mailaucq/Documents/book_classification/venv/lib/python3.8/site-packages/sklearn/metrics/_classification.py", line 1068, in f1_score
    return fbeta_score(y_true, y_pred, beta=1, labels=la

[ 7  6  5 12  8 11  0  4  7  1  2 10  5  9  2  3]
[ 0  5  0  0  0  5  5  0  5  5 10  5  3  5  0  1]
Score 0.0
Best params {'C': 0.01}
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           1       0.00      0.00      0.00       1.0
           2       0.00      0.00      0.00       2.0
           3       0.00      0.00      0.00       1.0
           4       0.00      0.00      0.00       1.0
           5       0.00      0.00      0.00       2.0
           6       0.00      0.00      0.00       1.0
           7       0.00      0.00      0.00       2.0
           8       0.00      0.00      0.00       1.0
           9       0.00      0.00      0.00       1.0
          10       0.00      0.00      0.00       1.0
          11       0.00      0.00      0.00       1.0
          12       0.00      0.00      0.00       1.0

    accuracy                           0.00      16.0
   macro avg       0.00      0.00      0.00      16.0


Traceback (most recent call last):
  File "/home/mailaucq/Documents/book_classification/venv/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 674, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/mailaucq/Documents/book_classification/venv/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "/home/mailaucq/Documents/book_classification/venv/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "/home/mailaucq/Documents/book_classification/venv/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/mailaucq/Documents/book_classification/venv/lib/python3.8/site-packages/sklearn/metrics/_classification.py", line 1068, in f1_score
    return fbeta_score(y_true, y_pred, beta=1, labels=la