In [1]:
from network import CNetwork

In [2]:
import pandas as pd
import numpy as np

In [3]:
from utils import verifyDir

In [4]:
name_dataset = "dataset_1"
dataset_path = "datasetsv2/"
length_cut = 10000
random_flag = True
measures = ["dgr_n"]

In [5]:
auxiliar_path = 'auxiliar_folder/' + name_dataset   + '/'
verifyDir(auxiliar_path)

In [6]:
df = pd.read_csv(dataset_path + name_dataset + ".csv")

In [7]:
df.head(5)

Unnamed: 0,label,text,book
0,Pelham Grenville,"\n\n\n\n\n\n\n\nProduced by Christine Gehring,...",Right Ho Jeeves
1,Pelham Grenville,"\n\n\n\n\n\n\n\nProduced by Suzanne L. Shell, ...",Tales Of St Austin
2,Joseph Conrad,And that last\n\nword was the single word of ...,Victory
3,Joseph Conrad,\n\n\n\nIf I have ever had these gifts in any ...,Under Western Eyes
4,Bram Stoker,\n\n\n\n Lond...,The Lady Of The Shroud


In [8]:
from utils.text_processing import get_min_len_corpus

In [9]:
print("Min Length:", get_min_len_corpus(list(df["text"])))

Min Length: 47826


In [10]:
from utils.text_processing import get_corpus, get_random_corpus

In [11]:
texts = list(df['text'])

In [12]:
corpus, segmented_corpus = get_corpus(texts, length_cut)

In [13]:
from utils.text_processing import remove_puntuaction

In [14]:
segmented_corpus = [[[remove_puntuaction(w) for w in t] for t in p] for p in segmented_corpus ]

In [15]:
selected_corpus, words_features, word_index, index_word = get_random_corpus(segmented_corpus)

In [16]:
len(selected_corpus)

78

In [17]:
labels = list(df['label'])

In [18]:
total_classes = list(set(labels))  ## or author
print("Total classes: {}".format(len(total_classes)))
number_books = (df[df['label'] == total_classes[0]]).shape[0]
print("Total entities for each class in train: {}".format(number_books))
dict_categories = {cat: index for index, cat in enumerate(total_classes)}

Total classes: 13
Total entities for each class in train: 6


In [19]:
y = [dict_categories[y] for y in labels]

In [20]:
total_classes

['Thomas Hardy',
 'George Eliot',
 'Allan Poe',
 'Hector Hugh',
 'Charles Dickens',
 'Charles Darwin',
 'Jane Austen',
 'Pelham Grenville',
 'Arthur Conan Doyle',
 'Joseph Conrad',
 'Daniel Defoe',
 'Bram Stoker',
 'Mark Twain']

In [21]:
def get_local_features(sequences, word_features, measures):
    network_features = pd.DataFrame()
    all_features_container = []
    for text in sequences:
        obj = CNetwork(text, model=None, index_word=None, percentages=None, path=auxiliar_path)
        network = obj.create_network()
        local_measure = obj.get_network_measures(network, word_features, measures)
        all_features_container.append(local_measure)
        #a_series = pd.Series(local_measure)
        #network_features = network_features.append(a_series, ignore_index=True)
        #network_features = np.vstack([network_features, global_measure]) if len(network_features) > 0 else global_measure
    return all_features_container

In [22]:
X = get_local_features(selected_corpus, words_features, measures)

Nodes: 2507 - Edges: 7489
Len features: 90
[ 41.45        44.625       71.89473684  37.30769231  32.
  82.4         64.75555556  65.18032787 107.04        49.16216216
  46.67741935  49.65384615  62.11111111  64.8         72.20689655
  93.13333333  50.21212121  77.6         51.77777778  39.90625
  84.33333333  42.25641026  21.6         84.66666667  36.1025641
  60.21875     22.90322581  59.02439024  41.16949153  65.35294118
  14.20062696  95.75        14.86549708  13.41489362  40.52755906
  94.88888889  37.72727273  88.05882353  68.4375      77.83333333
  75.03448276  44.08333333  42.4047619   51.86111111  67.60416667
  42.05084746  66.58823529  61.7037037   15.19047619  36.16666667
  22.33333333  51.43589744  35.7037037   40.68493151  26.77777778
  66.36363636  37.36781609  52.04166667  25.04444444  30.91623037
  34.         107.76190476  28.91        87.42424242  38.109375
  71.9047619   41.07142857  16.42941176  56.77777778  10.66666667
  38.79310345  42.17391304  17.88424437  38.367

Nodes: 2794 - Edges: 7825
Len features: 90
[ 61.34782609  62.26470588  51.94202899  28.74489796  44.5
  45.73333333  67.57446809  88.53846154 127.27272727  34.53409091
  43.18181818 110.64705882  38.1372549   45.9         30.
  84.93103448  35.69811321  50.4516129   24.5         75.4
  87.9         38.1147541   43.          59.47058824  54.87878788
  71.16        29.34782609  67.65        29.10638298  72.35483871
  16.78595318  27.88888889  33.25        10.92176039  38.14049587
  56.          21.          44.25925926  81.25        95.1
  90.13333333  39.4047619   54.53333333  77.06060606  79.09375
  49.2739726   90.7         92.07692308  15.85074627  58.48571429
  42.61538462  53.20689655  28.23125     32.95        59.72727273
  97.125       48.70175439  85.05263158  18.45        33.66878981
 115.88235294  92.26086957  35.4875      82.54545455  51.64705882
  70.78947368  41.04347826  16.66031746 115.75        92.375
 123.          58.82142857  14.7246696   61.41666667  13.60377358
  42

Nodes: 2501 - Edges: 7452
Len features: 90
[ 54.12820513  48.          65.98076923  35.37974684  42.82857143
   8.18181818  69.41463415  61.84375    111.93333333  54.14285714
  52.55555556  76.57692308  39.40659341  59.58333333  87.9375
 128.          43.24675325  55.85714286  45.75       134.5
  89.27777778  51.3255814   87.88        99.92307692  39.79411765
  65.87096774  28.95238095  41.58181818  46.16949153  69.41935484
  17.25735294  31.75        25.2278481   14.26623377  37.94915254
  88.67741935  43.33333333  81.75        40.46666667 114.69230769
  77.          44.41666667  42.22222222  65.07692308  80.61764706
  36.703125    97.15        46.04166667  27.59183673  55.1025641
  25.66666667  71.37037037  29.03846154  38.06521739  57.86363636
 104.28571429  40.671875    39.13636364  21.125       28.9039548
  71.31818182  67.07692308  31.68518519  78.41176471  50.10344828
  97.14285714  43.25        16.05362776 107.4         79.27272727
  39.74285714  83.26086957  14.13877551  38.2 

Nodes: 2828 - Edges: 7871
Len features: 90
[ 39.65306122  52.8         45.52631579  46.68421053  31.36842105
  14.          51.46808511  56.92105263 111.33333333  37.57142857
  39.94117647  45.16901408  63.80392157  71.47058824  45.7
 143.23076923  31.25641026  37.6         34.25        76.35294118
 114.72222222  55.19354839  37.25        57.6875      62.14285714
  94.57692308  43.18181818  38.24719101  50.05970149  36.21428571
  16.43959732  24.90909091  28.33093525  14.53581662  32.05755396
  99.96153846  42.20408163  70.          86.92857143 119.125
  67.46808511  67.08695652  15.73913043  73.          84.41666667
  29.74285714  80.33333333  54.95454545  31.55813953  70.35714286
  86.33333333  73.40909091  29.53216374  72.16666667  69.38461538
  97.14285714  35.48958333  78.2         30.22435897  29.88043478
  92.84615385  46.38095238  22.15923567  60.51851852  51.25862069
 137.5         71.5         16.62244898 184.5         34.5
  41.7         86.05        13.52815534  47.17142857

In [23]:
print("Lenght of features:", len(X[0]))

Lenght of features: 90


## Exploratory data analysis

In [24]:
#df_data = X_train.copy()

In [25]:
#df_data["label"] = y_train

In [26]:
#import seaborn as sns
#import matplotlib.pyplot as plt

In [27]:
#plt.close()
#sns.set_style("whitegrid")
#sns.pairplot(df_data, hue="label", height=3)
#plt.show()

# Normalize data

In [28]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(with_mean=True, with_std=True)

In [29]:
X = scaler.fit_transform(X)

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
RANDOM_SEED = 20
FRAC_TRAIN = 0.8

In [32]:
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=(1.0 - FRAC_TRAIN), random_state=RANDOM_SEED)

# Training with Bayes

In [33]:
from classifierv2 import getClassifier, getClassMetrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

In [34]:
from sklearn.neighbors import KNeighborsClassifier

In [35]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)

predicted = knn_clf.predict(X_val)
print(y_val)
print(predicted)
#print accuracy_score
print("Accuracy : " + str(accuracy_score(y_val, predicted)))

print("Micro f-measure " + str(f1_score(y_val, predicted, average='micro')))

[7, 9, 10, 0, 12, 11, 5, 11, 2, 1, 6, 8, 10, 3, 4, 3]
[ 7  6 10  0  6  6  6  0  0  6  1  0 10  4  0  6]
Accuracy : 0.25
Micro f-measure 0.25


# Training with SVM

In [36]:
from sklearn.svm import SVC

In [37]:
svc = SVC(kernel='linear', probability=True)
svc.fit(X_train, y_train)

predicted = svc.predict(X_val)
print(y_val)
print(predicted)
#print accuracy_score
print("Accuracy : " + str(accuracy_score(y_val, predicted)))

print("Micro f-measure " + str(f1_score(y_val, predicted, average='micro')))

[7, 9, 10, 0, 12, 11, 5, 11, 2, 1, 6, 8, 10, 3, 4, 3]
[ 7  9 10  0  9  0  5  8  2  7  1  1 10  4  0  1]
Accuracy : 0.4375
Micro f-measure 0.4375
