In [2]:
import pandas as pd
import numpy as np
import random as rd
from cache import Cache

cache=Cache()

In [3]:
df = cache.load('clean_df_complete')

In [4]:
df['title_len'] = df['title'].apply(lambda x : len(x))
df['description_len'] = df['description'].apply(lambda x : len(x))
df['content_len'] = df['content'].apply(lambda x : len(x))

In [5]:
from statistics import mean 
print('mean title : ',mean(df['title_len']),
'\nmean description : ',mean(df['description_len']),
'\nmean content : ',mean(df['content_len']))

print('\nratio : ',mean(df['content_len'])/mean(df['description_len']),' and ',mean(df['content_len'])/mean(df['title_len']))

mean title :  41.92821651227684 
mean description :  156.9825001815334 
mean content :  2638.949834545285

ratio :  16.81047143148837  and  62.939711107735384


# Building training and testing data

In [6]:
labels  = cache.load('labels_96k')
content_lsa = cache.load('feature_content_lsa_96k')
description_lsa = cache.load('feature_description_lsa_96k')
title_lsa = cache.load('feature_title_lsa_96k')
content_w2v = cache.load('feature_content_w2v_96k')
title_w2v = cache.load('feature_title_w2v_96k')
description_w2v = cache.load('feature_description_w2v_96k')


In [11]:
XY = []
for i in range(len(labels)):
    temp = [item for item in np.concatenate((content_lsa[i],description_lsa[i],title_lsa[i],content_w2v[i],title_w2v[i],description_w2v[i],labels['thematic_value'][i]),axis=None)]
    if len(temp)==112:
        XY+=[temp]
XY = np.array(XY)

(95495, 112)

In [13]:
np.random.seed(10000)
np.random.shuffle(XY)

X = XY[:,:-1]
Y = XY[:,-1]

In [14]:
split = (len(X)*8)//10

X_train = X[:split,:]
X_test = X[split+1:,:]
Y_train = Y[:split]
Y_test = Y[split+1:]

# Random forest

In [20]:
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.ensemble import RandomForestClassifier as rfc

In [21]:
forest = rfc(n_estimators=200, max_depth=5,random_state=0)
forest.fit(X_train, Y_train)
forest.score(X_test, Y_test)

0.4921981359304639

In [31]:
forests = {}
scores = []
for i in range(11):
    
    Y_train_temp = np.array([int(item==i) for item in Y_train])
    Y_test_temp = np.array([int(item==i) for item in Y_test])
    
    
    forests[i] = rfc(n_estimators=200, max_depth=5,random_state=0)
    forests[i].fit(X_train, Y_train_temp)
    scores.append(forests[i].score(X_test, Y_test_temp))
    print('le score pour la classe ' + str(i) + ' est de ' + str(scores[i]) + ' %')


le score pour la classe 0 est de 0.9042831710126715 %
le score pour la classe 1 est de 0.8988899361189653 %
le score pour la classe 2 est de 0.9369567493978427 %
le score pour la classe 3 est de 0.8756937899256466 %
le score pour la classe 4 est de 0.9104094669598911 %
le score pour la classe 5 est de 0.9224526128390408 %
le score pour la classe 6 est de 0.9317205990156038 %
le score pour la classe 7 est de 0.9360142423290397 %
le score pour la classe 8 est de 0.9582678814535553 %
le score pour la classe 9 est de 0.9452822285056027 %
le score pour la classe 10 est de 0.9451775054979579 %


# Keras NN

In [17]:
import tensorflow
import keras
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import activations
from tensorflow.keras.callbacks import TensorBoard
import time
from time import clock
from tensorflow.keras.utils import normalize, to_categorical
import pandas as pd
from keras.utils import to_categorical

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [18]:
Y_cat = to_categorical(Y)
Y_train_cat = Y_cat[:split,:]
Y_test_cat = Y_cat[split+1:,:]


In [None]:
Y_cat.shape

In [19]:
name=f'project-{int(time.time())}'


model=Sequential()

model.add(Dense(32, activation='relu'))
model.add(Dense(256,activation='relu'))
model.add(Dense(256,activation='relu'))

model.add(Dense(11,activation='softmax'))

ada = keras.optimizers.Adagrad()

model.compile(loss='categorical_crossentropy',
              optimizer=ada,
              metrics=["accuracy"])
model.fit(X_train, Y_train_cat, batch_size=32, epochs=20, validation_data=(X_test, Y_test_cat))

Train on 76396 samples, validate on 19098 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1fdd85343c8>