In [1]:
# Import Python libraries
import joblib
import numpy as np
import pandas as pd
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn import model_selection
from sklearn.model_selection import train_test_split
# Keras (feed-forward neural network)
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras import layers
from keras.backend import clear_session
from keras import backend as K

# Library for PEP8 standard
from nbpep8.nbpep8 import pep8

In [2]:
# Load the pre-processed dataset. The columns listed below are run through
# literal_eval while reading so that their text representation is turned
# back into Python objects.
list_columns = ('Title', 'Body', 'Tags_list')
data = pd.read_csv(
    "projet5_to_modelise_final.csv",
    sep=";",
    index_col=0,
    converters={column: literal_eval for column in list_columns},
)

data.head()

Unnamed: 0_level_0,Title,Body,Tags_list
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3321007,"[android, netbeans]","[net, bean, hand, android, apps, android, sdk,...",[android]
3321011,"[xl, xlsx, excel, file, ruby]","[gem, xl, xlsx, file, spreadsheet, parseexcel,...","[ruby, excel]"
3321029,"[python, inheritance, tree, run, time]","[serialization, deserialization, code, python,...",[python]
3321039,"[android, market, work, developer]","[quetions, license, application, profit, sale,...",[android]
3321082,"[excel, datatable, xml]","[visual, studio, excel, sheet, open, xml, sdk,...",[c#]


In [3]:
# Use the shorter name 'Tags' for the target column
data = data.rename({'Tags_list': 'Tags'}, axis="columns")

In [4]:
# Combine each row's Title and Body values (title first) into a single
# "Doc" value that serves as the model input
title_tokens = data["Title"]
body_tokens = data["Body"]
data["Doc"] = title_tokens + body_tokens
data["Doc"].head()

Id
3321007    [android, netbeans, net, bean, hand, android, ...
3321011    [xl, xlsx, excel, file, ruby, gem, xl, xlsx, f...
3321029    [python, inheritance, tree, run, time, seriali...
3321039    [android, market, work, developer, quetions, l...
3321082    [excel, datatable, xml, visual, studio, excel,...
Name: Doc, dtype: object

In [5]:
# Define X and y
X = data["Doc"]
y = data["Tags"]

# TF-IDF vectorizer for Doc. The "preprocessor" joins the items of each
# document into one space-separated string before the vectorizer analyses
# it; no lowercasing and no stop-word removal are applied here.
# NOTE(review): the vectorizer is fitted on the full corpus, i.e. before the
# train/test split done later in the notebook, so the learned vocabulary and
# document frequencies also reflect the test documents.
vectorizer = TfidfVectorizer(analyzer="word",
                             max_df=.6,
                             min_df=0.005,
                             tokenizer=None,
                             preprocessor=' '.join,
                             stop_words=None,
                             lowercase=False)

# fit_transform learns the vocabulary / IDF weights and builds the matrix in
# a single pass, instead of analysing the whole corpus twice (fit, then
# transform). The fitted `vectorizer` is still available afterwards.
X_tfidf = vectorizer.fit_transform(X)

print("Shape of X for Doc: {}".format(X_tfidf.shape))

# Multilabel binarizer for targets: one indicator column per distinct tag
multilabel_binarizer = MultiLabelBinarizer()
y_binarized = multilabel_binarizer.fit_transform(y)

print("Shape of y: {}".format(y_binarized.shape))


Shape of X for Doc: (403252, 506)
Shape of y: (403252, 100)


In [6]:
# Hold out 30% of the samples as a test set; the fixed random_state makes
# the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y_binarized, test_size=0.3, random_state=8)

# Report the shape of every split
for shape_message, split_matrix in (("X_train shape : {}", X_train),
                                    ("X_test shape : {}", X_test),
                                    ("y_train shape : {}", y_train),
                                    ("y_test shape : {}", y_test)):
    print(shape_message.format(split_matrix.shape))

X_train shape : (282276, 506)
X_test shape : (120976, 506)
y_train shape : (282276, 100)
y_test shape : (120976, 100)


In [7]:
def recall_m(y_true, y_pred):
    """
    Recall computed over all entries of the given tensors.

    Both the element-wise product ``y_true * y_pred`` and ``y_true`` are
    clipped to [0, 1] and rounded before being summed, so only entries that
    round to 1 are counted. ``K.epsilon()`` is added to the denominator to
    avoid dividing by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual_positives + K.epsilon())

def precision_m(y_true, y_pred):
    """
    Precision computed over all entries of the given tensors.

    Both the element-wise product ``y_true * y_pred`` and ``y_pred`` are
    clipped to [0, 1] and rounded before being summed, so only entries that
    round to 1 are counted. ``K.epsilon()`` is added to the denominator to
    avoid dividing by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged_positives + K.epsilon())

def f1_m(y_true, y_pred):
    """
    F1 score: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` in the denominator keeps the result defined when both
    precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    half_f1 = (p * r) / (p + r + K.epsilon())
    return 2 * half_f1

def jaccard_m(y_true, y_pred, smooth=100):
    """
    Smoothed Jaccard distance along the last axis.

    The smoothed Jaccard ratio is
    ``(intersection + smooth) / (union + smooth)`` where the intersection is
    the sum of ``|y_true * y_pred|`` and the union is the sum of
    ``|y_true| + |y_pred|`` minus that intersection.

    Note that the value returned is ``(1 - ratio) * smooth``, i.e. a
    distance scaled by ``smooth``: lower values mean more overlap.
    """
    overlap = K.sum(K.abs(y_true * y_pred), axis=-1)
    total = K.sum(K.abs(y_true) + K.abs(y_pred), axis=-1)
    union = total - overlap
    similarity = (overlap + smooth) / (union + smooth)
    return (1 - similarity) * smooth

In [8]:
def build_nn(input_dim, hidden_neurons, output_dim):
    """
    Build and compile a feed-forward Keras network for multi-label output.

    The network stacks two ReLU hidden layers of equal width, each followed
    by dropout (0.2, then 0.1), and a sigmoid output layer. It is compiled
    with binary cross-entropy, the Adam optimizer and the accuracy / recall /
    precision / F1 / Jaccard metrics defined in this notebook.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    hidden_neurons : int
        Number of units in each of the two hidden layers.
    output_dim : int
        Number of output units.

    Returns
    -------
    The compiled Sequential model. Its summary is printed as a side effect.
    """
    model = Sequential()

    # Only the first layer declares the input size
    model.add(layers.Dense(hidden_neurons,
                           input_dim=input_dim,
                           activation='relu'))
    model.add(layers.Dropout(0.2))

    # Later layers take their input size from the previous layer, so the
    # redundant `input_dim` argument that used to be passed here is dropped
    model.add(layers.Dense(hidden_neurons,
                           activation='relu'))
    model.add(layers.Dropout(0.1))

    # Sigmoid output layer
    model.add(layers.Dense(output_dim,
                           activation='sigmoid'))

    # Compile model
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', recall_m, precision_m, f1_m, jaccard_m])
    model.summary()

    return model

In [9]:
# Reset the Keras session state before building a new model
clear_session()

# Layer sizes: input and output widths follow the training matrices,
# the hidden width is fixed at 150 units
model_params = dict(
    input_dim=X_train.shape[1],
    hidden_neurons=150,
    output_dim=y_train.shape[1],
)

keras_model = build_nn(**model_params)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 150)               76050     
                                                                 
 dropout (Dropout)           (None, 150)               0         
                                                                 
 dense_1 (Dense)             (None, 150)               22650     
                                                                 
 dropout_1 (Dropout)         (None, 150)               0         
                                                                 
 dense_2 (Dense)             (None, 100)               15100     
                                                                 
Total params: 113,800
Trainable params: 113,800
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Train silently for 50 epochs with mini-batches of 256 samples. The sparse
# TF-IDF matrices are converted to dense arrays before being passed to
# Keras, and the test split is passed as validation data.
fit_options = dict(epochs=50, batch_size=256, verbose=0)
history = keras_model.fit(
    X_train.toarray(),
    y_train,
    validation_data=(X_test.toarray(), y_test),
    shuffle=True,
    **fit_options
)


In [11]:
# Predict with the Keras model on the test set, then turn the outputs into
# 0/1 labels: an output of at least `decision_threshold` counts as 1
decision_threshold = 0.5
test_probabilities = keras_model.predict(X_test.toarray())
y_test_predicted_labels_tfidf_keras = np.where(
    test_probabilities >= decision_threshold, 1, 0)

In [12]:
# Map the binary indicator rows back to tuples of tag names, for both the
# predictions and the ground truth
y_test_pred_inversed_keras = multilabel_binarizer.inverse_transform(
    y_test_predicted_labels_tfidf_keras)
y_test_inversed = multilabel_binarizer.inverse_transform(y_test)

# Show the first five predictions next to the true tags
separator = "-" * 50
print(separator)
print("Print 5 first predicted Tags vs true Tags")
print(separator)
print("Predicted:", y_test_pred_inversed_keras[:5])
print("True:", y_test_inversed[:5])


--------------------------------------------------
Print 5 first predicted Tags vs true Tags
--------------------------------------------------
Predicted: [('android',), (), ('haskell',), ('php',), ('android',)]
True: [('android',), ('java',), ('haskell',), ('php',), ('android',)]


In [13]:
# Persist the trained model together with the fitted vectorizer and label
# binarizer.
# NOTE(review): the Keras model is pickled through joblib like the
# scikit-learn objects; confirm that it reloads correctly (it was compiled
# with custom metric functions) - Keras documents `model.save()` as its own
# persistence route.
artifacts = (
    (keras_model, 'keras_nlp_model.pkl'),
    (vectorizer, 'tfidf_vectorizer.pkl'),
    (multilabel_binarizer, 'multilabel_binarizer.pkl'),
)
written_files = [joblib.dump(artifact, filename)
                 for artifact, filename in artifacts]

# Display the list returned by the last dump
written_files[-1]

['multilabel_binarizer.pkl']