In [1]:
import pandas as pd
import numpy as np
import tensorflow
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPool1D, GlobalMaxPool1D, Embedding, Activation
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load train.csv into `df` from a hard-coded absolute Windows path.
df = pd.read_csv(r"D:\Datasets\NLP_text_mining\train.csv")

In [3]:
df.toxic.value_counts() # imbalanced classes: 0 = non-toxic, 1 = toxic (the "(10)" in the original note presumably means ~10%)

0    144277
1     15294
Name: toxic, dtype: int64

# Preprocessing the text:

In [4]:
import re
def remove_link(x):
    """Return ``x`` with every run starting at ``http`` (up to the next whitespace) deleted."""
    link_pattern = re.compile(r"http\S+")
    return link_pattern.sub("", x)
def remove_emoji(x):
    """Return ``x`` with textual code-point placeholders such as ``<u+1f600>`` removed.

    The ``+`` is escaped so it is matched literally (unescaped it is a regex
    quantifier, which made the old pattern also delete tags like ``<ul>``), and
    only hexadecimal digits are accepted between ``<u+`` and ``>`` so a match
    can never extend to a later, unrelated ``>``.

    Args:
        x: Text to clean.

    Returns:
        The text without ``<u+XXXX>`` placeholders (matched case-insensitively).
    """
    return re.sub(r"<u\+[0-9a-f]+>", "", x, flags=re.IGNORECASE)

In [5]:
# Use the shorter column name "text" in place of "comment_text".
df = df.rename(columns={"comment_text": "text"})

In [6]:
# Normalise the comments step by step. Every pattern is a raw string so that
# regex escapes such as \w, \s and \d are not parsed as (invalid) Python
# string escapes.
df["text"] = df["text"].str.lower()
df["text"] = df["text"].apply(remove_emoji)
df["text"] = df["text"].apply(remove_link)
df["text"] = df["text"].replace(r"#\w+", "", regex=True)       # remove hashtags
df["text"] = df["text"].replace(r"@[^\s:]+", "", regex=True)   # remove mentions
df["text"] = df["text"].replace("\n", " ", regex=True)         # newlines become spaces
df["text"] = df["text"].replace("-", "", regex=True)
df["text"] = df["text"].replace("_", "", regex=True)
# \d+ removes the same characters as the former \d* without matching the
# empty string at every position.
df["text"] = df["text"].replace(r"\d+", "", regex=True)
df["text"] = df["text"].replace(r"\s+", " ", regex=True)       # collapse repeated whitespace

In [7]:
# Lookup table: lower-case English contraction -> expanded form.
# Every value is a single expansion; "you'd" previously mapped to
# "you had / you would", which inserted both alternatives (and a slash)
# into the text.
contractions = {
    "ain't": "are not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "I would",
    "i'd've": "I would have",
    "i'll": "I will",
    "i'll've": "I will have",
    "i'm": "I am",
    "i've": "I have",
    "isn't": "is not",
    "it'd": "it had",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she had",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

In [8]:
def remove_contraction(text, mapping=None):
    """Expand contractions in ``text`` one whitespace-delimited token at a time.

    Each token is looked up (lower-cased) in ``mapping`` and replaced by its
    expansion when present. Working per token avoids the substring side effects
    of ``str.replace``: expanding "he's" no longer rewrites "ache's", and
    expanding "can't" no longer breaks a later "can't've".
    Whitespace between tokens is preserved exactly.

    Args:
        text: The string to process.
        mapping: Optional dict of lower-case contraction -> expansion.
            Defaults to the module-level ``contractions`` table.

    Returns:
        The text with every known contraction token expanded.
    """
    if mapping is None:
        mapping = contractions

    def _expand(match):
        token = match.group(0)
        return mapping.get(token.lower(), token)

    return re.sub(r"\S+", _expand, text)

In [9]:
# Run remove_contraction on every row of the text column.
df.text = df.text.apply(remove_contraction)

In [10]:
# Replace every character that is neither a word character nor whitespace with
# a space. Raw string: \w and \s are regex escapes, not Python string escapes.
df.text = df.text.replace(r'[^\w\s]', ' ', regex=True)

In [11]:
# Collapse runs of whitespace into a single space (raw string for the \s escape).
df.text = df.text.replace(r'\s+', ' ', regex=True)

In [12]:
import spacy
# NOTE(review): the original comment here said "for Italian", but the model
# loaded is named en_core_web_sm — confirm the intended language.
nlp = spacy.load("en_core_web_sm")
def map_nlp(x):
    """Run the module-level spaCy pipeline ``nlp`` on ``x`` and return its result."""
    return nlp(x)
# Alias (not a copy) of the pipeline's default stop-word collection: additions
# made through `stopwords` land in that same object.
stopwords = nlp.Defaults.stop_words

In [13]:
# Extra tokens to treat as stop words in addition to spaCy's defaults.
# Each token is listed once; the earlier list repeated "s", "alll" and "ah".
other_stop = ["l", "c", "i", "eh", "fr", "e", "no", "s", "o", "a", "u", "so", "b", "ü", "nah", "n", "nd", "v",
              "th", "alll", "boo", "bo", "d", "lol", "ah", "ahh", "tã", "ur", "la", "hi", "mmm"]

In [14]:
# Add the custom tokens to the stop-word set. A single bulk update is enough;
# the former per-item add() loop followed by update() did the same work twice.
stopwords.update(other_stop)

# Mark every stop word on its vocabulary entry so that token.is_stop reports it.
for stopword in stopwords:
    nlp.vocab[stopword].is_stop = True
    
def remove_stopwords(x):
    """Return a list of the items of ``x`` whose ``is_stop`` attribute is falsy, in order."""
    kept = []
    for token in x:
        if token.is_stop:
            continue
        kept.append(token)
    return kept
def pos(x):
    """Return the ``pos_`` attribute of every item of ``x``, in order."""
    tags = []
    for token in x:
        tags.append(token.pos_)
    return tags
def ner(x):
    """Return the ``label_`` attribute of every item in ``x.ents``, in order."""
    labels = []
    for entity in x.ents:
        labels.append(entity.label_)
    return labels

In [15]:
# Run the spaCy pipeline (via map_nlp) on every row of the text column.
df["Nlp_text"] = df.text.apply(map_nlp)

In [None]:
# Per row, keep only the tokens that are not flagged as stop words.
df["Nlp_text_nostopw"] = df["Nlp_text"].apply(remove_stopwords)

In [None]:
# Per row, collect the pos_ value of every token.
df["Pos_text"] = df["Nlp_text"].apply(pos)

In [None]:
# Per row, collect the label_ value of every item in .ents.
df["Ner_text"] = df["Nlp_text"].apply(ner)

In [None]:
def remove_comma(x):
    """Return the items of ``x`` whose text is neither empty nor whitespace-only.

    Items are converted with ``str()`` before the blank check, so the function
    works for plain strings and for objects that have a text form but no
    ``.strip()`` method. The items themselves are returned unchanged.

    Args:
        x: Iterable of strings or string-convertible items.

    Returns:
        A list with the non-blank items, in their original order.
    """
    return [item for item in x if str(item).strip()]

In [None]:
# Drop blank items from every row of the stop-word-filtered column.
# NOTE(review): remove_comma calls .strip() on each item, while the rows here
# hold whatever remove_stopwords returned (presumably spaCy tokens) — confirm
# that those items support .strip().
df["Nlp_text_nostopw"] = df["Nlp_text_nostopw"].apply(remove_comma)

In [None]:
#df.to_csv(r"D:\Datasets\NLP_text_mining\preprocessed_text_toxic.csv")

In [2]:
# Reload the preprocessed dataset from a hard-coded absolute Windows path.
# NOTE(review): the execution counter restarts at In [2] here, so this part
# presumably ran in a fresh kernel after the import cell — confirm.
df = pd.read_csv(r"D:\Datasets\NLP_text_mining\preprocessed_text_toxic.csv")

In [3]:
# Remove digits. Raw string so \d is a regex escape, and \d+ instead of \d*
# so the pattern does not also match the empty string at every position.
df.text = df.text.replace(r"\d+", "", regex=True)

In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Turn each comment into a sequence of word indices, keeping only the most
# frequent words. `num_words` is the current name of the argument that used to
# be called `nb_words`.
tokenizer = Tokenizer(num_words=100000)

# Missing texts (read_csv loads empty fields as NaN by default) are mapped to
# "" first; astype(str) alone would turn them into the literal word "nan".
texts = df['text'].fillna("").astype(str)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# word_index holds every word seen during fitting.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Pad/truncate every sequence to a fixed length of 30 indices.
data = pad_sequences(sequences, maxlen=30)

# One-hot encode the 0/1 `toxic` column.
labels = to_categorical(np.asarray(df.toxic))

Using TensorFlow backend.


Found 184883 unique tokens.


In [5]:
# Report the dimensions of the padded inputs and of the one-hot labels.
print(f"Shape of data tensor: {data.shape}")
print(f"Shape of label tensor: {labels.shape}")

Shape of data tensor: (159571, 30)
Shape of label tensor: (159571, 2)


In [6]:
# from sklearn.utils import shuffle
# data, labels = shuffle(data, labels, random_state=1)

In [7]:
# Split the data into a training set and a validation set (last 20% of a
# shuffled copy). The permutation comes from a seeded generator so the split
# is the same on every run.
SPLIT_SEED = 42
indices = np.random.RandomState(SPLIT_SEED).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(0.20 * data.shape[0])

# Slice by an explicit boundary: with `[:-n]` a validation size of 0 would
# produce an empty training set.
split_at = data.shape[0] - nb_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

In [8]:
# from sklearn.model_selection import train_test_split
# x_train,x_test, y_train, y_test = train_test_split(data, labels, stratify = labels, test_size = 0.33)

In [9]:
from urllib.request import urlopen
import gzip
# fastText: open a gzip stream over the remote vectors file.
# NOTE(review): the URL points at `cc.ro.300.vec.gz`, and `file` is not read or
# closed in any visible cell, while the next cell loads GloVe vectors from
# disk — confirm whether this download is still needed.
file = gzip.open(urlopen('https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ro.300.vec.gz'))

In [10]:
# Load the whole embedding file into memory as {word: float32 vector}.
# `lista` records the words in file order.
embeddings_index = {}
lista = []
# The context manager closes the file even if a line fails to parse.
with open("D:/Datasets/embeddings/glove.6B.200d.txt", encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        lista.append(word)
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [11]:
# One 200-dimensional row per tokenizer index (plus row 0); rows start at zero
# and are filled only for words that have a pretrained vector.
embedding_matrix = np.zeros((len(word_index) + 1, 200))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # No pretrained vector for this word: its row stays all-zeros.
        continue
    embedding_matrix[row] = vector

##### Imbalanced data: about 10% toxic comments, 90% non-toxic comments

In [12]:
from tensorflow.keras.models import Sequential

In [13]:
from sklearn.utils import class_weight
# y_train is one-hot (two columns). compute_sample_weight treats a 2-D y as
# multi-output and multiplies the per-column weights, so pass the class index
# of each row to get plain 'balanced' weights.
sample_weights = class_weight.compute_sample_weight('balanced', y_train.argmax(axis=-1))

In [14]:
import tensorflow.keras as ks
from tensorflow.keras.optimizers import SGD
# Candidate optimizers; `learning_rate` replaces the deprecated `lr` keyword.
# Only `opt` is passed to compile() in the visible cells.
opt = ks.optimizers.Nadam(learning_rate=0.001, beta_1=0.9, beta_2=0.99)
opt2 = SGD(learning_rate=0.001, momentum=0.9, nesterov=True, decay=0.001) # alternative tried: learning_rate = 0.01, decay=0.0001
opt3 = ks.optimizers.RMSprop(learning_rate=0.1, rho=0.9)
opt4 = ks.optimizers.Adagrad(learning_rate=0.01)
opt5 = ks.optimizers.Adadelta(learning_rate=1.0, rho=0.95)
opt6 = ks.optimizers.Adam(learning_rate=0.001, beta_1=0.6, beta_2=0.999, amsgrad=False)
opt7 = ks.optimizers.Adamax(learning_rate=0.002, beta_1=0.9, beta_2=0.999)

In [15]:
from tensorflow.keras.layers import Flatten
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.initializers import Constant
# Frozen embedding layer initialised from the pretrained matrix.
embedding_layer = Embedding(
    len(word_index) + 1,
    200,
    weights=[embedding_matrix],
    input_length=30,
    trainable=False,
)

# Three Conv1D/pooling stages followed by a small dense classifier with a
# two-way softmax output.
keras_model = Sequential([
    embedding_layer,
    Conv1D(50, 32, activation='relu', padding='same', strides=1, kernel_initializer='glorot_normal'),
    MaxPool1D(),
    Conv1D(50, 16, activation='relu', padding='same', strides=1, kernel_initializer='glorot_normal'),
    MaxPool1D(pool_size=1),
    Dropout(0.3),
    Conv1D(16, 8, activation='relu', padding='same', strides=1, kernel_initializer='glorot_normal'),
    MaxPool1D(pool_size=4),
    Flatten(),
    Dense(64, activation="relu"),
    Dense(32, activation="relu"),  # option tried: kernel_regularizer = l1_l2(l1=0.1, l2=0.01)
    Dropout(0.5),
    Dense(2, activation="softmax"),
])
keras_model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer=opt)

In [16]:
# Print the layer-by-layer overview (output shapes and parameter counts).
keras_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 200)           36976800  
_________________________________________________________________
conv1d (Conv1D)              (None, 30, 50)            320050    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 15, 50)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 15, 50)            40050     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 15, 50)            0         
_________________________________________________________________
dropout (Dropout)            (None, 15, 50)            0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 15, 16)            6

In [17]:
# from tensorflow.keras.layers import Input
# from tensorflow.keras.models import Model
# from tensorflow.keras.layers import MaxPooling1D, Dropout
# sequence_input = Input(30, dtype='int32')
# embedding_layer = Embedding(len(word_index)+1, 200,weights=[embedding_matrix],
#                            input_length=30, trainable = False)(sequence_input)
# x = Conv1D(50, 32, activation='relu', padding='same', strides=1,kernel_initializer='uniform', 
#            kernel_regularizer = l1_l2(l1=0.1, l2=0.01))(embedding_layer)
# x = MaxPooling1D()(x)
# x = Conv1D(50, 16, activation='relu', padding='same', strides=1,kernel_initializer='uniform')(x)
# x = MaxPooling1D(2)(x)
# x = Conv1D(16, 8, activation='relu', padding='same', strides=1,kernel_initializer='uniform')(x)
# x = MaxPooling1D(4)(x)  # global max pooling
# x = Flatten()(x)
# x = Dense(128, activation='relu')(x)
# preds = Dense(2, activation='softmax')(x)

# model = Model(sequence_input, preds)
# model.compile(loss='sparse_categorical_crossentropy',
#               optimizer=opt,
#               metrics=['acc'])

# # happy learning!
# # model.fit(x_train, y_train, epochs=2, batch_size=128) #validation_data=(x_test, y_val)

In [18]:
# from imblearn.keras import BalancedBatchGenerator
# from imblearn.under_sampling import NearMiss
# training_generator = BalancedBatchGenerator(
#     x_train, y_train.argmax(axis = -1), sampler=NearMiss(), batch_size=256)
# keras_model.fit_generator(generator=training_generator,
#                                        epochs=10, verbose=1)

In [19]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
# ImageDataGenerator created with default arguments.
# NOTE(review): despite the image-oriented class, the only visible use is
# train_img_pro.flow(...) on the reshaped text sequences — confirm this is intended.
train_img_pro = ImageDataGenerator()

In [21]:
!pip install imblearn

Collecting imblearn
  Using cached https://files.pythonhosted.org/packages/81/a7/4179e6ebfd654bd0eac0b9c06125b8b4c96a9d0a8ff9e9507eb2a26d2d7e/imblearn-0.0-py2.py3-none-any.whl
Collecting imbalanced-learn
  Using cached https://files.pythonhosted.org/packages/eb/aa/eba717a14df36f0b6f000ebfaf24c3189cd7987130f66cc3513efead8c2a/imbalanced_learn-0.6.1-py3-none-any.whl
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.6.1 imblearn-0.0


In [22]:
from imblearn.under_sampling import RandomUnderSampler
def balanced_flow_from_directory(flow_from_directory):
    """Yield ``custom_balance(...)`` for every (batch, classes) pair of ``flow_from_directory``.

    Each batch is reshaped to 2-D, using its first two dimensions, before it is
    handed to ``custom_balance`` together with its classes.
    """
    for batch, batch_classes in flow_from_directory:
        n_rows, n_cols = batch.shape[0], batch.shape[1]
        flat_batch = batch.reshape(n_rows, n_cols)
        yield custom_balance(flat_batch, batch_classes)
            
def custom_balance(X, y):
    """Randomly undersample one batch so its classes are balanced.

    Args:
        X: 2-D array of samples.
        y: One-hot encoded labels for ``X``.

    Returns:
        Tuple ``(X_resampled, y_resampled)``; the labels are one-hot encoded
        again with the same number of columns as ``y``.
    """
    rus = RandomUnderSampler()
    # fit_resample is the current name of the method; fit_sample was only a
    # deprecated alias for it.
    X_resampled, y_resampled = rus.fit_resample(X, y.argmax(axis=-1))
    # Optional: X_resampled, y_resampled = shuffle(X_resampled, y_resampled)
    # If model_conv() is used instead, reshape the labels:
    #   y_resampled = y_resampled.reshape(y_resampled.shape[0], 1, 1, 1)
    # One-hot labels are what model_flat() expects. Fixing num_classes keeps
    # the label width stable even if a batch lacks the highest class index.
    y_resampled = to_categorical(y_resampled, num_classes=y.shape[-1])
    return X_resampled, y_resampled
# Add two trailing singleton axes to the padded sequences before handing them,
# together with their labels, to flow().
sequences_4d = x_train.reshape(x_train.shape[0], x_train.shape[1], 1, 1)
train_generator_flow = train_img_pro.flow(
    (sequences_4d, y_train),
    batch_size=256,
    shuffle=True,  # reorder the data randomly
)
# Wrap the batch stream so each batch passes through custom_balance.
train_generator_bal = balanced_flow_from_directory(train_generator_flow)

In [23]:
# Train on the balanced batch stream. No class_weight is passed: the name
# `class_weight` in this notebook is the module imported from sklearn.utils,
# not a {class: weight} mapping, and the batches are already undersampled.
keras_model.fit_generator(generator=train_generator_bal, epochs=10, verbose=1, steps_per_epoch=200)

Epoch 1/10


UnknownError: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above. [Op:Conv2D] name: sequential/conv1d/conv1d/

In [None]:
# Model outputs for the held-out validation inputs.
pred = keras_model.predict(x_val)

In [None]:
# Index of the highest-scoring output per row.
np.argmax(pred, axis=-1)

In [None]:
from sklearn.metrics import classification_report

In [None]:
from sklearn.metrics import classification_report
# Compare true and predicted class indices on the validation split.
y_true = y_val.argmax(axis=-1)
y_pred = pred.argmax(axis=-1)
print(classification_report(y_true, y_pred))