In [1]:
import re
import nltk
import string
import warnings
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [22]:
import tensorflow as tf
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D
from sklearn.model_selection import train_test_split

In [2]:
pd.set_option("display.max_colwidth", 200) 
warnings.filterwarnings("ignore", category=DeprecationWarning) 

%matplotlib inline

## Loading and Preparing the Data for the Model

In [172]:
# Raw strings keep the Windows backslashes literal; the previous non-raw
# literals relied on "\M" and "\A" being unrecognised escapes, which Python
# reports as invalid escape sequences. The resulting paths are unchanged.
DATA_DIR = r"D:\Machine Learning\Analytics Vidhya"  # adjust to the local data folder

train = pd.read_csv(DATA_DIR + r"\train_E6oV3lV.csv")
test = pd.read_csv(DATA_DIR + r"\test_tweets_anuFYb8.csv")

In [173]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,2,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦
4,5,0,factsguide: society now #motivation


In [175]:
# Stack train and test into one frame so the same text cleaning is applied to
# both. DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat builds the same frame with a fresh 0..n-1 index.
combi = pd.concat([train, test], ignore_index=True)
combi.shape

(49159, 3)

In [176]:
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression whose matches are deleted.

    Returns
    -------
    str
        The text with all non-overlapping matches replaced by the empty string.

    Notes
    -----
    The substitution uses ``pattern`` itself. Feeding each *matched string*
    back into ``re.sub`` as a new regex breaks as soon as a match contains a
    metacharacter such as ``$``, ``(`` or ``+``: it either silently fails to
    remove the text or raises ``re.error``.
    """
    return re.sub(pattern, '', input_txt)

In [177]:
# 1) Strip @handles. The raw string avoids the invalid "\w" escape in a normal literal.
combi['tidy_tweet'] = np.vectorize(remove_pattern)(combi['tweet'], r"@[\w]*")
# 2) Replace everything except letters and '#' with a space. regex=True is
#    required explicitly: from pandas 2.0 str.replace treats the pattern as a
#    literal string by default, which would leave the text untouched.
combi['tidy_tweet'] = combi['tidy_tweet'].str.replace(r"[^a-zA-Z#]", " ", regex=True)
# 3) Drop short tokens (3 characters or fewer) and re-join with single spaces.
combi['tidy_tweet'] = combi['tidy_tweet'].apply(lambda x: " ".join([w for w in x.split() if len(w)>3]))

In [178]:
# Cleaned tweets as an array, plus the whitespace-token count of each one.
tweets = combi['tidy_tweet'].values
tweets_len = [len(words) for words in map(str.split, tweets)]

In [179]:
# Largest token count among the cleaned tweets.
max(tweets_len)

20

In [180]:
# Compare the raw `tweet` column with the cleaned `tidy_tweet` column.
combi.head()

Unnamed: 0,id,label,tweet,tidy_tweet
0,1,0.0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run,when father dysfunctional selfish drags kids into dysfunction #run
1,2,0.0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked,thanks #lyft credit cause they offer wheelchair vans #disapointed #getthanked
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦,#model love take with time
4,5,0.0,factsguide: society now #motivation,factsguide society #motivation


In [181]:
# The training rows are the first len(train) rows of `combi`, because train was
# stacked before test. Deriving the boundary from `train` avoids a hard-coded
# row count that silently goes stale if the input file changes.
combi_train = combi.iloc[:len(train), :]

### Tokenizing and Padding

In [223]:
max_features = 400000  # vocabulary cap given to the tokenizer; also the number of rows in the embedding matrix
max_text_length = 21  # length every token sequence is padded/truncated to

In [224]:
# Hold out 15% of the labelled tweets for validation, stratified on the label
# so both splits keep the same class balance; the seed fixes the split.
X_train, X_val, y_train, y_val = train_test_split(
    combi_train["tidy_tweet"].values,
    combi_train["label"].values,
    test_size=0.15,
    random_state=17,
    stratify=combi_train["label"].values,
)

In [225]:
# Word -> integer index mapping; only the `max_features` most frequent words
# are kept when texts are converted to sequences, the rest are discarded.
# NOTE(review): the vocabulary is fitted on all labelled tweets (train + validation).
x_tokenizer = text.Tokenizer(num_words=max_features)
x_tokenizer.fit_on_texts(combi_train["tidy_tweet"].tolist())

In [226]:
# Step 1: map each split's text to integer sequences with the fitted tokenizer.
x_train_tokenized = x_tokenizer.texts_to_sequences(X_train)
x_val_tokenized = x_tokenizer.texts_to_sequences(X_val)

# Step 2: pad/truncate every sequence to the same fixed length.
x_train = sequence.pad_sequences(x_train_tokenized, maxlen=max_text_length)
x_val = sequence.pad_sequences(x_val_tokenized, maxlen=max_text_length)

In [186]:
embedding_dim = 100  # must match the dimensionality of the vectors file (the 100d file)
embeddings_index = {}  # word -> float32 vector

# Each line is "<word> <v1> <v2> ...". The context manager guarantees the
# file is closed even if a line fails to parse, which the previous manual
# open()/close() pair did not.
with open('C:\\Users\\Sayan6619\\Twitter Sentiment Analysis\\TEXT CLASSI WITH CONVO\\glove.6B.100d.txt',encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print(f'Found {len(embeddings_index)} word vectors')

Found 400000 word vectors


In [227]:
# Row i holds the pretrained vector for the word with tokenizer index i.
# Words without a pretrained vector keep an all-zero row.
embedding_matrix = np.zeros((max_features, embedding_dim))

for word, index in x_tokenizer.word_index.items():
    if index >= max_features:
        # Stop at the first index that falls outside the matrix.
        break
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[index] = embedding_vector

## Model

In [293]:
# Start the network with a frozen embedding layer initialised from the
# pretrained matrix, followed by light dropout.
model = Sequential()
model.add(
    Embedding(
        input_dim=max_features,
        output_dim=embedding_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,
    )
)
model.add(Dropout(rate=0.2))

In [294]:
filters = 250  # number of output channels for the Conv1D layers
kernel_size = 3  # window width of the first Conv1D layer
hidden_dims = 250  # units in the hidden Dense layer

In [295]:
# Convolutional feature extractor followed by a dense classification head.
# NOTE(review): the first Conv1D has no activation (linear) while the second
# uses ReLU — confirm this is intentional.
model.add(Conv1D(filters=filters, kernel_size=kernel_size, padding='valid'))
model.add(MaxPooling1D())

model.add(Conv1D(filters=filters, kernel_size=5, padding='valid', activation='relu'))
model.add(GlobalMaxPooling1D())

model.add(Dense(units=hidden_dims, activation='relu'))
model.add(Dropout(rate=0.2))

# Single sigmoid unit for the binary label.
model.add(Dense(units=1, activation='sigmoid'))
model.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, None, 100)         40000000  
_________________________________________________________________
dropout_26 (Dropout)         (None, None, 100)         0         
_________________________________________________________________
conv1d_26 (Conv1D)           (None, None, 250)         75250     
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, None, 250)         0         
_________________________________________________________________
conv1d_27 (Conv1D)           (None, None, 250)         312750    
_________________________________________________________________
global_max_pooling1d_13 (Glo (None, 250)               0         
_________________________________________________________________
dense_26 (Dense)             (None, 250)             

In [296]:
# Use the backend bundled with tensorflow.keras so it matches the
# tensorflow.keras layers/models used everywhere else in this notebook;
# mixing it with the standalone `keras` package can pair incompatible backends.
from tensorflow.keras import backend as K

In [297]:
def get_f1(y_true, y_pred):  # adapted from old Keras source code
    """F1 score computed with backend ops over the tensors passed in.

    Both inputs are clipped to [0, 1] and rounded before counting, so
    predictions are effectively thresholded at 0.5. A small epsilon in each
    denominator guards against division by zero.

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predicted scores.

    Returns:
        Scalar tensor: 2 * precision * recall / (precision + recall).
    """
    eps = K.epsilon()

    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_pos = K.sum(K.round(K.clip(y_true, 0, 1)))
    flagged_pos = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision = hits / (flagged_pos + eps)
    recall = hits / (actual_pos + eps)

    return 2*(precision*recall)/(precision+recall+eps)

In [298]:
# Binary cross-entropy with Adam; precision and recall are tracked per epoch.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
    ],
)

### Training the Model 

In [299]:
# x_train,x_val,y_train,y_val = train_test_split(x_train_val,y,test_size=0.15,random_state=1)

In [300]:
%%time

# batch_size=32
epoch=10

model.fit(x_train,y_train,
#          batch_size=batch_size,
         epochs=epoch,
         validation_data = (x_val,y_val))

Train on 27167 samples, validate on 4795 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Wall time: 3min 24s


<tensorflow.python.keras.callbacks.History at 0x2c8dfbcdb08>

In [313]:
# Harmonic mean (F1) of two hand-entered scores — presumably the precision and
# recall printed by the training run; TODO confirm which split they come from.
(2*0.9119*0.8305)/(0.9119+0.8305)

0.8692986111111112

### Predict on the Test Set

In [301]:
# Test tweets are the rows of `combi` after the len(train) training rows.
# The boundary is derived from `train` instead of a hard-coded row count, and
# .iloc makes the positional slicing explicit.
x_test = combi['tidy_tweet'].iloc[len(train):].values

In [302]:
# Encode the test tweets with the already-fitted tokenizer, then pad them to
# the same fixed length used for the training sequences.
x_test_tokenized = x_tokenizer.texts_to_sequences(x_test)
x_testing = sequence.pad_sequences(
    x_test_tokenized,
    maxlen=max_text_length,
)

In [303]:
# Sigmoid scores for every test tweet, predicted in batches of 32 with a progress bar.
y_testing = model.predict(
    x_testing,
    batch_size=32,
    verbose=1,
)



In [304]:
# Shape of the prediction array.
y_testing.shape

(17197, 1)

In [305]:
# Prediction for the first test tweet.
y_testing[0]

array([0.00313842], dtype=float32)

In [306]:
# Threshold the first (only) score column at 0.5 to get a text label per test tweet.
test['label'] = [
    'non_racist' if score < .5 else 'racist'
    for score in y_testing[:, 0]
]

In [309]:
# Show the test tweets that were labelled 'racist'.
test.loc[test['label']=='racist']

Unnamed: 0,id,tweet,label
1,31964,@user #white #supremacists want everyone to see the new â #birdsâ #movie â and hereâs why,racist
10,31973,1000dayswasted - narcosis infinite ep.. make me aware.. grinding neuro bass #lifestyle,racist
19,31982,thought factory: bbc neutrality on right wing fascism #politics #media #blm #brexit #trump #leadership &gt;3,racist
26,31989,chick gets fucked hottest naked lady,racist
33,31996,suppo the #taiji fisherman! no bullying! no racism! #tweet4taiji #thecove #seashepherd,racist
...,...,...,...
17161,49124,@user fuck yes!! @user mr money in the bank ðð so dam proud!! @user #mitb #ambroseasylum,racist
17176,49139,@user @user are the most racist pay ever!!!!!,racist
17188,49151,"black professor demonizes, proposes nazi style confiscation of ""white"" assets; like 1930's germany #breaking",racist
17192,49155,thought factory: left-right polarisation! #trump #uselections2016 #leadership #politics #brexit #blm &gt;3,racist


In [310]:
# Number of test tweets labelled 'non_racist'.
len(test.loc[test['label']=='non_racist'])

16135

In [311]:
# Number of test tweets labelled 'racist'.
len(test.loc[test['label']=='racist'])

1062

In [316]:
# model.save('Conv1d_text_classifier.h5',model)

In [317]:
 # Write the trained model to the 'Conv1d_text_classifier' path.
 tf.keras.models.save_model(model,'Conv1d_text_classifier')

INFO:tensorflow:Assets written to: Conv1d_text_classifier\assets


In [320]:
# Load the saved model back from disk — presumably a round-trip sanity check.
m = tf.keras.models.load_model('Conv1d_text_classifier')