**[GET DATA HERE](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data)**

In [1]:
import pandas as pd
import numpy as np
import time
import pickle

from imblearn.combine import SMOTETomek
from keras.utils import to_categorical
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.mixture import GaussianMixture
from sklearn import neighbors, ensemble,tree
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedShuffleSplit

Using TensorFlow backend.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import os
os.chdir('/content/drive/My Drive/Colab Notebooks')

In [4]:
#import test.csv
toxic = pd.read_csv('train.csv')

#feature engineering
toxic['toxic_total'] = (toxic['toxic'] + toxic['severe_toxic'] +
                        toxic['obscene'] + toxic['threat'] + 
                        toxic['insult'] + toxic['identity_hate'])
toxic['toxic_bool'] = np.where((toxic['toxic'] + toxic['severe_toxic'] +
                        toxic['obscene'] + toxic['threat'] + 
                        toxic['insult'] + toxic['identity_hate'])>0,1,0)
toxic.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,toxic_total,toxic_bool
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0,0


In [0]:
# Define the features and the outcome.
X = toxic['comment_text']
y = toxic['toxic']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y, 
                                                    test_size=0.25, 
                                                    random_state=42,
                                                    stratify=y)

In [6]:
y_train.head()

37665     1
124594    0
132186    0
149552    0
76104     0
Name: toxic, dtype: int64

In [7]:
start_time = time.time()


vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=3, # only use words that appear at least three times
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )

#Applying the vectorizer
toxic_tfidf_train = vectorizer.fit_transform(X_train)
toxic_tfidf_test = vectorizer.transform(X_test)
print("vectorizer complete")

#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = toxic_tfidf_train.tocsr()
X_test_tfidf_csr = toxic_tfidf_test.tocsr()
print('tocsr complete')

#number of sentences
n = X_train_tfidf_csr.shape[0]
#A list of dictionaries, one per sentence
tfidf_bysent = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each sentence, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bysent[i][terms[j]] = X_train_tfidf_csr[i, j]
print('tf-idf complete')
    
# Normalize the data.
X_train_norm = normalize(X_train_tfidf_csr)
X_test_norm = normalize(X_test_tfidf_csr)
print('normalization complete')

# Convert y to categorical
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
print('to_categorical complete')
t= round((time.time() - start_time),4)
print("\n -- %s seconds for results--" % t)

vectorizer complete
tocsr complete
tf-idf complete
normalization complete
to_categorical complete

 -- 57.0011 seconds for results--


In [8]:
y_test

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [0]:
#pickle.dump(vectorizer,open('tf_idf_vectorize.sav','wb'))
#vectorizer = pickle.load(open('tf_idf_vectorize.sav','rb'))

In [0]:
#X_train_vec = vectorizer.transform(X_train)
#X_test_vec = vectorizer.transform(X_test)

In [11]:
start_time = time.time()
smt = SMOTETomek(random_state=42)
#class balancing
#only perform on the training set, this reduces bias in fitting the model.
X_res, y_res = smt.fit_resample(X_train_norm, y_train)
#print(pd.Series(y_res).value_counts())

t= round((time.time() - start_time),4)
print("\n -- %s seconds for results--" % t)


 -- 1401.6027 seconds for results--


In [0]:
#save transformed and class-balanced training set
#pickle.dump(X_res,open('X_res_n.pickle','wb'))
#pickle.dump(y_res,open('y_res_n.pickle','wb'))

In [0]:
#load pickled X_res and y_res
X_pro = pickle.load(open('X_res_n.pickle','rb'))
y_pro = pickle.load(open('y_res_n.pickle','rb'))

In [10]:
X_pro.shape

(216390, 44443)

In [11]:
y_pro.shape

(216390, 2)

In [0]:
import tensorflow as tf
import keras
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Import various componenets for model building
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.layers import LSTM, Input, TimeDistributed
from keras.models import Model
from keras.optimizers import RMSprop

# Import the backend
from keras import backend as K

In [0]:
#from sklearn.preprocessing import OneHotEncoder
#enc = OneHotEncoder()
#enc.fit_transform(X_res.toarray())

In [13]:
# Initialize the constructor
model = Sequential()

# Add an input layer 
model.add(Dense(100, activation='relu', input_dim= X_pro.shape[1] ))

# Add a hidden layer 
model.add(Dense(100, activation='relu'))

# Add a hidden layer 
model.add(Dense(100, activation='relu'))

# Add an output layer 
model.add(Dense(2, activation='softmax'))

model.summary()

W0719 02:39:03.813142 139667886278528 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0719 02:39:03.837331 139667886278528 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0719 02:39:03.841295 139667886278528 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 100)               4444400   
_________________________________________________________________
dense_2 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_3 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 202       
Total params: 4,464,802
Trainable params: 4,464,802
Non-trainable params: 0
_________________________________________________________________


In [14]:
model.compile(loss='mse',
              optimizer='adam',
              metrics=['accuracy'])
                   
model.fit(X_pro, y_pro,epochs=3, batch_size=512, verbose=1,validation_data=(X_test_norm, y_test))
score = model.evaluate(X_test_norm, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

W0719 02:39:20.729770 139667886278528 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0719 02:39:20.877304 139667886278528 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.

W0719 02:39:20.991807 139667886278528 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:973: The name tf.assign is deprecated. Please use tf.compat.v1.assign instead.



Train on 216390 samples, validate on 39893 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test loss: 0.04507300149703966
Test accuracy: 0.9471335823342342


In [0]:
pickle.dump(model,open('tensorflow.sav','wb'))