In [2]:

import pandas as pd
import numpy as np
import re
import string
import nltk
import matplotlib as plty
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import sklearn
import regex as re
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import time
from termcolor import colored
stop_words = set(stopwords.words('english'))


%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.express as px

import plotly.graph_objs as go
pd.set_option('display.max_colwidth', None)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mitra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mitra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
df = pd.read_csv("data/Sheet_1.csv")
df.dropna(inplace=True, axis=1 )
df.head(4)

Unnamed: 0,response_id,class,response_text
0,response_1,not_flagged,I try and avoid this sort of conflict
1,response_2,flagged,Had a friend open up to me about his mental addiction to weed and how it was taking over his life and making him depressed
2,response_3,flagged,"I saved a girl from suicide once. She was going to swallow a bunch of pills and I talked her out of it in a very calm, loving way."
3,response_4,not_flagged,i cant think of one really...i think i may have indirectly


In [4]:
fig = px.histogram(df, x='class', title='distribution of records')
fig.show()

Clean the data


In [5]:

def remove_stopwords(list_of_words):
    filtered_list = [w for w in list_of_words if not w.lower() in stop_words]
    return filtered_list

def check_punct(list_of_words):
    """
    look at the tokenized text. if there was any punctuation, it is redundant.
    """
    filtered_list = []
    for word in list_of_words:
        if re.findall("[()!><.,`?']", word):
            pass
        else: filtered_list.append(word)
        
    return filtered_list


In [6]:
def clean_and_tokenize(text):

    word_list = word_tokenize(text)
    word_list = remove_stopwords(word_list)
    word_list = check_punct(word_list)
    
    return word_list

In [7]:
df.loc[:, 'cleaned_word_list'] = df.loc[:, 'response_text'].apply(lambda t: clean_and_tokenize(t))

In [8]:
def join_tokens(df, tokens_arrays_col):
    return [" ".join(df.loc[i, tokens_arrays_col]) for i in range(len(df))]

df.loc[:, 'cleaned_text'] = join_tokens(df, 'cleaned_word_list')

In [9]:
df_f = df[df['class'] == 'flagged']
df_n = df[df['class'] == 'not_flagged']

Create a data frame with count and TFIDF scores for words

In [10]:
def count_words(tokens_arrays):
    """
    gets a dictionary and counts the values
    output: a sorted dict
    note: you can also use a bag of words package to do this
    """
    count_dict = {}
    for array_ in tokens_arrays:
        for word in array_:
            try: count_dict[word] +=1
            except: count_dict[word] = 1

    # sort 
    sorted_count_dict = {k:v for k,v in sorted(count_dict.items(), key=lambda item: item[1], reverse=True)}
    
    return sorted_count_dict

In [11]:
def get_n_key_and_value(n, dict_):

    """
    get the first - most frequent and important -
    words of dictionary 
    """
    keys = [k for (k, v) in dict_.items()][:n]
    values = [v for (k, v) in dict_.items()][:n]

    return keys, values

In [12]:
def convert_tokens_list_to_freq_df(tokens_arrays, n=-1):
    """
    gets the array of tokenized sentences
    output: a sorted dataframe with two cols
    the words and their frequency
    """

    dict_ = count_words(tokens_arrays)
    keys, values = get_n_key_and_value(n, dict_)

    df = pd.DataFrame({'words': keys, 'freq': values})

    return df

df_f_words_freq = convert_tokens_list_to_freq_df(df_f.cleaned_word_list)
df_n_words_freq = convert_tokens_list_to_freq_df(df_n.cleaned_word_list)

In [13]:
df_f_words_freq.head()

Unnamed: 0,words,freq
0,friend,13
1,people,10
2,friends,9
3,would,7
4,going,6


create TFIDF (Term Frequency – Inverse Document Frequency) scores


In [14]:
def get_tfidf_words_and_array(text_arrays):

    vectorizer = TfidfVectorizer()
    transformed_data = vectorizer.fit_transform(text_arrays).toarray()
    words = vectorizer.get_feature_names_out()
    
    return transformed_data, words

In [15]:
def create_tfidf_df(text_arrays):
    """
    gets the df, converts it into tfidf arrays and words
    then puts them in a dataset
    """

    transformed_data, words = get_tfidf_words_and_array(text_arrays)

    df = pd.DataFrame(data=transformed_data, columns=words).sum().reset_index()

    col_names = ['words', 'tfidf_score_sum']
    default_col_names = df.columns

    # rename whatever the df cols are called to the col_names
    df.rename(columns={default_col_names[i]:col_names[i] for i in range(len(col_names))}, inplace=True)

    return df

In [16]:
tfidf_scores_f = create_tfidf_df(df_f.loc[:, 'cleaned_text'])
tfidf_scores_n = create_tfidf_df(df_n.loc[:, 'cleaned_text'])

In [17]:
def merge(df_1, df_2, on='words'):
    return pd.merge(left=df_1, right=df_2, on=on, how='left')

In [18]:
f_words_df = merge(df_f_words_freq, tfidf_scores_f)
n_words_df = merge(df_n_words_freq, tfidf_scores_n)

### Why both Frequency and TFIDF score matter?
Based on the fact that words like 'people' appear in most records, it has a smaller tfidf score. However, it is appearing to be one of the most frequent words in flagged comments. So we can't focus only on one measurement. Also, a TFIDF score shows how much information a words brings. The more common, the more probable it is for a words to be a normal basic word which doesn't help in classification.

In [19]:
f_words_df.head(4)

Unnamed: 0,words,freq,tfidf_score_sum
0,friend,13,1.848679
1,people,10,0.877912
2,friends,9,1.279314
3,would,7,0.569055


In [20]:
# n most frequent
n = 30

In [21]:
fig = px.bar(
    data_frame=f_words_df[:n], x='words', y='freq', color='freq',
    text='tfidf_score_sum', labels={'x': 'words', 'y': 'frequency'},
    title='Frequency of words seen in <b> Flagged </b>records (with TFIDF in parentheses)' )

fig.update_xaxes(tickangle= -45)  
fig.update_traces(
    texttemplate='%{text:.2f}', textposition='outside',
     textfont_size=8)
     
fig.show()

In [22]:
fig = px.bar(
    data_frame=n_words_df[:n], x='words', y='freq', color='freq',
    text='tfidf_score_sum', labels={'x': 'words', 'y': 'frequency'},
    title='Frequency of words seen in <b> Flagged </b>records (with TFIDF in parentheses)')

fig.update_xaxes(tickangle= -45)  
fig.update_traces(
    texttemplate='%{text:.2f}', textposition='outside',
     textfont_size=8)
     
fig.show()

In [23]:

n_words_df = n_words_df.add_suffix('_n')
n_words_df.rename(columns={'words_n': 'words'}, inplace=True)

f_words_df = f_words_df.add_suffix('_f')
f_words_df.rename(columns={'words_f': 'words'}, inplace=True)

merged_df = merge(f_words_df, n_words_df)

In [24]:
merged_df

Unnamed: 0,words,freq_f,tfidf_score_sum_f,freq_n,tfidf_score_sum_n
0,friend,13,1.848679,9.0,2.241216
1,people,10,0.877912,10.0,2.230480
2,friends,9,1.279314,15.0,3.921829
3,would,7,0.569055,4.0,0.877480
4,going,6,1.168151,5.0,1.022397
...,...,...,...,...,...
343,spent,1,0.170482,,
344,nights,1,0.170482,,
345,letting,1,0.170482,2.0,0.705452
346,vent,1,0.170482,,


In [25]:
merged_df_sample = merged_df[:n]
fig = go.Figure(data=[
    go.Bar(name='Flagged', x=merged_df_sample.words, y=merged_df_sample.freq_f, text=merged_df_sample.freq_f, marker_color='#BA0F30'),
    go.Bar(name='Not Flagged', x=merged_df_sample.words, y=merged_df_sample.freq_n, text=merged_df_sample.freq_n, marker_color='#98D7C2')
])


fig.update_xaxes(tickangle= -45)  
fig.update_traces(
    texttemplate='%{text:d}', textposition='outside',
     textfont_size=8)

# Change the bar mode
fig.update_layout(barmode='group', title_text='most frequent Flagged/not flagged words count comparison')
fig.show()

I think the reason why friends is higher than friend in "not flagged" comments is because the people who have done somethings in general to help people out refer to friends a lot. Instead, people who have done something big for one particular person with a significant problem (flagged) will refer to friend. 

Side note: This is why I think it's better to not lemmatize words. This way the distinction between "friend" and "friends" is observable.

Now, Classification Begins...

### Naive Bayes

In [26]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()


In [27]:
transformed_data, words = get_tfidf_words_and_array(df.loc[:, 'cleaned_text'])
y = df.loc[:, 'class']

In [28]:
scores = cross_val_score(nb, transformed_data, y, cv=3)
scores

array([0.44444444, 0.37037037, 0.69230769])

### Random Forest

In [83]:
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier()
scores = cross_val_score(random_forest, transformed_data, y, cv=3)
scores

array([0.7037037 , 0.7037037 , 0.73076923])

# Deep learning 

In [30]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


text = df.loc[:, 'response_text'].values
pad_type = 'post'
trunc_type='post'

# Tokenize our training data
tokenizer = Tokenizer()
texts = [str (item) for item in df.loc[:, 'response_text']]
tokenizer.fit_on_texts(texts)

# word index = {'word': idx}
word_index = tokenizer.word_index

# Encode into sequences
encoded_seqs = tokenizer.texts_to_sequences(texts)


In order to decide for the padding, we need to see how the distribution of words is. So to do that, we map the encoded sequence in a histogram and then decide for the padding length.

In [31]:
padded_lengths = [len(encoded_seqs_i) for encoded_seqs_i in encoded_seqs]
px.histogram(padded_lengths)

In [32]:
max_len = 60
padded_texts = pad_sequences(encoded_seqs, padding=pad_type, truncating=trunc_type, maxlen=max_len)

In [33]:
# Output the results of our work
random_i = np.random.randint(len(df))

print("Encoded sample:", encoded_seqs[random_i])
print("Padded sample:", padded_texts[random_i])
print("Padded shape:", padded_texts.shape)
print("sequences data type:", type(encoded_seqs))
print("Padded  sequences data type:", type(padded_texts))

Encoded sample: [40, 5, 355, 356, 6, 8, 18, 357, 2, 9, 11, 13, 1, 157, 158, 3, 358, 13, 20, 232, 62, 159]
Padded sample: [ 40   5 355 356   6   8  18 357   2   9  11  13   1 157 158   3 358  13
  20 232  62 159   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0]
Padded shape: (80, 60)
sequences data type: <class 'list'>
Padded  sequences data type: <class 'numpy.ndarray'>


### A simple RNN

In [34]:
import tensorflow as tf
import keras

from tensorflow.keras import layers
from sklearn.model_selection import KFold
from keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, SimpleRNN, BatchNormalization

For an RNN, we need to have a one-hot-encoded input. We can also use an embedding layer in the first layer of our sequential mode. But I chose to use 0-1 vectors to see how a very basic model works on this dataset.

In [35]:
num_features = len(word_index) + 1

In [36]:

def to_categorical_tensor(x, num_classes=num_features, max_len=max_len):
    """
    x: [0, 1, 2, 3]
    output: tensor of one hot encoded text
    with the shape of (max sequence length, number of features) 
    """
    if type(x) != list: a = list(x)
    a = tf.keras.utils.to_categorical(x, num_classes)
    a = tf.constant(a, shape=[max_len, num_classes])
    return a

sample = to_categorical_tensor(x= padded_texts[0])
print(sample.shape)

(60, 678)


In [37]:
# one-hot-encoding
X = [to_categorical_tensor(padded_texts[i]) for i in range(len(df))]

# converting into an array
X_final = np.reshape(X, (len(X), max_len, num_features))

In [38]:
labels = np.array([1 if label=='flagged' else 0 for label in df.loc[:, 'class']])

In [87]:
def create_simpleRNN(rnn_nodes=32):

    model = Sequential([
        SimpleRNN(32, input_shape=(max_len, num_features)),
        BatchNormalization(),
        Dense(1, activation="sigmoid")
        ])
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

    return model

model = create_simpleRNN()
model.summary()

Model: "sequential_29"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_29 (SimpleRNN)   (None, 32)                22752     
                                                                 
 batch_normalization_29 (Bat  (None, 32)               128       
 chNormalization)                                                
                                                                 
 dense_29 (Dense)            (None, 1)                 33        
                                                                 
Total params: 22,913
Trainable params: 22,849
Non-trainable params: 64
_________________________________________________________________


In [107]:
n_epochs = 3
n_splits = 3
batch_s = 32 # the default is 32
mean_val_acc_models = {}

def evaluate(X, Y, model_, n_epochs=3, n_splits=3, batch_s=32):
    """
    Evaluates each model with cross validation
    """
    history = {}
    kfold = KFold(n_splits)
    splits = kfold.split(X)

    for i, (train_index, val_index) in enumerate(splits):

        print(f'\nfold {i+1}')
        model = model_()

        train_x = X[train_index]
        train_y = Y[train_index]

        val_x = X[val_index]
        val_y = Y[val_index]

        history[f'fold {i}'] = model.fit(
            train_x, train_y, epochs=n_epochs, batch_size=batch_s, verbose=2,
             validation_data=(val_x, val_y)).history
    mean_val_acc = [np.mean(hist['val_accuracy']) for hist in history.values()]
    return mean_val_acc

In [108]:
mean_val_acc_models['SimpleRNN'] = evaluate(X=X_final, Y=labels, model_=create_simpleRNN)


fold 1
Epoch 1/3
2/2 - 2s - loss: 0.7385 - accuracy: 0.4906 - val_loss: 0.6774 - val_accuracy: 0.5926 - 2s/epoch - 821ms/step
Epoch 2/3
2/2 - 0s - loss: 0.5577 - accuracy: 0.8113 - val_loss: 0.6731 - val_accuracy: 0.5926 - 69ms/epoch - 35ms/step
Epoch 3/3
2/2 - 0s - loss: 0.4953 - accuracy: 0.8491 - val_loss: 0.6731 - val_accuracy: 0.5926 - 65ms/epoch - 33ms/step

fold 2
Epoch 1/3
2/2 - 2s - loss: 0.9235 - accuracy: 0.4717 - val_loss: 0.6157 - val_accuracy: 0.8148 - 2s/epoch - 819ms/step
Epoch 2/3
2/2 - 0s - loss: 0.7021 - accuracy: 0.5660 - val_loss: 0.6028 - val_accuracy: 0.8148 - 70ms/epoch - 35ms/step
Epoch 3/3
2/2 - 0s - loss: 0.5630 - accuracy: 0.7170 - val_loss: 0.5963 - val_accuracy: 0.8148 - 75ms/epoch - 37ms/step

fold 3
Epoch 1/3
2/2 - 1s - loss: 0.9060 - accuracy: 0.5185 - val_loss: 0.7140 - val_accuracy: 0.3077 - 1s/epoch - 725ms/step
Epoch 2/3
2/2 - 0s - loss: 0.6007 - accuracy: 0.7037 - val_loss: 0.6976 - val_accuracy: 0.3462 - 57ms/epoch - 29ms/step
Epoch 3/3
2/2 - 0s 

In [104]:
mean_val_acc

[0.6296296119689941, 0.5185185174147288, 0.7692307829856873]

In [None]:
def create_GRU(rnn_nodes=32):

    model = Sequential([
        SimpleRNN(32, input_shape=(max_len, num_features)),
        BatchNormalization(),
        Dense(1, activation="sigmoid")
        ])
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

    return model

model = create_simpleRNN()
model.summary()

#### Experiment

In [61]:
a = tf.keras.utils.to_categorical([0, 1, 2, 3], num_classes=4)
a = tf.constant(a, shape=[4, 4]) # returns a tensor

In [62]:
type(padded_texts[0])

numpy.ndarray

In [63]:
num_classes = len(word_index) + 1
x = padded_texts[0]
a = tf.keras.utils.to_categorical(x, num_classes)

In [64]:
a.shape

(60, 677)

In [71]:
vocab_len = len(word_index)

In [107]:

def to_categorical_tensor(x, num_classes=vocab_len+1, max_len=max_len):
    """
    x: [0, 1, 2, 3]
    output: tensor of one hot encoded text
    with the shape of (None, max sequence length, number of features) 
    """
    if type(x) != list: a = list(x)
    a = tf.keras.utils.to_categorical(x, num_classes)
    a = tf.constant(a, shape=[max_len, num_classes])
    return a

sample = to_categorical_tensor(x= padded_texts[0])
print(sample.shape)

(60, 678)


In [82]:
sample.shape

TensorShape([60, 678])

In [84]:
padded_texts.shape

(80, 60)

In [85]:
X = [to_categorical_tensor(padded_texts[i]) for i in range(len(df))]
X_final = np.reshape(X, (len(X), max_len, vocab_len+1))

In [109]:
X_final.shape

(80, 60, 678)

In [43]:
mnist = keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
#x_train, x_test = x_train/255.0, x_test/255.0
sample, sample_label = x_test[0], y_test[0]

In [93]:
y_test.shape

(10000,)

In [94]:
y_test[0]

7

In [96]:
labels = np.array([1 if label=='flagged' else 0 for label in df.loc[:, 'class']])

In [97]:
labels.shape

(80,)

In [47]:
x_test.shape

(10000, 28, 28)

In [44]:
sample.shape

(28, 28)

In [45]:
sample[10]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  17,  66,
        14,  67,  67,  67,  59,  21, 236, 254, 106,   0,   0,   0,   0,
         0,   0], dtype=uint8)

In [104]:
model = keras.Sequential()
model.add(layers.SimpleRNN(32, input_shape=(max_len, vocab_len+1)))
model.add(layers.BatchNormalization())
model.add(layers.Dense(1, activation="sigmoid"))
print(model.summary())
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_3 (SimpleRNN)    (None, 32)                22752     
                                                                 
 batch_normalization_2 (Batc  (None, 32)               128       
 hNormalization)                                                 
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 22,913
Trainable params: 22,849
Non-trainable params: 64
_________________________________________________________________
None


In [106]:
num_epochs = 3
history = model.fit(X_final, labels, epochs=num_epochs)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
labels = 

In [34]:
embedding_dim = 16
vocab_size = len(tokenizer.index_word)

model = tf.keras.Sequential([
    Embedding(vocab_size, embedding_dim, input_length=maxlen),
    SimpleRNN(units=32, activation="relu"),
    #GlobalAveragePooling1D(),
    #Dense(6, activation='relu'),
    Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 60, 16)            10832     
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 32)                1568      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 12,433
Trainable params: 12,433
Non-trainable params: 0
_________________________________________________________________


In [44]:
np.random.random([32, 10, 8]).astype(np.float32)#.shape

(32, 10, 8)

In [45]:
df.head(1)

Unnamed: 0,response_id,class,response_text,cleaned_word_list,cleaned_text
0,response_1,not_flagged,I try and avoid this sort of conflict,"[try, avoid, sort, conflict]",try avoid sort conflict


In [47]:
labels = np.array([1 if label=='flagged' else 0 for label in df.loc[:, 'class']])

In [99]:
X_final.shape

AttributeError: 'tuple' object has no attribute 'shape'

In [102]:
num_epochs = 3
history = model.fit(X_final, labels, epochs=num_epochs)

Epoch 1/3


ValueError: in user code:

    File "c:\Users\mitra\anaconda3\lib\site-packages\keras\engine\training.py", line 1051, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\mitra\anaconda3\lib\site-packages\keras\engine\training.py", line 1040, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\mitra\anaconda3\lib\site-packages\keras\engine\training.py", line 1030, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\mitra\anaconda3\lib\site-packages\keras\engine\training.py", line 889, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\mitra\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\mitra\anaconda3\lib\site-packages\keras\engine\input_spec.py", line 264, in assert_input_compatibility
        raise ValueError(f'Input {input_index} of layer "{layer_name}" is '

    ValueError: Input 0 of layer "sequential_1" is incompatible with the layer: expected shape=(None, None, 60), found shape=(None, 60, 678)


In [89]:
model = Sequential()
model.add(SimpleRNN(units=32, input_shape=(1,maxlen), activation="relu"))
model.add(Dense(8, activation="relu")) 
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='rmsprop')
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn (SimpleRNN)      (None, 32)                2976      
                                                                 
 dense (Dense)               (None, 8)                 264       
                                                                 
 dense_1 (Dense)             (None, 1)                 9         
                                                                 
Total params: 3,249
Trainable params: 3,249
Non-trainable params: 0
_________________________________________________________________


In [120]:
encoded_seqs_conv = np.array(np.array(encoded_seqs[i]) for i in range(len(encoded_seqs)))

In [123]:
encoded_seqs_conv

array(<generator object <genexpr> at 0x000000001A7F7EB0>, dtype=object)

In [119]:
label = df.loc[:, 'class'].values
trainX = np.reshape(encoded_seqs_conv, (encoded_seqs_conv.shape[0], 1, encoded_seqs_conv.shape[1]))

IndexError: tuple index out of range

In [94]:


model.fit(trainX,label, epochs=100, batch_size=16, verbose=2)
trainPredict = model.predict(trainX)
testPredict= model.predict(trainX)
predicted=np.concatenate((trainPredict,testPredict),axis=0)

AttributeError: 'list' object has no attribute 'shape'