In [1]:

import pandas as pd
import numpy as np
import re
import string
import nltk
import matplotlib as plty
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import sklearn
import regex as re
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))


%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.express as px

import plotly.graph_objs as go
pd.set_option('display.max_colwidth', None)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mitra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mitra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df = pd.read_csv("data/Sheet_1.csv")
df.dropna(inplace=True, axis=1 )
df.head(4)

Unnamed: 0,response_id,class,response_text
0,response_1,not_flagged,I try and avoid this sort of conflict
1,response_2,flagged,Had a friend open up to me about his mental addiction to weed and how it was taking over his life and making him depressed
2,response_3,flagged,"I saved a girl from suicide once. She was going to swallow a bunch of pills and I talked her out of it in a very calm, loving way."
3,response_4,not_flagged,i cant think of one really...i think i may have indirectly


In [3]:
fig = px.histogram(df, x='class', title='distribution of records')
fig.show()

Clean the data


In [4]:

def remove_stopwords(list_of_words):
    filtered_list = [w for w in list_of_words if not w.lower() in stop_words]
    return filtered_list

def check_punct(list_of_words):
    """
    look at the tokenized text. if there was any punctuation, it is redundant.
    """
    filtered_list = []
    for word in list_of_words:
        if re.findall("[()!><.,`?']", word):
            pass
        else: filtered_list.append(word)
        
    return filtered_list


In [5]:
def clean_and_tokenize(text):

    word_list = word_tokenize(text)
    word_list = remove_stopwords(word_list)
    word_list = check_punct(word_list)
    
    return word_list

In [6]:
df.loc[:, 'cleaned_word_list'] = df.loc[:, 'response_text'].apply(lambda t: clean_and_tokenize(t))

In [7]:
def join_tokens(df, tokens_arrays_col):
    return [" ".join(df.loc[i, tokens_arrays_col]) for i in range(len(df))]

df.loc[:, 'cleaned_text'] = join_tokens(df, 'cleaned_word_list')

In [8]:
df_f = df[df['class'] == 'flagged']
df_n = df[df['class'] == 'not_flagged']

Create a data frame with count and TFIDF scores for words

In [9]:
def count_words(tokens_arrays):
    """
    gets a dictionary and counts the values
    output: a sorted dict
    note: you can also use a bag of words package to do this
    """
    count_dict = {}
    for array_ in tokens_arrays:
        for word in array_:
            try: count_dict[word] +=1
            except: count_dict[word] = 1

    # sort 
    sorted_count_dict = {k:v for k,v in sorted(count_dict.items(), key=lambda item: item[1], reverse=True)}
    
    return sorted_count_dict

In [10]:
def get_n_key_and_value(n, dict_):

    """
    get the first - most frequent and important -
    words of dictionary 
    """
    keys = [k for (k, v) in dict_.items()][:n]
    values = [v for (k, v) in dict_.items()][:n]

    return keys, values

In [11]:
def convert_tokens_list_to_freq_df(tokens_arrays, n=-1):
    """
    gets the array of tokenized sentences
    output: a sorted dataframe with two cols
    the words and their frequency
    """

    dict_ = count_words(tokens_arrays)
    keys, values = get_n_key_and_value(n, dict_)

    df = pd.DataFrame({'words': keys, 'freq': values})

    return df

df_f_words_freq = convert_tokens_list_to_freq_df(df_f.cleaned_word_list)
df_n_words_freq = convert_tokens_list_to_freq_df(df_n.cleaned_word_list)

In [12]:
df_f_words_freq.head()

Unnamed: 0,words,freq
0,friend,13
1,people,10
2,friends,9
3,would,7
4,going,6


create TFIDF (Term Frequency – Inverse Document Frequency) scores


In [13]:
def get_tfidf_words_and_array(text_arrays):

    vectorizer = TfidfVectorizer()
    transformed_data = vectorizer.fit_transform(text_arrays).toarray()
    words = vectorizer.get_feature_names_out()
    
    return transformed_data, words

In [14]:
def create_tfidf_df(text_arrays):
    """
    gets the df, converts it into tfidf arrays and words
    then puts them in a dataset
    """

    transformed_data, words = get_tfidf_words_and_array(text_arrays)

    df = pd.DataFrame(data=transformed_data, columns=words).sum().reset_index()

    col_names = ['words', 'tfidf_score_sum']
    default_col_names = df.columns

    # rename whatever the df cols are called to the col_names
    df.rename(columns={default_col_names[i]:col_names[i] for i in range(len(col_names))}, inplace=True)

    return df

In [15]:
tfidf_scores_f = create_tfidf_df(df_f.loc[:, 'cleaned_text'])
tfidf_scores_n = create_tfidf_df(df_n.loc[:, 'cleaned_text'])

In [16]:
def merge(df_1, df_2, on='words'):
    return pd.merge(left=df_1, right=df_2, on=on, how='left')

In [17]:
f_words_df = merge(df_f_words_freq, tfidf_scores_f)
n_words_df = merge(df_n_words_freq, tfidf_scores_n)

### Why both Frequency and TFIDF score matter?
Based on the fact that words like 'people' appear in most records, it has a smaller tfidf score. However, it is appearing to be one of the most frequent words in flagged comments. So we can't focus only on one measurement. Also, a TFIDF score shows how much information a words brings. The more common, the more probable it is for a words to be a normal basic word which doesn't help in classification.

In [18]:
f_words_df.head(4)

Unnamed: 0,words,freq,tfidf_score_sum
0,friend,13,1.848679
1,people,10,0.877912
2,friends,9,1.279314
3,would,7,0.569055


In [19]:
# n most frequent
n = 30

In [20]:
fig = px.bar(
    data_frame=f_words_df[:n], x='words', y='freq', color='freq',
    text='tfidf_score_sum', labels={'x': 'words', 'y': 'frequency'},
    title='Frequency of words seen in <b> Flagged </b>records (with TFIDF in parentheses)' )

fig.update_xaxes(tickangle= -45)  
fig.update_traces(
    texttemplate='%{text:.2f}', textposition='outside',
     textfont_size=8)
     
fig.show()

In [21]:
fig = px.bar(
    data_frame=n_words_df[:n], x='words', y='freq', color='freq',
    text='tfidf_score_sum', labels={'x': 'words', 'y': 'frequency'},
    title='Frequency of words seen in <b> Flagged </b>records (with TFIDF in parentheses)')

fig.update_xaxes(tickangle= -45)  
fig.update_traces(
    texttemplate='%{text:.2f}', textposition='outside',
     textfont_size=8)
     
fig.show()

In [22]:

n_words_df = n_words_df.add_suffix('_n')
n_words_df.rename(columns={'words_n': 'words'}, inplace=True)

f_words_df = f_words_df.add_suffix('_f')
f_words_df.rename(columns={'words_f': 'words'}, inplace=True)

merged_df = merge(f_words_df, n_words_df)

In [23]:
merged_df

Unnamed: 0,words,freq_f,tfidf_score_sum_f,freq_n,tfidf_score_sum_n
0,friend,13,1.848679,9.0,2.241216
1,people,10,0.877912,10.0,2.230480
2,friends,9,1.279314,15.0,3.921829
3,would,7,0.569055,4.0,0.877480
4,going,6,1.168151,5.0,1.022397
...,...,...,...,...,...
343,spent,1,0.170482,,
344,nights,1,0.170482,,
345,letting,1,0.170482,2.0,0.705452
346,vent,1,0.170482,,


In [24]:
merged_df_sample = merged_df[:n]
fig = go.Figure(data=[
    go.Bar(name='Flagged', x=merged_df_sample.words, y=merged_df_sample.freq_f, text=merged_df_sample.freq_f, marker_color='#BA0F30'),
    go.Bar(name='Not Flagged', x=merged_df_sample.words, y=merged_df_sample.freq_n, text=merged_df_sample.freq_n, marker_color='#98D7C2')
])


fig.update_xaxes(tickangle= -45)  
fig.update_traces(
    texttemplate='%{text:d}', textposition='outside',
     textfont_size=8)

# Change the bar mode
fig.update_layout(barmode='group', title_text='most frequent Flagged/not flagged words count comparison')
fig.show()

I think the reason why friends is higher than friend in "not flagged" comments is because the people who have done somethings in general to help people out refer to friends a lot. Instead, people who have done something big for one particular person with a significant problem (flagged) will refer to friend. 

Side note: This is why I think it's better to not lemmatize words. This way the distinction between "friend" and "friends" is observable.

Now, Classification Begins...

### Naive Bayes

In [28]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()


In [29]:
transformed_data, words = get_tfidf_words_and_array(df.loc[:, 'cleaned_text'])
y = df.loc[:, 'class']

In [30]:
scores = cross_val_score(nb, transformed_data, y, cv=3)
scores

array([0.44444444, 0.37037037, 0.69230769])

In [None]:
from sklearn.cross_validation import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=1)

nb.fit(x_train_dtm,y_train)
y_predict = NB.predict(x_test_dtm)
metrics.accuracy_score(y_test,y_predict)