In [1]:
import os
import numpy as np 
import pandas as pd
import seaborn
import seaborn as sns
import shap # for SHAP value
import random
import warnings
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
from bs4 import BeautifulSoup
import re

# The Natural Language Toolkit, or more commonly NLTK, is a suite of libraries and programs for symbolic and 
# statistical natural language processing for English written in the Python programming language.
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


#TQDM is a progress bar library with good support for nested loops and Jupyter/IPython notebooks.
from tqdm import tqdm


# Use Keras Tensorflow deeplearning library

from tensorflow import set_random_seed

from keras.utils import to_categorical
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense,Dropout,Embedding,LSTM
from keras.callbacks import EarlyStopping
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam
from keras.models import Sequential


from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

from sklearn.metrics import roc_curve,auc,make_scorer, accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix



import eli5 # for permutation importance
from eli5.sklearn import PermutationImportance

from pdpbox import pdp, info_plots # flor partial plots 

from scipy import interp

from itertools import cycle

from joblib import dump, load
seed_value = 123
os.environ['PYTHONHASHSEED']=str(seed_value)

Using TensorFlow backend.


In [2]:
os.getcwd()
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

In [3]:
set_random_seed(123)
random.seed(123)

In [4]:
np.random.seed(123)
pd.options.mode.chained_assignment = None  #hide any pandas warnings
%matplotlib inline

In [5]:
train2 = pd.read_csv("input/train.csv")
train2 = train2.rename(columns={'class':'Sentiment','tweet':'Phrase'})
train2 = train2.drop(['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither'],axis=1).copy()

target_t2 = train2.Sentiment.values
y_target_t2 = to_categorical(target_t2)

# Save the texts before tokenizing (must use the same random seed)
X_train_t2, X_val_t2, y_train_t2, y_val_t2 = train_test_split(train2,y_target_t2,
                                                              test_size=0.2,
                                                              random_state=123,
                                                              stratify=y_target_t2)

In [6]:
tweets = X_train_t2.Phrase

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem.porter import *
import string


import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

from nltk.stem import WordNetLemmatizer 
  
nltk.download('punkt')    
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/stephengriggs/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/stephengriggs/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
stopwords=stopwords = nltk.corpus.stopwords.words("english")

other_exclusions = ["#ff", "ff", "rt"]
stopwords.extend(other_exclusions)

stemmer = PorterStemmer()



## Omari's version
def preprocess(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
       '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-\:]+' #<<<<<<added the semicolon after the + to remove : at end of Rt's
    emoji_regex = '&#[0-9\;\:]+'    #<<<<<<<<<remove emoji's .ex; &#1214324
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    parsed_text = re.sub(emoji_regex,'',parsed_text)
    parsed_text = parsed_text.strip(string.punctuation)
    return parsed_text


def tokenize(tweet):
    """Removes punctuation & excess whitespace, sets to lowercase,
    and stems tweets. Returns a list of stemmed tokens."""
    tweet = " ".join(re.split('\s|(?<!\d)[,.]|[,.](?!\d)', tweet.lower())).strip()
    tokens = [stemmer.stem(t) for t in tweet.split()]
    return tokens



def basic_tokenize(tweet):
    """Same as tokenize but without the stemming"""
    tweet = " ".join(re.split("[^a-zA-Z.,!?]*", tweet.lower())).strip()
    return tweet.split()



vectorizer = TfidfVectorizer(
    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    tokenizer=tokenize,
    preprocessor=preprocess,
    ngram_range=(1, 3),
    stop_words=stopwords, #We do better when we keep stopwords
    use_idf=True,
    smooth_idf=False,
    norm=None, #Applies l2 norm smoothing
    decode_error='replace',
    max_features=10000,
    min_df=5,
    max_df=0.501
    )

In [9]:
#Construct tfidf matrix and get relevant scores
tfidf = vectorizer.fit_transform(tweets).toarray()
vocab = {v:i for i, v in enumerate(vectorizer.get_feature_names())}
idf_vals = vectorizer.idf_
idf_dict = {i:idf_vals[i] for i in vocab.values()} #keys are indices; values are IDF scores
tfidf.shape

(19822, 5838)

In [10]:
vocab

{'!': 0,
 '! @@@@': 1,
 '! @@@@ top': 2,
 '!!': 3,
 '!!!': 4,
 '!!!!': 5,
 '"': 6,
 '" "': 7,
 '" -': 8,
 '" bitch': 9,
 '" fuck': 10,
 '" got': 11,
 '" hoe': 12,
 '" i\'m': 13,
 '" like': 14,
 '" thi': 15,
 '"a': 16,
 '"all': 17,
 '"bad': 18,
 '"bad bitch"': 19,
 '"bitch': 20,
 '"bitch"': 21,
 '"come': 22,
 '"damn': 23,
 '"do': 24,
 '"don\'t': 25,
 '"fuck': 26,
 '"fuck right': 27,
 '"go': 28,
 '"go talk': 29,
 '"good': 30,
 '"hey': 31,
 '"hoe': 32,
 '"hoes"': 33,
 '"i': 34,
 '"i ain\'t': 35,
 '"i got': 36,
 '"i want': 37,
 '"i\'m': 38,
 '"if': 39,
 '"in': 40,
 '"it': 41,
 '"it\'': 42,
 '"let': 43,
 '"look': 44,
 '"mi': 45,
 '"nah': 46,
 '"nigger"': 47,
 '"no': 48,
 '"oh': 49,
 '"pussi': 50,
 '"she': 51,
 '"so': 52,
 '"stop': 53,
 '"that': 54,
 '"that\'': 55,
 '"the': 56,
 '"these': 57,
 '"these hoe': 58,
 '"thi': 59,
 '"thi bitch': 60,
 '"u': 61,
 '"we': 62,
 '"what': 63,
 '"whi': 64,
 '"who': 65,
 '"you': 66,
 '"you might': 67,
 '"you might ghetto': 68,
 '"you think': 69,
 '"you\'r':

In [11]:
#Get POS tags for tweets and save as a string
tweet_tags = []
for t in tweets:
    tokens = basic_tokenize(preprocess(t))
    tags = nltk.pos_tag(tokens)
    tag_list = [x[1] for x in tags]
    #for i in range(0, len(tokens)):
    tag_str = " ".join(tag_list)
    tweet_tags.append(tag_str)
        #print(tokens[i],tag_list[i])

In [12]:
#We can use the TFIDF vectorizer to get a token matrix for the POS tags
pos_vectorizer = TfidfVectorizer(
    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    tokenizer=None,
    lowercase=False,
    preprocessor=None,
    ngram_range=(1, 3),
    stop_words=None, #We do better when we keep stopwords
    use_idf=False,
    smooth_idf=False,
    norm=None, #Applies l2 norm smoothing
    decode_error='replace',
    max_features=5000,
    min_df=5,
    max_df=0.501,
    )

In [13]:
#Construct POS TF matrix and get vocab dict
pos = pos_vectorizer.fit_transform(pd.Series(tweet_tags)).toarray()
pos_vocab = {v:i for i, v in enumerate(pos_vectorizer.get_feature_names())}
pos.shape

(19822, 542)

In [14]:
#Now get other features
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS
from textstat.textstat import *

sentiment_analyzer = VS()

def count_twitter_objs(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    4) hashtags with HASHTAGHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned.
    +
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))

def other_features(tweet):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features"""
    ##SENTIMENT
    sentiment = sentiment_analyzer.polarity_scores(tweet)
    
    words = preprocess(tweet) #Get text only
    
    syllables = textstat.syllable_count(words) #count syllables in words
    num_chars = sum(len(w) for w in words) #num chars in words
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words.split()))
    
    ###Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
    ##Modified FRE score, where sentence fixed to 1
    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)
    
    twitter_objs = count_twitter_objs(tweet) #Count #, @, and http://
    retweet = 0
    if "rt" in words:
        retweet = 1
    features = [FKRA,
                FRE,
                syllables,
                avg_syl,
                num_chars,
                num_chars_total,
                num_terms,
                num_words,
                num_unique_terms,
                sentiment['neg'],
                sentiment['pos'],
                sentiment['neu'],
                sentiment['compound'],
                twitter_objs[2],
                twitter_objs[1],
                twitter_objs[0],
                retweet]
    #features = pandas.DataFrame(features)
    return features

def get_feature_array(tweets):
    feats=[]
    for t in tweets:
        feats.append(other_features(t))
    return np.array(feats)

In [15]:
other_features_names = ["FKRA",
                        "FRE",
                        "num_syllables",
                        "avg_syl_per_word",
                        "num_chars",
                        "num_chars_total",
                        "num_terms",
                        "num_words",
                        "num_unique_words",
                        "vader neg",
                        "vader pos",
                        "vader neu",
                        "vader compound",
                        "num_hashtags",
                        "num_mentions",
                        "num_urls",
                        "is_retweet"]

In [16]:
feats = get_feature_array(tweets)
feats

array([[ 6.  , 69.79, 15.  , ...,  2.  ,  0.  ,  0.  ],
       [ 8.  , 57.24, 18.  , ...,  0.  ,  0.  ,  0.  ],
       [ 7.2 , 75.77, 24.  , ...,  1.  ,  0.  ,  0.  ],
       ...,
       [ 8.4 , 52.87, 17.  , ...,  0.  ,  0.  ,  0.  ],
       [ 4.  , 89.51, 16.  , ...,  0.  ,  0.  ,  1.  ],
       [ 4.4 , 90.09, 18.  , ...,  0.  ,  0.  ,  0.  ]])

In [17]:
M = np.concatenate([tfidf,pos,feats],axis=1)
M.shape

(19822, 6397)

In [18]:
#Finally get a list of variable names
variables = ['']*len(vocab)
for k,v in vocab.items():
    variables[v] = k

pos_variables = ['']*len(pos_vocab)
for k,v in pos_vocab.items():
    pos_variables[v] = k

feature_names = variables+pos_variables+other_features_names
feature_names

In [19]:
len(vocab)

5838

In [20]:
tfidf.shape

(19822, 5838)

In [21]:
M.shape

(19822, 6397)

In [22]:
# re-run model everytime new parameter changes (must run sequential to initialize epoch)
# Model with just acc as metric

model_2 = Sequential()
model_2.add(Embedding(len(vocab)+1, 300, input_length= tfidf.shape[1]))
model_2.add(LSTM(128,dropout=0.5,recurrent_dropout=0.5,return_sequences=True))
model_2.add(LSTM(64,dropout=0.5,recurrent_dropout=0.5,return_sequences=False))
model_2.add(Dense(100,activation='relu')) #try elu
model_2.add(Dropout(0.5))
model_2.add(Dense(3,activation='softmax'))
model_2.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])
model_2.summary()

W0803 18:23:45.943271 4444489152 deprecation_wrapper.py:119] From /anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0803 18:23:45.947700 4444489152 deprecation_wrapper.py:119] From /anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0803 18:23:45.951297 4444489152 deprecation_wrapper.py:119] From /anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0803 18:23:46.600074 4444489152 deprecation_wrapper.py:119] From /anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0803 18:23:46.614512 4444489152 deprecati

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 5838, 300)         1751700   
_________________________________________________________________
lstm_1 (LSTM)                (None, 5838, 128)         219648    
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               6500      
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 303       
Total params: 2,027,559
Trainable params: 2,027,559
Non-trainable params: 0
_________________________________________________________________


In [23]:
early_stopping = EarlyStopping(min_delta = 0.001, mode = 'max', monitor = 'val_acc', patience = 2)
callback = [early_stopping]

In [None]:
model_2.fit(
    tfidf, y_train_t2, 
    #validation_data=(X_val,y_val),
    epochs=15, #may not run all due to callback
    batch_size=256, #faster with larger batch_size but it's generalizing
    verbose=1)
    #callbacks=callback, #stops training once the model stops improving. Prevents overfitting.
#    class_weight={0: .94*100, 1: .23, 2: .83*10} #use inverse ratio to set hate with highest weight (somewhat arbitrary)
#) 

W0803 18:23:49.395061 4444489152 deprecation.py:323] From /anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/15


In [72]:
M[112,6380]

9.1

In [38]:
len(vocab)

5838

In [40]:
len(pos_vocab)

542

In [52]:
M.shape

(19822, 6397)

In [55]:
for x in M[112]:
    if x != 0.0:
        print (x)

6.8514964431321035
5.890601405021195
8.815106169286818
6.924255797414532
8.94863756191134
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
2.0
1.0
1.0
1.0
2.0
2.0
2.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
2.0
2.0
1.0
2.0
9.1
61.67
27.0
1.5
115.0
117.0
18.0
18.0
18.0
1.0


In [None]:


np.ptp(feats)