In [1]:
import pandas as pd

## Merging Two Different Datasets Into One

In [2]:
hate_speech1 = pd.read_csv('/users/nchib/DS5/DS-Unit-2-Project/Data/hatespeech_text_label_vote.csv', header=None, sep='\t')
hate_speech1 = hate_speech1.drop(columns=[2])
hate_speech1.head()

Unnamed: 0,0,1
0,Beats by Dr. Dre urBeats Wired In-Ear Headphon...,spam
1,RT @Papapishu: Man it would fucking rule if we...,abusive
2,It is time to draw close to Him &#128591;&#127...,normal
3,if you notice me start to act different or dis...,normal
4,"Forget unfollowers, I believe in growing. 7 ne...",normal


In [3]:
hate_speech1 = hate_speech1.rename(columns={0: "text", 1: 'label'})
new_values = {'hateful': 0, 'abusive': 1, 'normal': 2, 'spam': 3}
hate_speech1['label'] = hate_speech1['label'].map(new_values)

In [16]:
hate_speech1['label'].value_counts()

2    53851
1    27150
3    14030
0     4965
Name: label, dtype: int64

In [7]:
hate_speech_part2 = pd.read_csv('https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv')

In [8]:
hate_speech_part2 = hate_speech_part2.rename(columns={"class": "label", 'tweet': 'text'})
hate_speech_part2 = hate_speech_part2.drop(columns=['Unnamed: 0','count','hate_speech','offensive_language','neither'])
hate_speech_part2.head()

Unnamed: 0,label,text
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [10]:
hate_speech_part2['label'].value_counts()

1    19190
2     4163
0     1430
Name: label, dtype: int64

### Third Dataset won't merge for now - for later in case decide to oversample minority class

In [11]:
tweets_part3 = pd.read_csv('/users/nchib/DS5/DS-Unit-2-Project/Data/tweets_part1.csv')
tweets_part3.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,text
0,0,848337741813358592,fucks sake go away stupid anon — ^ https://t....
1,1,850344984742174720,Carlos Correa had gyalchester as his walkup mu...
2,2,848668638869671939,Damn dean just put Corbin to sleep. That Match...
3,3,848338236770582529,Dick Tracy Meets Gruesome - the 2017 re-boot\n...
4,4,847542736651767809,what idiot called them antacids and not afterb...


In [12]:
tweets_part3['label'] = 0
tweets_part3.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,text,label
0,0,848337741813358592,fucks sake go away stupid anon — ^ https://t....,0
1,1,850344984742174720,Carlos Correa had gyalchester as his walkup mu...,0
2,2,848668638869671939,Damn dean just put Corbin to sleep. That Match...,0
3,3,848338236770582529,Dick Tracy Meets Gruesome - the 2017 re-boot\n...,0
4,4,847542736651767809,what idiot called them antacids and not afterb...,0


In [13]:
hate_speech_part3 = tweets_part3.drop(columns=['Unnamed: 0','tweet_id'])
hate_speech_part3['label'].value_counts()

0    63928
Name: label, dtype: int64

### Merging first two

In [20]:
df_final = pd.concat([hate_speech1, hate_speech_part2], ignore_index=True)
df_final.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,label,text
0,3,Beats by Dr. Dre urBeats Wired In-Ear Headphon...
1,1,RT @Papapishu: Man it would fucking rule if we...
2,2,It is time to draw close to Him &#128591;&#127...
3,2,if you notice me start to act different or dis...
4,2,"Forget unfollowers, I believe in growing. 7 ne..."


In [23]:
df_final['label'].value_counts()

2    58014
1    46340
3    14030
0     6395
Name: label, dtype: int64

In [25]:
df_final.shape

(124779, 2)

## Dropping duplicates and spam

In [50]:
text = df_final['text']
duplicate_frame = df_final[text.isin(text[text.duplicated()])]

In [73]:
duplicate_frame['text'].values[10]

'RT @WaysThingsWork: I fucking hate people &#128514;&#128514; https://t.co/Qz5gihmcQF'

In [70]:
df_final = df_final.drop_duplicates(subset=['text'], keep='first')
df_final.shape

(116734, 2)

In [75]:
df_final['label'].value_counts()

2    57399
1    39892
3    13894
0     5549
Name: label, dtype: int64

In [76]:
df_final = df_final[df_final['label'] != 3]

In [132]:
df_final.shape

(102840, 2)

In [131]:
df_final.to_csv('/users/nchib/DS5/DS-Unit-2-Project/Data/df_final1.csv')

## Baseline - predict majority class

In [98]:
df_final['label'].value_counts(normalize=True)

2    0.558139
1    0.387904
0    0.053958
Name: label, dtype: float64

If I predicted the majority class for every prediction, my baseline accuracy would be 0.558139

## Feature Engineering

In [91]:
tweets = df_final['text']

In [87]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem.porter import *
import string
import re

In [89]:
nltk.download('stopwords')

stopwords=stopwords = nltk.corpus.stopwords.words("english")

other_exclusions = ["#ff", "ff", "rt"]
stopwords.extend(other_exclusions)

stemmer = PorterStemmer()


def preprocess(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    #parsed_text = parsed_text.code("utf-8", errors='ignore')
    return parsed_text

def tokenize(tweet):
    """Removes punctuation & excess whitespace, sets to lowercase,
    and stems tweets. Returns a list of stemmed tokens."""
    tweet = " ".join(re.split("[^a-zA-Z]*", tweet.lower())).strip()
    #tokens = re.split("[^a-zA-Z]*", tweet.lower())
    tokens = [stemmer.stem(t) for t in tweet.split()]
    return tokens

def basic_tokenize(tweet):
    """Same as tokenize but without the stemming"""
    tweet = " ".join(re.split("[^a-zA-Z.,!?]*", tweet.lower())).strip()
    return tweet.split()

vectorizer = TfidfVectorizer(
    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    tokenizer=tokenize,
    preprocessor=preprocess,
    ngram_range=(1, 3),
    stop_words=stopwords, #We do better when we keep stopwords
    use_idf=True,
    smooth_idf=False,
    norm=None, #Applies l2 norm smoothing
    decode_error='replace',
    max_features=10000,
    min_df=5,
    max_df=0.501
    )

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nchib\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [92]:
#Construct tfidf matrix and get relevant scores
tfidf = vectorizer.fit_transform(tweets).toarray()
vocab = {v:i for i, v in enumerate(vectorizer.get_feature_names())}
idf_vals = vectorizer.idf_
idf_dict = {i:idf_vals[i] for i in vocab.values()} #keys are indices; values are IDF scores

  'stop_words.' % sorted(inconsistent))


In [94]:
nltk.download('averaged_perceptron_tagger')

#Get POS tags for tweets and save as a string
tweet_tags = []
for t in tweets:
    tokens = basic_tokenize(preprocess(t))
    tags = nltk.pos_tag(tokens)
    tag_list = [x[1] for x in tags]
    #for i in range(0, len(tokens)):
    tag_str = " ".join(tag_list)
    tweet_tags.append(tag_str)
        #print(tokens[i],tag_list[i])

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\nchib\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


In [95]:
#We can use the TFIDF vectorizer to get a token matrix for the POS tags
pos_vectorizer = TfidfVectorizer(
    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    tokenizer=None,
    lowercase=False,
    preprocessor=None,
    ngram_range=(1, 3),
    stop_words=None, #We do better when we keep stopwords
    use_idf=False,
    smooth_idf=False,
    norm=None, #Applies l2 norm smoothing
    decode_error='replace',
    max_features=5000,
    min_df=5,
    max_df=0.501,
    )

In [96]:
#Construct POS TF matrix and get vocab dict
pos = pos_vectorizer.fit_transform(pd.Series(tweet_tags)).toarray()
pos_vocab = {v:i for i, v in enumerate(pos_vectorizer.get_feature_names())}

In [101]:
#Now get other features
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS
from textstat.textstat import *

sentiment_analyzer = VS()

def count_twitter_objs(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    4) hashtags with HASHTAGHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned.
    
    Returns counts of urls, mentions, and hashtags.
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))

def other_features(tweet):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features"""
    ##SENTIMENT
    sentiment = sentiment_analyzer.polarity_scores(tweet)
    
    words = preprocess(tweet) #Get text only
    
    syllables = textstat.syllable_count(words) #count syllables in words
    num_chars = sum(len(w) for w in words) #num chars in words
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words.split()))
    
    ###Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
    ##Modified FRE score, where sentence fixed to 1
    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)
    
    twitter_objs = count_twitter_objs(tweet) #Count #, @, and http://
    retweet = 0
    if "rt" in words:
        retweet = 1
    features = [FKRA, FRE,syllables, avg_syl, num_chars, num_chars_total, num_terms, num_words,
                num_unique_terms, sentiment['neg'], sentiment['pos'], sentiment['neu'], sentiment['compound'],
                twitter_objs[2], twitter_objs[1],
                twitter_objs[0], retweet]
    #features = pandas.DataFrame(features)
    return features

def get_feature_array(tweets):
    feats=[]
    for t in tweets:
        feats.append(other_features(t))
    return np.array(feats)

In [102]:
other_features_names = ["FKRA", "FRE","num_syllables", "avg_syl_per_word", "num_chars", "num_chars_total", \
                        "num_terms", "num_words", "num_unique_words", "vader neg","vader pos","vader neu", "vader compound", \
                        "num_hashtags", "num_mentions", "num_urls", "is_retweet"]

In [103]:
feats = get_feature_array(tweets)

In [104]:
#Now join them all up
M = np.concatenate([tfidf,pos,feats],axis=1)

In [105]:
M.shape

(102840, 5601)

In [111]:
#Finally get a list of variable names
variables = ['']*len(vocab)
for k,v in vocab.items():
    variables[v] = k

pos_variables = ['']*len(pos_vocab)
for k,v in pos_vocab.items():
    pos_variables[v] = k

feature_names = variables+pos_variables+other_features_names

In [113]:
X = pd.DataFrame(M)
y = df_final['label'].astype(int)

In [114]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC

In [115]:
select = SelectFromModel(LogisticRegression(class_weight='balanced',penalty="l1",C=0.01))
X_ = select.fit_transform(X,y)



In [124]:
model = LogisticRegression(class_weight='balanced',penalty="l1",C=0.01).fit(X_, y)



In [125]:
y_preds = model.predict(X_)

In [126]:
report = classification_report( y, y_preds )


In [127]:
print(report)


              precision    recall  f1-score   support

           0       0.31      0.31      0.31      5549
           1       0.88      0.84      0.86     39892
           2       0.90      0.93      0.91     57399

    accuracy                           0.86    102840
   macro avg       0.70      0.69      0.69    102840
weighted avg       0.86      0.86      0.86    102840



## Using Basilica

In [135]:
import basilica
with basilica.Connection('8b63363f-6814-0848-1bd4-97ed95d75fb9') as c:
    embeddings = c.embed_sentences(tweets, model="twitter")
    print(list(embeddings))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

