## Cleaning twitter data to be used to predict the probability of profanity!

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import nltk

In [2]:
tweets = pd.read_csv("../data/hate-speech-and-offensive-language.csv", index_col = "Unnamed: 0" )

In [3]:
tweets

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...
25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
25295,6,0,6,0,1,youu got wild bitches tellin you lies


In [4]:
# A list of contractions from http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where'd": "where did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are"
}

In [5]:
def clean_text_from_tweet(text, remove_stopwords = False):
    '''Remove unwanted characters, stopwords, and format the text to create fewer nulls word embeddings'''

    # Convert words to lower case
    text = text.lower()

    # removing non-ascii characters
    new_val = text.encode("ascii", "ignore")
    text = new_val.decode()

    # Replace contractions with their longer forms
    if True:
        text = text.split()
        new_text = []
        for word in text:
            if word in contractions:
                new_text.append(contractions[word])
            else:
                new_text.append(word)
        text = " ".join(new_text)

    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE) # removes website links
    text = re.sub(r'\<a href', ' ', text) # removed HTML tags
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:$\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)

    text = re.sub("@[A-Za-z0-9_]+","", text)
    text = re.sub("#[A-Za-z0-9_]+","", text)

    # remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english")) # pulli-ng a list of stopwords from NLTK
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    # Tokenize each word
    text =  nltk.WordPunctTokenizer().tokenize(text)


    return text

In [6]:
tweets["clean_tweet"] = tweets["tweet"].apply(clean_text_from_tweet)
tweets["clean_tweet_string"] = tweets["clean_tweet"].apply(lambda x: " ".join(x) )

In [7]:
tweets

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,clean_tweet,clean_tweet_string
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,"[rt, as, a, woman, you, should, not, complain,...",rt as a woman you should not complain about cl...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,"[rt, boy, dats, cold, tyga, dwn, bad, for, cuf...",rt boy dats cold tyga dwn bad for cuffin dat h...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,"[rt, dawg, rt, you, ever, fuck, a, bitch, and,...",rt dawg rt you ever fuck a bitch and she start...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,"[rt, g, anderson, based, she, look, like, a, t...",rt g anderson based she look like a tranny
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,"[rt, the, shit, you, hear, about, me, might, b...",rt the shit you hear about me might be true or...
...,...,...,...,...,...,...,...,...
25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,"[you, s, a, muthaf, in, lie, pearls, emanuel, ...",you s a muthaf in lie pearls emanuel right his...
25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an...","[you, ve, gone, and, broke, the, wrong, heart,...",you ve gone and broke the wrong heart baby and...
25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...,"[young, buck, wanna, eat, dat, nigguh, like, i...",young buck wanna eat dat nigguh like i aint fu...
25295,6,0,6,0,1,youu got wild bitches tellin you lies,"[youu, got, wild, bitches, tellin, you, lies]",youu got wild bitches tellin you lies


In [8]:
tweets['class'].value_counts(normalize= True)

1    0.774321
2    0.167978
0    0.057701
Name: class, dtype: float64

In [9]:
tweets_offensive_language = tweets.loc[(tweets['class'] == 1) | (tweets['class'] == 2)]

In [10]:
tweets_offensive_language["class"].value_counts(normalize = True)

1    0.821736
2    0.178264
Name: class, dtype: float64

### class label for majority of CF users. 0 - hate speech 1 - offensive language 2 - neither

### Lets try spacy

In [11]:
import numpy as np
import multiprocessing as mp
import pandas as pd
import string
import spacy
from sklearn.base import TransformerMixin, BaseEstimator
from spacymoji import Emoji


In [12]:
# nlp = spacy.load("en_core_web_sm")
# nlp.add_pipe("emoji", first=True)
# doc = nlp("This is a test 😻 👍🏿")
#
# assert doc._.has_emoji is True
# assert doc[2:5]._.has_emoji is True
# assert doc[0]._.is_emoji is False
# assert doc[4]._.is_emoji is True
# assert doc[5]._.emoji_desc == "thumbs up dark skin tone"
# assert len(doc._.emoji) == 2
# assert doc._.emoji[1] == ("👍🏿", 5, "thumbs up dark skin tone")

### Loadint the spacy package and writing a custom preporcessing class

In [13]:
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("emoji", first=True)

class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self,
                 nlp = nlp,
                 n_jobs=1):
        """
        Text preprocessing transformer includes steps:
            1. Punctuation removal
            2. Stop words removal
            3. Removing handles/usernames
            4. Lemmatization

        nlp  - spacy model
        n_jobs - parallel jobs to run
        """
        self.nlp = nlp
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        return self

    def transform(self, X, *_):
        X_copy = X.copy()

        partitions = 1
        cores = mp.cpu_count()
        if self.n_jobs <= -1:
            partitions = cores
        elif self.n_jobs <= 0:
            return X_copy.apply(self._preprocess_text)
        else:
            partitions = min(self.n_jobs, cores)

        data_split = np.array_split(X_copy, partitions)
        pool = mp.Pool(cores)
        data = pd.concat(pool.map(self._preprocess_part, data_split))
        pool.close()
        pool.join()

        return data

    def _preprocess_part(self, part):
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        doc = self.nlp(text)
        replace_emoji = self._replace_emoji(doc)
        doc = nlp(replace_emoji)

        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        removed_user_names = self._remove_user_names(removed_stop_words)
        removed_links = self._remove_links(removed_user_names)
        return self._lemmatize(removed_links)

    def _normalize(self, text):
        # some issues in normalise package
        try:
            return ' '.join(normalise(text, variety=self.variety, user_abbrevs=self.user_abbrevs, verbose=False))
        except:
            return text

    def _replace_emoji(self, doc):
        text = ''
        for t in doc:
            if t._.is_emoji:
                text += str(t._.emoji_desc) + ' '
            else:
                text +=  str(t) + ' '
        return text

    def _remove_punct(self, doc):
        return (t for t in doc if t.text not in string.punctuation)

    def _remove_stop_words(self, doc):
        return (t for t in doc if not t.is_stop)

    def _remove_user_names(self, doc):
        return (t for t in doc if '@' not in t.text)

    def _remove_links(self,doc):
        return (t for t in doc if not t.like_url)

    def _lemmatize(self, doc):
        return ' '.join(t.lemma_ for t in doc)

In [None]:
text = TextPreprocessor(n_jobs=-1).transform(tweets["tweet"][:100])

In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import  LogisticRegressionCV
# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import train_test_split
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.metrics.pairwise import normalize
# from sklearn.model_selection import KFold
# from sklearn.svm import LinearSVC
#
# X = tweets_offensive_language["tweet"][:100]
# y = tweets_offensive_language["class"][:100]
#
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= 42, test_size= .3, stratify= y)
#
# clf  = Pipeline(steps=[
#     ('normalize', TextPreprocessor(n_jobs=-1)),
#     ('features', TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True))
# ])
#
#
# clf.fit(X_train)