## Cleaning twitter data to be used to predict the probability of profanity!

In [47]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import nltk

In [48]:
tweets = pd.read_csv("..\Data\hate-speech-and-offensive-language.csv", index_col = "Unnamed: 0" )

In [49]:
tweets

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...
25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
25295,6,0,6,0,1,youu got wild bitches tellin you lies


In [50]:
# A list of contractions from http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where'd": "where did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are"
}

In [54]:
def clean_text_from_tweet(text, remove_stopwords = False):
    '''Remove unwanted characters, stopwords, and format the text to create fewer nulls word embeddings'''

    # Convert words to lower case
    text = text.lower()

    # removing non-ascii characters
    new_val = text.encode("ascii", "ignore")
    text = new_val.decode()

    # Replace contractions with their longer forms
    if True:
        text = text.split()
        new_text = []
        for word in text:
            if word in contractions:
                new_text.append(contractions[word])
            else:
                new_text.append(word)
        text = " ".join(new_text)

    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE) # removes website links
    text = re.sub(r'\<a href', ' ', text) # removed HTML tags
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:$\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)

    text = re.sub("@[A-Za-z0-9_]+","", text)
    text = re.sub("#[A-Za-z0-9_]+","", text)

    # remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english")) # pulli-ng a list of stopwords from NLTK
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    # Tokenize each word
    text =  nltk.WordPunctTokenizer().tokenize(text)

    return text

In [52]:
tweets["clean_tweet"] = tweets["tweet"].apply(clean_text)
tweets["clean_tweet_string"] = tweets["clean_tweet"].apply(lambda x: " ".join(x) )

In [56]:
tweets

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,clean_tweet,clean_tweet_string
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,"[rt, mayasolovely, as, a, woman, you, should, ...",rt mayasolovely as a woman you should not comp...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,"[rt, mleew17, boy, dats, cold, tyga, dwn, bad,...",rt mleew17 boy dats cold tyga dwn bad for cuff...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,"[rt, urkindofbrand, dawg, rt, 80sbaby4life, yo...",rt urkindofbrand dawg rt 80sbaby4life you ever...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,"[rt, c, g, anderson, viva, based, she, look, l...",rt c g anderson viva based she look like a tranny
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,"[rt, shenikaroberts, the, shit, you, hear, abo...",rt shenikaroberts the shit you hear about me m...
...,...,...,...,...,...,...,...,...
25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,"[you, s, a, muthaf, in, lie, 8220, lifeasking,...",you s a muthaf in lie 8220 lifeasking 20 pearl...
25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an...","[you, ve, gone, and, broke, the, wrong, heart,...",you ve gone and broke the wrong heart baby and...
25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...,"[young, buck, wanna, eat, dat, nigguh, like, i...",young buck wanna eat dat nigguh like i aint fu...
25295,6,0,6,0,1,youu got wild bitches tellin you lies,"[youu, got, wild, bitches, tellin, you, lies]",youu got wild bitches tellin you lies
