# Preprocessing

In [1]:
# Import pandas for loading and manipulating the tabular data
import pandas as pd

# Load the training set from the CSV file in the current working directory
read_data = pd.read_csv(filepath_or_buffer="./train.csv")

In [2]:
# Preview the first five rows of the dataset
read_data.head(n=5)

Unnamed: 0,id,text,harsh,extremely_harsh,vulgar,threatening,disrespect,targeted_hate
0,a8be7c5d4527adbbf15f,""", 6 December 2007 (UTC)\nI am interested, not...",0,0,0,0,0,0
1,0b7ca73f388222aad64d,I added about three missing parameters to temp...,0,0,0,0,0,0
2,db934381501872ba6f38,SANDBOX?? \n\nI DID YOUR MADRE DID IN THE SANDBOX,1,0,0,0,0,0
3,228015c4a87c4b1f09a7,"why good sir? Why? \n\nYou, sir, obviously do ...",1,0,1,1,1,0
4,b18f26cfa1408b52e949,"""\n\n Source \n\nIncase I forget, or someone e...",0,0,0,0,0,0


### Remove URLs

In [3]:
import re

# Matches links that start with "http" (as before) or with a bare "www." prefix.
# Case-insensitive so upper-case links such as "HTTP://..." are caught as well;
# the word boundary keeps interjections like "awww..." from being treated as links.
_URL_PATTERN = re.compile(r'(?:http|\bwww\.)\S+', re.IGNORECASE)


def clean_url(review_text: str) -> str:
    """Replace every URL in ``review_text`` with a single space.

    A URL is any run of non-whitespace characters that starts with ``http``
    (any letter case, so ``https://`` and ``HTTP://`` are covered) or with a
    ``www.`` prefix at the start of a word.

    Args:
        review_text: The raw message text.

    Returns:
        The text with each matched URL replaced by one space character.
        Surrounding whitespace is left untouched, so a URL between two spaces
        leaves three consecutive spaces behind.
    """
    return _URL_PATTERN.sub(' ', review_text)

# Strip URLs from the raw text and store the result in the `clean_msg` column
read_data['clean_msg'] = read_data['text'].map(clean_url)

### Remove numbers and punctuation

In [4]:
def clean_non_alphanumeric(review_text):
    """Replace every character that is not an ASCII letter with a space.

    Despite the name, digits are replaced too: only ``a-z`` and ``A-Z`` are
    kept. Punctuation, whitespace (including newlines) and non-ASCII
    characters are each swapped one-for-one, so the length is unchanged.
    """
    non_letter = re.compile('[^a-zA-Z]')
    return non_letter.sub(' ', review_text)

# Blank out everything except ASCII letters in the URL-free text
read_data['clean_msg'] = read_data['clean_msg'].map(clean_non_alphanumeric)

In [5]:
# Raw text of the first row, for comparison with the cleaned version
read_data['text'].iat[0]

'", 6 December 2007 (UTC)\nI am interested, not in arguing, but in the policies which resolve our ongoing content dispute. Also, see Wikipedia: WikiProject United States presidential elections for what I\'ll be working on. Also, the moneybomb closer just self-reverted on two different requests, which echoed what I would have requested.  I will rephrase #3, which I didn\'t see an answer to, building on our agreement that ""moneybomb"" should not be a redlink: Given the deletion reversion, what should be the outline of the article called ""moneybomb"" or should it be submitted for AFD again in due time? (If the latter, see the previous version of #3.) However, this version will require a detailed answer because any ambiguity will only necessitate clarifying questions.   22:32"'

In [6]:
# Same row after URL, digit and punctuation removal
read_data['clean_msg'].iat[0]

'     December       UTC  I am interested  not in arguing  but in the policies which resolve our ongoing content dispute  Also  see Wikipedia  WikiProject United States presidential elections for what I ll be working on  Also  the moneybomb closer just self reverted on two different requests  which echoed what I would have requested   I will rephrase     which I didn t see an answer to  building on our agreement that   moneybomb   should not be a redlink  Given the deletion reversion  what should be the outline of the article called   moneybomb   or should it be submitted for AFD again in due time   If the latter  see the previous version of      However  this version will require a detailed answer because any ambiguity will only necessitate clarifying questions          '

### Lowercase the text

In [7]:
# Lowercase with the vectorized string accessor instead of a per-element
# Python lambda; every value in `clean_msg` is a string at this stage.
read_data['clean_msg'] = read_data['clean_msg'].str.lower()

In [8]:
# First row after lowercasing
read_data['clean_msg'].iat[0]

'     december       utc  i am interested  not in arguing  but in the policies which resolve our ongoing content dispute  also  see wikipedia  wikiproject united states presidential elections for what i ll be working on  also  the moneybomb closer just self reverted on two different requests  which echoed what i would have requested   i will rephrase     which i didn t see an answer to  building on our agreement that   moneybomb   should not be a redlink  given the deletion reversion  what should be the outline of the article called   moneybomb   or should it be submitted for afd again in due time   if the latter  see the previous version of      however  this version will require a detailed answer because any ambiguity will only necessitate clarifying questions          '

### Tokenize the text

In [9]:
from nltk.tokenize import word_tokenize

# Tokenize each message into a list of words. Apply directly on the column:
# a row-wise DataFrame.apply(axis=1) builds a Series for every row just to
# read one field, which is much slower and unlike the other cleaning steps.
read_data['clean_msg'] = read_data['clean_msg'].apply(word_tokenize)

In [10]:
# Token list of the first row
read_data['clean_msg'].iat[0]

['december',
 'utc',
 'i',
 'am',
 'interested',
 'not',
 'in',
 'arguing',
 'but',
 'in',
 'the',
 'policies',
 'which',
 'resolve',
 'our',
 'ongoing',
 'content',
 'dispute',
 'also',
 'see',
 'wikipedia',
 'wikiproject',
 'united',
 'states',
 'presidential',
 'elections',
 'for',
 'what',
 'i',
 'll',
 'be',
 'working',
 'on',
 'also',
 'the',
 'moneybomb',
 'closer',
 'just',
 'self',
 'reverted',
 'on',
 'two',
 'different',
 'requests',
 'which',
 'echoed',
 'what',
 'i',
 'would',
 'have',
 'requested',
 'i',
 'will',
 'rephrase',
 'which',
 'i',
 'didn',
 't',
 'see',
 'an',
 'answer',
 'to',
 'building',
 'on',
 'our',
 'agreement',
 'that',
 'moneybomb',
 'should',
 'not',
 'be',
 'a',
 'redlink',
 'given',
 'the',
 'deletion',
 'reversion',
 'what',
 'should',
 'be',
 'the',
 'outline',
 'of',
 'the',
 'article',
 'called',
 'moneybomb',
 'or',
 'should',
 'it',
 'be',
 'submitted',
 'for',
 'afd',
 'again',
 'in',
 'due',
 'time',
 'if',
 'the',
 'latter',
 'see',
 'the',

### Remove Stopwords

In [11]:
# Uncomment the two lines below to download the NLTK stopwords corpus; without it, loading the stopwords may raise a LookupError

# import nltk   
# nltk.download('stopwords')

In [12]:
# Import stopwords with nltk.
from nltk.corpus import stopwords

# `stop` stays a list, as before, in case later cells rely on it.
stop = stopwords.words('english')

# Membership tests run once per token over the whole dataset; a set gives
# constant-time lookups, whereas `in` on the list scans every stopword.
stop_set = set(stop)

# Keep only the tokens that are not English stopwords
read_data['clean_msg'] = read_data['clean_msg'].apply(
    lambda tokens: [tok for tok in tokens if tok not in stop_set]
)

In [13]:
# First row after stopword removal
read_data['clean_msg'].iat[0]

['december',
 'utc',
 'interested',
 'arguing',
 'policies',
 'resolve',
 'ongoing',
 'content',
 'dispute',
 'also',
 'see',
 'wikipedia',
 'wikiproject',
 'united',
 'states',
 'presidential',
 'elections',
 'working',
 'also',
 'moneybomb',
 'closer',
 'self',
 'reverted',
 'two',
 'different',
 'requests',
 'echoed',
 'would',
 'requested',
 'rephrase',
 'see',
 'answer',
 'building',
 'agreement',
 'moneybomb',
 'redlink',
 'given',
 'deletion',
 'reversion',
 'outline',
 'article',
 'called',
 'moneybomb',
 'submitted',
 'afd',
 'due',
 'time',
 'latter',
 'see',
 'previous',
 'version',
 'however',
 'version',
 'require',
 'detailed',
 'answer',
 'ambiguity',
 'necessitate',
 'clarifying',
 'questions']

In [14]:
from nltk.stem import WordNetLemmatizer

# Single lemmatizer instance, created once and reused for every row
lemma = WordNetLemmatizer()

def clean_lemmatization(token, pos='v'):
    """Lemmatize every word in a list of tokens.

    Args:
        token: Iterable of word strings (one tokenized message), despite the
            singular parameter name.
        pos: Part-of-speech tag handed to ``lemma.lemmatize`` for every word.
            Defaults to ``'v'`` (verb), the value that was previously
            hard-coded, so existing calls behave exactly as before.

    Returns:
        list: The lemmatized words, in their original order.
    """
    return [lemma.lemmatize(w, pos=pos) for w in token]

# Lemmatize the token list of every row
read_data['clean_msg'] = read_data['clean_msg'].map(clean_lemmatization)

In [15]:
# First row after lemmatization
read_data['clean_msg'].iat[0]

['december',
 'utc',
 'interest',
 'argue',
 'policies',
 'resolve',
 'ongoing',
 'content',
 'dispute',
 'also',
 'see',
 'wikipedia',
 'wikiproject',
 'unite',
 'state',
 'presidential',
 'elections',
 'work',
 'also',
 'moneybomb',
 'closer',
 'self',
 'revert',
 'two',
 'different',
 'request',
 'echo',
 'would',
 'request',
 'rephrase',
 'see',
 'answer',
 'build',
 'agreement',
 'moneybomb',
 'redlink',
 'give',
 'deletion',
 'reversion',
 'outline',
 'article',
 'call',
 'moneybomb',
 'submit',
 'afd',
 'due',
 'time',
 'latter',
 'see',
 'previous',
 'version',
 'however',
 'version',
 'require',
 'detail',
 'answer',
 'ambiguity',
 'necessitate',
 'clarify',
 'question']