# Preprocessing

In [10]:
# Import pandas, which provides the DataFrame used throughout this notebook
import pandas as pd

# Read the training data; the path is relative to the notebook's working directory
df = pd.read_csv("./train.csv")

In [11]:
# Preview the first five rows to check the columns that were loaded
df.head()

Unnamed: 0,id,text,harsh,extremely_harsh,vulgar,threatening,disrespect,targeted_hate
0,a8be7c5d4527adbbf15f,""", 6 December 2007 (UTC)\nI am interested, not...",0,0,0,0,0,0
1,0b7ca73f388222aad64d,I added about three missing parameters to temp...,0,0,0,0,0,0
2,db934381501872ba6f38,SANDBOX?? \n\nI DID YOUR MADRE DID IN THE SANDBOX,1,0,0,0,0,0
3,228015c4a87c4b1f09a7,"why good sir? Why? \n\nYou, sir, obviously do ...",1,0,1,1,1,0
4,b18f26cfa1408b52e949,"""\n\n Source \n\nIncase I forget, or someone e...",0,0,0,0,0,0


### Remove urls

In [12]:
import re

def clean_url(review_text):
    """Replace each URL-like token in ``review_text`` with a single space.

    A token is anything that starts with ``http`` and runs up to the next
    whitespace character, so both ``http://`` and ``https://`` links match.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub(' ', review_text)

# Run URL removal over every entry of the text column.
# NOTE(review): assumes each entry is a string — re.sub raises TypeError on a
# missing (NaN) value; confirm the column has none.
df['text'] = df['text'].apply(clean_url)

### Remove html tags

In [13]:
def clean_html_tags(review_text):
    """Delete HTML-style tags from ``review_text``.

    A tag is a ``<`` followed by one or more characters other than ``<`` and
    then the nearest ``>``; each match is removed with no replacement.
    """
    return re.sub(pattern='<[^<]+?>', repl='', string=review_text)

# Strip HTML tags from every entry of the text column. This call previously
# re-applied clean_url, so clean_html_tags was defined but never used.
df['text'] = df['text'].apply(clean_html_tags)

### Remove numbers and punctuation

In [14]:
def clean_non_alphanumeric(review_text):
    """Replace every character that is not an ASCII letter with a space.

    Despite the name, digits are replaced as well; only ``a-z`` and ``A-Z``
    survive, so the string keeps its original length.
    """
    non_letter = re.compile('[^a-zA-Z]')
    return non_letter.sub(' ', review_text)

# Blank out digits, punctuation and any other non-letter character in every text entry
df['text'] = df['text'].apply(clean_non_alphanumeric)

In [15]:
# Inspect the first entry after the regex cleaning steps above
df['text'].iloc[0]

'     December       UTC  I am interested  not in arguing  but in the policies which resolve our ongoing content dispute  Also  see Wikipedia  WikiProject United States presidential elections for what I ll be working on  Also  the moneybomb closer just self reverted on two different requests  which echoed what I would have requested   I will rephrase     which I didn t see an answer to  building on our agreement that   moneybomb   should not be a redlink  Given the deletion reversion  what should be the outline of the article called   moneybomb   or should it be submitted for AFD again in due time   If the latter  see the previous version of      However  this version will require a detailed answer because any ambiguity will only necessitate clarifying questions          '

### Lowercase the text

In [16]:
# Lowercase the whole column with pandas' vectorized string accessor rather
# than calling a Python lambda once per row.
df['text'] = df['text'].str.lower()

In [17]:
# Inspect the first entry after lowercasing
df['text'].iloc[0]

'     december       utc  i am interested  not in arguing  but in the policies which resolve our ongoing content dispute  also  see wikipedia  wikiproject united states presidential elections for what i ll be working on  also  the moneybomb closer just self reverted on two different requests  which echoed what i would have requested   i will rephrase     which i didn t see an answer to  building on our agreement that   moneybomb   should not be a redlink  given the deletion reversion  what should be the outline of the article called   moneybomb   or should it be submitted for afd again in due time   if the latter  see the previous version of      however  this version will require a detailed answer because any ambiguity will only necessitate clarifying questions          '

### Spell Check

In [18]:
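# Optional step, currently disabled.
# NOTE(review): TextBlob.correct() appears to return a TextBlob rather than a str —
# wrap it in str() if this is re-enabled, since the tokenizer is applied to the column afterwards.
# from textblob import TextBlob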

# df['text'] = df['text'].apply(lambda x : TextBlob(x).correct())

### Tokenize the text

In [19]:
from nltk.tokenize import word_tokenize

# Split each cleaned string into a list of word tokens.
# NOTE(review): word_tokenize relies on NLTK's Punkt tokenizer data; if this
# raises a LookupError, download it first (e.g. nltk.download('punkt')) —
# confirm the resource name for the installed NLTK version.
df['text'] = df['text'].apply(word_tokenize)

In [20]:
# Inspect the first entry, now a list of tokens
df['text'].iloc[0]

['december',
 'utc',
 'i',
 'am',
 'interested',
 'not',
 'in',
 'arguing',
 'but',
 'in',
 'the',
 'policies',
 'which',
 'resolve',
 'our',
 'ongoing',
 'content',
 'dispute',
 'also',
 'see',
 'wikipedia',
 'wikiproject',
 'united',
 'states',
 'presidential',
 'elections',
 'for',
 'what',
 'i',
 'll',
 'be',
 'working',
 'on',
 'also',
 'the',
 'moneybomb',
 'closer',
 'just',
 'self',
 'reverted',
 'on',
 'two',
 'different',
 'requests',
 'which',
 'echoed',
 'what',
 'i',
 'would',
 'have',
 'requested',
 'i',
 'will',
 'rephrase',
 'which',
 'i',
 'didn',
 't',
 'see',
 'an',
 'answer',
 'to',
 'building',
 'on',
 'our',
 'agreement',
 'that',
 'moneybomb',
 'should',
 'not',
 'be',
 'a',
 'redlink',
 'given',
 'the',
 'deletion',
 'reversion',
 'what',
 'should',
 'be',
 'the',
 'outline',
 'of',
 'the',
 'article',
 'called',
 'moneybomb',
 'or',
 'should',
 'it',
 'be',
 'submitted',
 'for',
 'afd',
 'again',
 'in',
 'due',
 'time',
 'if',
 'the',
 'latter',
 'see',
 'the',

### Remove Stopwords

In [21]:
# Uncomment and run this once to download the stopword corpus; without it the next cell may raise a LookupError

# import nltk   
# nltk.download('stopwords')

In [22]:
# Import stopwords with nltk.
from nltk.corpus import stopwords

# `stop` stays a list because a later cell prints it; the set built from it
# gives constant-time membership checks instead of scanning the list for
# every token of every row.
stop = stopwords.words('english')
stop_set = set(stop)

# Keep only the tokens that are not English stopwords.
df['text'] = df['text'].apply(lambda x: [item for item in x if item not in stop_set])

In [28]:
# Show the English stopword list used for filtering
print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [23]:
# Inspect the first entry after stopword removal
df['text'].iloc[0]

['december',
 'utc',
 'interested',
 'arguing',
 'policies',
 'resolve',
 'ongoing',
 'content',
 'dispute',
 'also',
 'see',
 'wikipedia',
 'wikiproject',
 'united',
 'states',
 'presidential',
 'elections',
 'working',
 'also',
 'moneybomb',
 'closer',
 'self',
 'reverted',
 'two',
 'different',
 'requests',
 'echoed',
 'would',
 'requested',
 'rephrase',
 'see',
 'answer',
 'building',
 'agreement',
 'moneybomb',
 'redlink',
 'given',
 'deletion',
 'reversion',
 'outline',
 'article',
 'called',
 'moneybomb',
 'submitted',
 'afd',
 'due',
 'time',
 'latter',
 'see',
 'previous',
 'version',
 'however',
 'version',
 'require',
 'detailed',
 'answer',
 'ambiguity',
 'necessitate',
 'clarifying',
 'questions']

### Lemmatization

Comparing lemmatization and stemming

Stemming just removes or stems the last few characters of a word, often leading to incorrect meanings and spelling. Lemmatization considers the context and converts the word to its meaningful base form, which is called Lemma. Sometimes, the same word can have multiple different Lemmas.

* If you lemmatize the word 'Caring', it would return 'Care'. If you stem, it would return 'Car' and this is erroneous.

* If you lemmatize the word 'Stripes' in verb context, it would return 'Strip'. If you lemmatize it in noun context, it would return 'Stripe'. If you just stem it, it would just return 'Strip'.
* You would get the same results whether you lemmatize or stem words such as walking, running, swimming: they become walk, run, swim, etc.

In [24]:
# Uncomment and run this once to download WordNet for the lemmatizer; without it the next cell may raise a LookupError

# import nltk   
# nltk.download('wordnet')

In [25]:
from nltk.stem import WordNetLemmatizer

lemma = WordNetLemmatizer()


# POS - valid options are "n" for nouns, "v" for verbs, "a" for adjectives,
# "r" for adverbs and "s" for satellite adjectives.

def clean_lemmatization(token, pos):
    """Lemmatize every word of a token list under one part-of-speech tag.

    Args:
        token: iterable of word strings (one tokenized text entry).
        pos: WordNet part-of-speech tag passed to the lemmatizer
            ("n", "v", "a", "r" or "s").

    Returns:
        A new list with the lemmatized form of each word, in the same order.
    """
    return [lemma.lemmatize(word=w, pos=pos) for w in token]


def clean_lemmatization_noun(token):
    """Lemmatize every word of ``token`` as a noun (pos="n")."""
    return clean_lemmatization(token, 'n')


def clean_lemmatization_verb(token):
    """Lemmatize every word of ``token`` as a verb (pos="v")."""
    return clean_lemmatization(token, 'v')


# Noun pass first, then the verb pass over its result — same order as before.
df['text'] = df['text'].apply(clean_lemmatization_noun)
df['text'] = df['text'].apply(clean_lemmatization_verb)

In [26]:
# Inspect the first entry after the noun and verb lemmatization passes
df['text'].iloc[0]

['december',
 'utc',
 'interest',
 'argue',
 'policy',
 'resolve',
 'ongoing',
 'content',
 'dispute',
 'also',
 'see',
 'wikipedia',
 'wikiproject',
 'unite',
 'state',
 'presidential',
 'election',
 'work',
 'also',
 'moneybomb',
 'closer',
 'self',
 'revert',
 'two',
 'different',
 'request',
 'echo',
 'would',
 'request',
 'rephrase',
 'see',
 'answer',
 'build',
 'agreement',
 'moneybomb',
 'redlink',
 'give',
 'deletion',
 'reversion',
 'outline',
 'article',
 'call',
 'moneybomb',
 'submit',
 'afd',
 'due',
 'time',
 'latter',
 'see',
 'previous',
 'version',
 'however',
 'version',
 'require',
 'detail',
 'answer',
 'ambiguity',
 'necessitate',
 'clarify',
 'question']

In [27]:
# Preview the frame now that `text` holds lists of processed tokens
df.head()

Unnamed: 0,id,text,harsh,extremely_harsh,vulgar,threatening,disrespect,targeted_hate
0,a8be7c5d4527adbbf15f,"[december, utc, interest, argue, policy, resol...",0,0,0,0,0,0
1,0b7ca73f388222aad64d,"[add, three, miss, parameter, template, infobo...",0,0,0,0,0,0
2,db934381501872ba6f38,"[sandbox, madre, sandbox]",1,0,0,0,0,0
3,228015c4a87c4b1f09a7,"[good, sir, sir, obviously, comprehend, import...",1,0,1,1,1,0
4,b18f26cfa1408b52e949,"[source, incase, forget, someone, else, want, ...",0,0,0,0,0,0


# Text -> Features

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): df['text'] now holds token lists, while TfidfVectorizer's default
# settings expect raw strings — join the tokens back into strings or pass a
# callable `analyzer` before fitting. Confirm against how the vectorizer is used.
vectorizer = TfidfVectorizer()