In [67]:
import pandas as pd
import re

from textblob import TextBlob
from nltk.stem import WordNetLemmatizer

In [68]:
# Load the labelled training comments from the CSV in the working directory.
train_data = pd.read_csv(filepath_or_buffer='train.csv')

In [69]:
# Summarise the frame: row count, column names, non-null counts, dtypes and memory usage.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95851 entries, 0 to 95850
Data columns (total 8 columns):
id               95851 non-null int64
comment_text     95851 non-null object
toxic            95851 non-null int64
severe_toxic     95851 non-null int64
obscene          95851 non-null int64
threat           95851 non-null int64
insult           95851 non-null int64
identity_hate    95851 non-null int64
dtypes: int64(7), object(1)
memory usage: 5.9+ MB


In [70]:
# Preview the first five rows of the raw data.
train_data.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [71]:
# Names of the binary target columns in the training frame.
label_columns = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [72]:
# Print, for each label, the share of rows that carry it (positives / total rows).
# The denominator comes from the frame itself rather than a hard-coded row count,
# so the figures stay correct if the data is filtered or a different file is loaded.
# NOTE(review): the message says "Count" but the value is a fraction; the text is
# left unchanged so it keeps matching previously recorded output.
for label in label_columns:
    print(f"Count for {label}: {train_data[label].sum() / len(train_data)}")

Count for toxic: 0.09636832166591898
Count for severe_toxic: 0.010067709257076087
Count for obscene: 0.05330147833616759
Count for threat: 0.003182022096796069
Count for insult: 0.04971257472535498
Count for identity_hate: 0.008492347497678689


In [73]:
def remove_punctuation(row_str):
    """Return ``row_str`` with every non-word character replaced by one space.

    Each character matched by the regex class ``\\W`` (which includes
    whitespace) is substituted individually, so runs of such characters
    become runs of spaces rather than being collapsed.
    """
    non_word = re.compile(r"\W")
    return non_word.sub(" ", row_str)

In [74]:
def lemmatize(row_str):
    """Lemmatize each whitespace-separated token of ``row_str``.

    Tokens are passed one by one to ``WordNetLemmatizer.lemmatize`` with its
    default arguments and the results are joined back with single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = map(lemmatizer.lemmatize, row_str.split())
    return " ".join(lemmas)

In [75]:
def get_sentiment_analysis(row_str):
    """Return the ``(polarity, subjectivity)`` pair TextBlob reports for ``row_str``."""
    sentiment = TextBlob(row_str).sentiment
    return sentiment.polarity, sentiment.subjectivity

In [76]:
# Replace punctuation in every comment; the result overwrites comment_text.
train_data = train_data.assign(
    comment_text=lambda frame: frame["comment_text"].apply(remove_punctuation)
)

In [79]:
def pos_replace(row_str):
    """Replace selected words in ``row_str`` with their part-of-speech tag.

    The text is tagged with TextBlob; a token whose tag appears in
    ``tag_replace`` is emitted as the tag itself, any other token is kept as
    the original word. The resulting tokens are joined with single spaces.
    """
    # Tags to substitute for the word (these look like Penn Treebank tags).
    tag_replace = ['CC', 'CD', 'DT', 'IN', 'NN', 'NNP', 'NNS', 'PRP', 'PRP$', 'WP']
    tokens = []
    for word, tag in TextBlob(row_str).tags:
        tokens.append(tag if tag in tag_replace else word)
    return " ".join(tokens)

In [80]:
def get_polarity(row):
    """Return the first element of ``row`` (the polarity slot of a sentiment pair)."""
    polarity = row[0]
    return polarity

def get_subjectivity(row):
    """Return the second element of ``row`` (the subjectivity slot of a sentiment pair)."""
    subjectivity = row[1]
    return subjectivity

In [81]:
def correct_spelling(row_str):
    """Return ``row_str`` after TextBlob's ``correct()`` pass, as a plain ``str``."""
    blob = TextBlob(row_str)
    corrected = blob.correct()
    return str(corrected)

In [None]:
# Run spelling correction over every comment; the result overwrites comment_text.
# NOTE(review): this calls TextBlob once per row and is presumably slow on the full data set.
train_data = train_data.assign(
    comment_text=lambda frame: frame["comment_text"].apply(correct_spelling)
)

In [53]:
# Lemmatize every comment; the result overwrites comment_text.
train_data = train_data.assign(
    comment_text=lambda frame: frame["comment_text"].apply(lemmatize)
)

In [None]:
# Substitute part-of-speech tags for selected words in every comment.
# NOTE(review): the preview recorded after this cell still shows plain words,
# so this step may not have been run when the outputs were saved; confirm.
train_data = train_data.assign(
    comment_text=lambda frame: frame["comment_text"].apply(pos_replace)
)

In [55]:
# Preview the first ten rows after text preprocessing.
train_data.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,Nonsense kiss off geek what I said is true I l...,1,0,0,0,0,0
1,27450690,Please do not vandalize page a you did with th...,0,0,0,0,0,0
2,54037174,Points of interest I removed the point of inte...,0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0
5,82428052,Fried chicken Is dat sum fried chicken,0,0,0,0,0,0
6,87311443,Why can you put English for example on some pl...,0,0,0,0,0,0
7,114749757,Guy Fawkes im a resident in bridgwater and i g...,0,0,0,0,0,0
8,138560519,a far a nickname go this article is embarrassi...,0,0,0,0,0,0
9,139353149,Woodland Meadows Good to hear that you correct...,0,0,0,0,0,0


In [56]:
# One (polarity, subjectivity) tuple per comment, aligned with train_data's index.
t = train_data["comment_text"].apply(get_sentiment_analysis)

In [58]:
# Split the sentiment tuples into two numeric feature columns.
train_data = train_data.assign(
    polarity=t.apply(get_polarity),
    subjectivity=t.apply(get_subjectivity),
)

In [59]:
# Re-check the frame summary; polarity and subjectivity should now be listed as float columns.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95851 entries, 0 to 95850
Data columns (total 10 columns):
id               95851 non-null int64
comment_text     95851 non-null object
toxic            95851 non-null int64
severe_toxic     95851 non-null int64
obscene          95851 non-null int64
threat           95851 non-null int64
insult           95851 non-null int64
identity_hate    95851 non-null int64
polarity         95851 non-null float64
subjectivity     95851 non-null float64
dtypes: float64(2), int64(7), object(1)
memory usage: 7.3+ MB


In [60]:
# Preview the first five rows including the new sentiment columns.
train_data.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,polarity,subjectivity
0,22256635,Nonsense kiss off geek what I said is true I l...,1,0,0,0,0,0,0.35,0.65
1,27450690,Please do not vandalize page a you did with th...,0,0,0,0,0,0,0.0,0.0
2,54037174,Points of interest I removed the point of inte...,0,0,0,0,0,0,-0.040625,0.771875
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0,0.175,0.625
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0,-0.075,0.25


In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [62]:
# Fit a TF-IDF model on the processed comments, using scikit-learn's built-in
# English stop-word list, and build the sparse document-term matrix.
vectorizer = TfidfVectorizer(stop_words='english')
tf_matrix = vectorizer.fit_transform(train_data["comment_text"])

In [63]:
# (number of documents, number of TF-IDF features)
tf_matrix.shape

(95851, 4244420)

In [None]:
# Wrap the sparse TF-IDF matrix in a DataFrame without densifying it.
# pd.SparseDataFrame was deprecated in pandas 0.25 and removed in 1.0; the
# supported replacement is DataFrame.sparse.from_spmatrix. The legacy
# constructor is used only on pandas versions that predate the accessor.
# NOTE(review): with millions of feature columns either path is likely to be
# very slow; consider keeping tf_matrix as a scipy sparse matrix downstream.
if hasattr(pd.DataFrame, "sparse"):
    train_tf_df = pd.DataFrame.sparse.from_spmatrix(tf_matrix)
else:
    train_tf_df = pd.SparseDataFrame(tf_matrix)