In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
import nltk
from nltk.corpus import stopwords
import string
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline


In [2]:
# Request each NLTK resource in turn; nltk.download logs its progress and
# reports when a package is already up-to-date.
for package in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(package)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stop words, de-duplicated into a set.
stop_words = {*stopwords.words('english')}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\desla\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\desla\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\desla\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Project 1 - NLP and Text Classification 

### Load Data (Train and Test)

In [3]:
# Read the labelled training comments straight from the zipped CSV
# (pandas infers the compression from the ".zip" extension).
train_df = pd.read_csv("train.csv.zip")

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [4]:
# Creating a subset of the data to reduce computational processing time.
# A fixed random_state makes the sample (and every score derived from it)
# reproducible across kernel restarts. min() guards against the ValueError
# that sample() raises when asked for more rows than the frame contains.
train_df_sub = train_df.sample(n=min(100000, len(train_df)), random_state=42)
train_df_sub

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
44784,77a723e8d6a1de64,Fuck you you dumb sack of shit,1,1,1,0,1,0
70311,bc1dc26f938d5a99,How about this: Stop using metacritic as a sou...,0,0,0,0,0,0
4639,0c49a1e943413d90,I would suggest Common engineering entrance ex...,0,0,0,0,0,0
67217,b3dc630654433677,—User:Christopher Mann McKay,0,0,0,0,0,0
136139,d835b4ea6a793db3,REDIRECT Talk:Guimbal Cabri G2,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
73892,c5b23b92e700163b,"""\n\n Please re-prod article for deletion. In...",0,0,0,0,0,0
9964,1a5fb1907af433ee,"""==:Devbagh beach==\nA tag has been placed on ...",0,0,0,0,0,0
32797,574aa2ad2b420dd9,"""\n\nHello again. I'll respond to your note on...",0,0,0,0,0,0
127667,aad617eb58960d6b,"""\n\n A brownie for you! \n\n sweet for sweet...",0,0,0,0,0,0


In [6]:
# Load the comments to be scored, then preview the first rows.
test_df = pd.read_csv("test.csv")
test_df.head()

Unnamed: 0,id,comment_text
0,1,Yo bitch Ja Rule is more succesful then you'll...
1,2,== From RfC == \n\n The title is fine as it is...
2,3,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,4,":If you have a look back at the source, the in..."
4,5,I don't anonymously edit articles at all.


### Training models for each class

In [16]:
# Model trained for 'toxic' class
#
# Pipeline: TF-IDF features (the 500 most frequent 1- to 3-grams) -> SVC.

# NLTK English stop words plus every single punctuation character.
# NOTE(review): TfidfVectorizer's default token pattern only emits tokens of
# two or more word characters, so the punctuation entries are probably never
# matched - confirm before relying on them.
stop_words = list(set(stopwords.words('english') + list(string.punctuation)))
vec_tf = TfidfVectorizer(max_features=500, stop_words=stop_words, ngram_range=(1,3))
model_svc = SVC()

y_toxic = train_df_sub["toxic"]
X_toxic = train_df_sub["comment_text"]

# A fixed seed makes the split and the reported score reproducible; stratifying
# on the label keeps the positive rate the same in the train and test parts.
X_train_tx, X_test_tx, y_train_tx, y_test_tx = train_test_split(
    X_toxic, y_toxic, random_state=42, stratify=y_toxic
)

pipe_toxic = Pipeline([
                    ("vect", vec_tf),
                    ("model", model_svc)
])

# Series.ravel() is deprecated in pandas 2.2+; to_numpy() gives the same 1-D array.
pipe_toxic.fit(X_train_tx, y_train_tx.to_numpy())
# Mean accuracy on the held-out part.
# NOTE(review): accuracy can look high on a skewed label simply by predicting
# the majority class - consider also checking precision/recall.
pipe_toxic.score(X_test_tx, y_test_tx)

0.9396

In [17]:
# Model trained for 'severe_toxic' class
#
# Pipeline: TF-IDF features (the 500 most frequent 1- to 3-grams) -> SVC.

# NLTK English stop words plus every single punctuation character.
# NOTE(review): TfidfVectorizer's default token pattern only emits tokens of
# two or more word characters, so the punctuation entries are probably never
# matched - confirm before relying on them.
stop_words = list(set(stopwords.words('english') + list(string.punctuation)))
vec_tf = TfidfVectorizer(max_features=500, stop_words=stop_words, ngram_range=(1,3))
model_svc = SVC()

y_severe_toxic = train_df_sub["severe_toxic"]
X_severe_toxic = train_df_sub["comment_text"]

# A fixed seed makes the split and the reported score reproducible; stratifying
# on the label keeps the positive rate the same in the train and test parts.
X_train_st, X_test_st, y_train_st, y_test_st = train_test_split(
    X_severe_toxic, y_severe_toxic, random_state=42, stratify=y_severe_toxic
)

pipe_severe_toxic = Pipeline([
                    ("vect", vec_tf),
                    ("model", model_svc)
])

# Series.ravel() is deprecated in pandas 2.2+; to_numpy() gives the same 1-D array.
pipe_severe_toxic.fit(X_train_st, y_train_st.to_numpy())
# Mean accuracy on the held-out part.
# NOTE(review): accuracy can look high on a skewed label simply by predicting
# the majority class - consider also checking precision/recall.
pipe_severe_toxic.score(X_test_st, y_test_st)

0.9906

In [9]:
# Model trained for 'obscene' class
#
# Pipeline: TF-IDF features (the 500 most frequent 1- to 3-grams) -> SVC.
model_svc = SVC()

# NLTK English stop words plus every single punctuation character.
# NOTE(review): TfidfVectorizer's default token pattern only emits tokens of
# two or more word characters, so the punctuation entries are probably never
# matched - confirm before relying on them.
stop_words = list(set(stopwords.words('english') + list(string.punctuation)))
vec_tf = TfidfVectorizer(max_features=500, stop_words=stop_words, ngram_range=(1,3))

y_obscene = train_df_sub["obscene"]
X_obscene = train_df_sub["comment_text"]

# A fixed seed makes the split and the reported score reproducible; stratifying
# on the label keeps the positive rate the same in the train and test parts.
X_train_o, X_test_o, y_train_o, y_test_o = train_test_split(
    X_obscene, y_obscene, random_state=42, stratify=y_obscene
)

pipe_obscene = Pipeline([
                    ("vect", vec_tf),
                    ("model", model_svc)
])

# Series.ravel() is deprecated in pandas 2.2+; to_numpy() gives the same 1-D array.
pipe_obscene.fit(X_train_o, y_train_o.to_numpy())
# Mean accuracy on the held-out part.
# NOTE(review): accuracy can look high on a skewed label simply by predicting
# the majority class - consider also checking precision/recall.
pipe_obscene.score(X_test_o, y_test_o)



0.96828

In [7]:
# Model trained for 'threat' class
#
# Pipeline: TF-IDF features (the 500 most frequent 1- to 3-grams) -> SVC.

# NLTK English stop words plus every single punctuation character.
# NOTE(review): TfidfVectorizer's default token pattern only emits tokens of
# two or more word characters, so the punctuation entries are probably never
# matched - confirm before relying on them.
stop_words = list(set(stopwords.words('english') + list(string.punctuation)))
vec_tf = TfidfVectorizer(max_features=500, stop_words=stop_words, ngram_range=(1,3))
# class_weight='balanced' weights each class inversely to its frequency.
# NOTE(review): this is the only label model in the notebook that sets it -
# confirm whether the other labels should be treated the same way.
model_svc = SVC(class_weight='balanced')

y_threat = train_df_sub["threat"]
X_threat = train_df_sub["comment_text"]

# A fixed seed makes the split and the reported score reproducible; stratifying
# on the label keeps the positive rate the same in the train and test parts.
X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(
    X_threat, y_threat, random_state=42, stratify=y_threat
)

pipe_threat = Pipeline([
                    ("vect", vec_tf),
                    ("model", model_svc)
])

# Series.ravel() is deprecated in pandas 2.2+; to_numpy() gives the same 1-D array.
pipe_threat.fit(X_train_t, y_train_t.to_numpy())
# Mean accuracy on the held-out part.
pipe_threat.score(X_test_t, y_test_t)



0.95336

In [18]:
# Model trained for 'insult' class
#
# Pipeline: TF-IDF features (the 500 most frequent 1- to 3-grams) -> SVC.

# NLTK English stop words plus every single punctuation character.
# NOTE(review): TfidfVectorizer's default token pattern only emits tokens of
# two or more word characters, so the punctuation entries are probably never
# matched - confirm before relying on them.
stop_words = list(set(stopwords.words('english') + list(string.punctuation)))
vec_tf = TfidfVectorizer(max_features=500, stop_words=stop_words, ngram_range=(1,3))
model_svc = SVC()

y_insult = train_df_sub["insult"]
X_insult = train_df_sub["comment_text"]

# A fixed seed makes the split and the reported score reproducible; stratifying
# on the label keeps the positive rate the same in the train and test parts.
X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(
    X_insult, y_insult, random_state=42, stratify=y_insult
)

pipe_insult = Pipeline([
                    ("vect", vec_tf),
                    ("model", model_svc)
])

# Series.ravel() is deprecated in pandas 2.2+; to_numpy() gives the same 1-D array.
pipe_insult.fit(X_train_i, y_train_i.to_numpy())
# Mean accuracy on the held-out part.
# NOTE(review): accuracy can look high on a skewed label simply by predicting
# the majority class - consider also checking precision/recall.
pipe_insult.score(X_test_i, y_test_i)

0.96536

In [19]:
# Model trained for 'identity_hate' class
#
# Pipeline: TF-IDF features (the 500 most frequent 1- to 3-grams) -> SVC.

# NLTK English stop words plus every single punctuation character.
# NOTE(review): TfidfVectorizer's default token pattern only emits tokens of
# two or more word characters, so the punctuation entries are probably never
# matched - confirm before relying on them.
stop_words = list(set(stopwords.words('english') + list(string.punctuation)))
vec_tf = TfidfVectorizer(max_features=500, stop_words=stop_words, ngram_range=(1,3))
model_svc = SVC()

y_identity_hate = train_df_sub["identity_hate"]
X_identity_hate = train_df_sub["comment_text"]

# A fixed seed makes the split and the reported score reproducible; stratifying
# on the label keeps the positive rate the same in the train and test parts.
X_train_ih, X_test_ih, y_train_ih, y_test_ih = train_test_split(
    X_identity_hate, y_identity_hate, random_state=42, stratify=y_identity_hate
)

pipe_identity_hate = Pipeline([
                    ("vect", vec_tf),
                    ("model", model_svc)
])

# Series.ravel() is deprecated in pandas 2.2+; to_numpy() gives the same 1-D array.
pipe_identity_hate.fit(X_train_ih, y_train_ih.to_numpy())
# Mean accuracy on the held-out part.
# NOTE(review): accuracy can look high on a skewed label simply by predicting
# the majority class - consider also checking precision/recall.
pipe_identity_hate.score(X_test_ih, y_test_ih)

0.99148

### Test Data Predictions

In [10]:
# Trained models applied to the test data to produce predictions
# Predicted 'obscene' label for every test comment, from the fitted obscene pipeline.
obscene_test = pipe_obscene.predict(test_df['comment_text'])

In [8]:
# Predicted 'threat' label for every test comment, from the fitted threat pipeline.
threat_test = pipe_threat.predict(test_df['comment_text'])

In [20]:
# Predicted 'toxic' label for every test comment, from the fitted toxic pipeline.
toxic_test = pipe_toxic.predict(test_df['comment_text'])

In [21]:
# Predicted 'severe_toxic' label for every test comment, from the fitted severe_toxic pipeline.
severe_toxic_test = pipe_severe_toxic.predict(test_df['comment_text'])

In [22]:
# Predicted 'insult' label for every test comment, from the fitted insult pipeline.
insult_test = pipe_insult.predict(test_df['comment_text'])

In [23]:
# Predicted 'identity_hate' label for every test comment, from the fitted identity_hate pipeline.
identity_hate_test = pipe_identity_hate.predict(test_df['comment_text'])

### Compiling Results Together

In [29]:
# Gather the six per-label prediction arrays into a single table, attach the
# test ids, and put the columns in the required output order.

new_data = {
    'obscene': obscene_test,
    'threat': threat_test,
    'toxic': toxic_test,
    'severe_toxic': severe_toxic_test,
    'identity_hate': identity_hate_test,
    'insult': insult_test,
}
# assign() adds the id column aligned on the row index, exactly like column assignment.
final_df = pd.DataFrame(new_data).assign(id=test_df['id'])
final_df = final_df[['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

### Final csv output

In [28]:
# Writing the final results into a csv file
# index=False leaves the DataFrame's row index out of the file, so only the
# id and label columns are written.

final_df.to_csv('out.csv', index=False)