In [1]:
import os
os.chdir(r'E:\Simplilearn\Cohort 3 - Jan\PG DS - NLP _ Jul 25 - Aug 23 _ Shanti Swaroop (Cohort 3)\Project\Toxic Wiki')

### Data load 

In [2]:
import pandas as pd
import numpy as np

from string import punctuation 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize


In [3]:
train_data = pd.read_csv('train.csv')

In [89]:
train_data.columns

Index(['id', 'comment_text', 'toxic'], dtype='object')

In [90]:
train_data.toxic.value_counts(dropna=False)

0    4563
1     437
Name: toxic, dtype: int64

### Data Cleanup

In [33]:
def do_cleanup(data_raw, context_stop_words=['wiki', 'wikipedia', 'page', 'edit', 'article']):
    
    # lower
    data_raw['comment_text'] = data_raw.comment_text.str.lower()
    
    #remove IP address
    data_raw['comment_text'] = data_raw.comment_text.str.replace('((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|$)){4}', '',)
    
    #remove string literals
    data_raw['comment_text'] = data_raw.comment_text.str.replace("[\n\r\t]", '')
    
    #remove numbers
    data_raw['comment_text'] = data_raw.comment_text.str.replace("\d", '')
        
    #remove email adress
    data_raw['comment_text'] = data_raw.comment_text.str.replace('[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+', '')
    
    #remove punctaitions and special chracters
    data_raw['comment_text'] = data_raw.comment_text.str.replace("[^\w\s]",'')
   
        
    #remove stop words
    stop_words = list(punctuation) + stopwords.words('english') + context_stop_words
    
    clean_text=[]
    for index in data_raw.index:
        word_tokens = word_tokenize(data_raw['comment_text'][index])
        filtered_sentence = [w for w in word_tokens if not w in stop_words]
        clean_text.append(" ".join(filtered_sentence[0:]))

    data_raw['clean_text'] = clean_text
    

In [125]:
train_data.index

RangeIndex(start=0, stop=5000, step=1)

In [34]:
do_cleanup(train_data)

## Data Term Analysis


In [35]:
all_comments = train_data.clean_text.values

In [36]:
all_words=[]
for doc in all_comments:
    words = word_tokenize(doc) 
    all_words.extend(words)

In [37]:
from collections import Counter
c = Counter(all_words)

In [38]:
c.most_common(n=50)

[('talk', 1060),
 ('please', 993),
 ('would', 962),
 ('one', 846),
 ('like', 832),
 ('dont', 792),
 ('ass', 708),
 ('also', 633),
 ('think', 628),
 ('fuck', 626),
 ('see', 619),
 ('know', 594),
 ('im', 546),
 ('use', 539),
 ('name', 531),
 ('people', 530),
 ('may', 530),
 ('articles', 527),
 ('time', 469),
 ('even', 400),
 ('make', 388),
 ('information', 383),
 ('deletion', 377),
 ('suck', 374),
 ('thanks', 372),
 ('good', 370),
 ('well', 369),
 ('could', 367),
 ('get', 365),
 ('want', 364),
 ('mexicans', 362),
 ('editing', 342),
 ('way', 336),
 ('edits', 331),
 ('help', 327),
 ('new', 325),
 ('first', 321),
 ('pages', 321),
 ('must', 316),
 ('sources', 316),
 ('user', 310),
 ('need', 308),
 ('say', 305),
 ('thank', 300),
 ('really', 299),
 ('many', 299),
 ('deleted', 296),
 ('source', 295),
 ('used', 282),
 ('image', 282)]

### Test train split

In [47]:
from sklearn.model_selection import train_test_split

In [48]:
final_data = train_data[['clean_text', 'toxic']]

In [50]:
X_train, X_test, y_train, y_test = train_test_split(final_data[['clean_text']], final_data.toxic, random_state=123, test_size=.3)

## TF-IDF Vectorizer

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [73]:
vect = TfidfVectorizer(max_features=4000)

In [81]:
X_train_tf = vect.fit_transform(X_train.clean_text.values)

In [99]:
X_test_tf = vect.transform(X_test.clean_text.values)

## SVC Linear

In [108]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, f1_score, classification_report

In [85]:
sv = SVC(kernel='linear')

In [86]:
svclf = sv.fit(X_train_tf, y_train)

## Train Score

In [None]:
pred_train = svclf.predict(X_train_tf)

In [117]:
print(f'\t\t Train Classifiction Report:\n\n {classification_report(y_train, pred_train)}')

		 Train Classifiction Report:

               precision    recall  f1-score   support

           0       0.97      1.00      0.98      3192
           1       0.99      0.64      0.78       308

    accuracy                           0.97      3500
   macro avg       0.98      0.82      0.88      3500
weighted avg       0.97      0.97      0.96      3500



## Test score

In [100]:
pred_test = svclf.predict(X_test_tf)

In [116]:
print(f'\t\t Test Classifiction Report:\n\n {classification_report(y_test, pred_test)}')

		 Test Classifiction Report:

               precision    recall  f1-score   support

           0       0.95      1.00      0.97      1371
           1       0.95      0.43      0.60       129

    accuracy                           0.95      1500
   macro avg       0.95      0.72      0.78      1500
weighted avg       0.95      0.95      0.94      1500



## Balanced 

In [118]:
svw = SVC(kernel='linear', class_weight='balanced')

In [119]:
svwclf = sv.fit(X_train_tf, y_train)

In [120]:
pred_test_w = svwclf.predict(X_test_tf)

In [121]:
print(f'\t\t Test Classifiction Report:\n\n {classification_report(y_test, pred_test_w)}')

		 Test Classifiction Report:

               precision    recall  f1-score   support

           0       0.95      1.00      0.97      1371
           1       0.95      0.43      0.60       129

    accuracy                           0.95      1500
   macro avg       0.95      0.72      0.78      1500
weighted avg       0.95      0.95      0.94      1500



## Hyper Param Tuning

In [122]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [127]:
params = {'C': [.01, .1, 1, 10, 100], 'kernel': ['rbf', 'linear']}

In [131]:
kfold = StratifiedKFold(n_splits=5)

In [134]:
gsv = GridSearchCV(svw, params, cv=kfold, scoring='recall')

In [135]:
gsv.fit(X_train_tf, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight='balanced', coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.01, 0.1, 1, 10, 100],
                         'kernel': ['rbf', 'linear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='recall', verbose=0)

In [136]:
gsv.best_estimator_

SVC(C=100, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [137]:
gsv_pred_test = gsv.predict(X_test_tf)

In [138]:
print(f'\t\t Test Classifiction Report:\n\n {classification_report(y_test, gsv_pred_test)}')

		 Test Classifiction Report:

               precision    recall  f1-score   support

           0       0.96      0.90      0.93      1371
           1       0.37      0.62      0.46       129

    accuracy                           0.87      1500
   macro avg       0.66      0.76      0.69      1500
weighted avg       0.91      0.87      0.89      1500



array([0, 1, 1, ..., 0, 0, 0], dtype=int64)

In [141]:
X_test['predicted'] = gsv_pred_test

In [146]:
pred_toxic = X_test[X_test.predicted == 1]['clean_text'].values.tolist()

In [153]:
tokens = []
for sent in pred_toxic:
    wt = word_tokenize(sent) 
    tokens.extend(wt)

In [155]:
tc = Counter(tokens)

In [158]:
tc.most_common(n=50)

[('assfuck', 277),
 ('nigger', 185),
 ('fuck', 24),
 ('like', 17),
 ('right', 15),
 ('dont', 15),
 ('im', 13),
 ('go', 12),
 ('fucking', 11),
 ('life', 11),
 ('stop', 11),
 ('think', 10),
 ('ass', 10),
 ('would', 10),
 ('live', 10),
 ('really', 9),
 ('gay', 9),
 ('wing', 9),
 ('talk', 9),
 ('one', 8),
 ('people', 8),
 ('take', 8),
 ('hey', 8),
 ('get', 8),
 ('hell', 8),
 ('hate', 8),
 ('youre', 7),
 ('saturday', 7),
 ('night', 7),
 ('show', 7),
 ('truth', 6),
 ('sure', 6),
 ('even', 6),
 ('know', 6),
 ('going', 6),
 ('ive', 6),
 ('sketches', 6),
 ('award', 6),
 ('ill', 6),
 ('way', 6),
 ('little', 6),
 ('paul', 6),
 ('tibbit', 6),
 ('dead', 5),
 ('stupid', 5),
 ('man', 5),
 ('asshole', 5),
 ('rather', 5),
 ('guy', 5),
 ('someone', 5)]