In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

In [2]:
df_toxic = pd.read_csv ("data/train.csv", sep=",")

In [3]:
df_toxic

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [4]:
import spacy
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
#nlp = spacy.load('en_core_web_sm')
#stop_words = spacy.lang.en.stop_words.STOP_WORDS
stop_words=""

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [5]:
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))

In [6]:
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [9]:
from sklearn.model_selection import train_test_split

X = df_toxic['comment_text'] # the features we want to analyze
ylabels = df_toxic['toxic'] # the labels, or answers, we want to test against

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.5)

In [10]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
modelLR = LogisticRegression()

# Create pipeline using Bag of Words
pipe = Pipeline([('preprocessing', tfidf_vector),
                 ('regression-ML', modelLR)])

# model generation
pipe.fit(X_train,y_train)



Pipeline(memory=None,
         steps=[('preprocessing',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_p...
                                 tokenizer=<function spacy_tokenizer at 0x7f0cd88abc80>,
                                 use_idf=True, vocabulary=None)),
                ('regression-ML',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit

In [11]:
from sklearn import metrics
# Predicting with a test dataset
predicted = pipe.predict(X_test)
print(predicted)

# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

[0 0 0 ... 0 0 0]
Logistic Regression Accuracy: 0.9553305091118742
Logistic Regression Precision: 0.9194889474751572
Logistic Regression Recall: 0.5887547071808856


In [12]:
#Evaluación del rendimiento del clasificador
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, predicted)
print(confusion_matrix)
#Print de la matriz de confusión
from sklearn.metrics import classification_report
print(classification_report(y_test, predicted))

[[71688   397]
 [ 3167  4534]]
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     72085
           1       0.92      0.59      0.72      7701

    accuracy                           0.96     79786
   macro avg       0.94      0.79      0.85     79786
weighted avg       0.95      0.96      0.95     79786



In [13]:
def printNMostInformative(vectorizer, model, N):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(model.coef_[0], feature_names))
    topClass1 = coefs_with_fns[:N]
    topClass2 = coefs_with_fns[:-(N + 1):-1]
    print("Class 1 best: ")
    for feat in topClass1:
        print(feat)
    print("Class 2 best: ")
    for feat in topClass2:
        print(feat)

In [15]:
printNMostInformative(tfidf_vector, modelLR, 20)

Class 1 best: 
(-5.239109281750547, 'thank')
(-3.3751269117458955, 'if')
(-3.29608608986991, 'please')
(-3.0757682229082848, 'redirect')
(-2.8681006814164403, 'for')
(-2.8352630869505657, 'may')
(-2.761485891503433, 'source')
(-2.621041663407331, 'talk')
(-2.5975803851831496, 'continue')
(-2.5505298482245498, 'sorry')
(-2.3957473660726234, 'interest')
(-2.372381901700311, 'at')
(-2.287466338592273, 'utc')
(-2.270327853895497, 'there')
(-2.2695964327562788, 'but')
(-2.2231286979655103, 'help')
(-2.2077327455857207, 'the')
(-2.1076965808023584, 'welcome')
(-2.0407431961982456, 'consider')
(-2.0232181285594795, 'would')
Class 2 best: 
(17.653727021354747, 'fuck')
(11.346723163838455, 'shit')
(10.924362732885838, 'idiot')
(10.710048621668266, 'stupid')
(9.495524904393971, 'suck')
(8.079788561738146, 'ass')
(7.785793057278761, 'bitch')
(7.651744956959871, 'asshole')
(7.4783725703673545, 'crap')
(7.24608993211174, 'bullshit')
(7.17540316100395, 'faggot')
(6.851190925570692, 'moron')
(6.59729