In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip


In [2]:
import warnings
import gc,time
#nlp
import string
import re    #for regex
import nltk
from nltk.corpus import stopwords

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
sns.set_style("dark")
eng_stopwords = set(stopwords.words("english"))
warnings.filterwarnings("ignore")

In [3]:
train=pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
test=pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')
submission=pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip')
test_labels=pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip')

In [4]:
clean_df=train.loc[(train.toxic==0) &  (train.severe_toxic==0) &(train.obscene==0) & (train.threat==0)  &(train.insult==0) &(train.identity_hate==0)]
toxic_df=train.loc[(train.toxic==1)]

#creating test set
clean_test=clean_df.iloc[:28669] # 20 percent of total clean comments equal 28669
toxic_test=toxic_df.iloc[:3059]
test_set=clean_test.append(toxic_test,ignore_index=True).sample(frac=1)# appending 2 dataframes and shuffling them
test_set.drop(['id','severe_toxic','obscene','threat','insult','identity_hate'],axis=1,inplace=True)
print(test_set.shape
     )

#creating train set
clean_train=clean_df.iloc[28669:]
toxic_train=toxic_df.iloc[3059:]
df=clean_train.append(toxic_train,ignore_index=True).sample(frac=1)

# df=clean_df.append(toxic_df,ignore_index=1).sample(frac=1)# appending 2 dataframes and shuffling them
df.drop(['id','severe_toxic','obscene','threat','insult','identity_hate'],axis=1,inplace=True)
df.shape


(31728, 2)


(126912, 2)

In [5]:
# Applying a first round of text cleaning techniques

def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text
# Applying the cleaning function to both test and training datasets
df['comment_text'] = df['comment_text'].apply(lambda x: clean_text(x))
test_set['comment_text']=test_set['comment_text'].apply(lambda x:clean_text(x))

In [6]:
dictionary_clean={0:'clean',1:'toxic'}
df['target_name']=df['toxic'].map(dictionary_clean)
test_set['target_name']=test_set['toxic'].map(dictionary_clean)

# Hashing Vectorizer

In [7]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline
from imblearn.over_sampling import  SMOTE


smt = SMOTE(random_state=777, k_neighbors=1)

vec =  HashingVectorizer(stop_words='english', ngram_range=(1,2))

vec_fit=vec.fit_transform(df.comment_text)

clf = LogisticRegressionCV()



X_SMOTE, y_SMOTE = smt.fit_sample(vec_fit, df.toxic)


Using TensorFlow backend.


In [8]:
from collections import Counter
print(Counter(y_SMOTE))

Counter({0: 114677, 1: 114677})


In [9]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=0.1, solver='sag')
scores = cross_val_score(clf, X_SMOTE,y_SMOTE, cv=5,scoring='f1_weighted')

In [10]:
clf.fit(X_SMOTE,y_SMOTE)

LogisticRegression(C=0.1, solver='sag')

In [11]:
from sklearn import metrics

def print_report1(df):
    y_test =  df.toxic
    test_features=vec.transform(df.comment_text)
    y_pred = clf.predict(test_features)
    report = metrics.classification_report(y_test, y_pred,
        target_names=list(df.target_name.unique()))
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_pred)))

print_report1(test_set)

              precision    recall  f1-score   support

       toxic       0.98      0.86      0.92     28669
       clean       0.39      0.81      0.52      3059

    accuracy                           0.86     31728
   macro avg       0.68      0.84      0.72     31728
weighted avg       0.92      0.86      0.88     31728

accuracy: 0.859


## Testing Time

In [12]:
print(test_set.comment_text[11])
print('\n')
print(test_set.toxic[11])



0


## and what eli5 shows

In [13]:
import eli5
eli5.show_prediction(clf, test_set.comment_text[11], vec=vec,
                     target_names=list(df.target_name.unique()),top=15)
# it shows probability of each of  the 2 classes and then shows which features contributed the most and which
# contributed the least in each class
# top argument shows the  top n features that contibuted to the prediction of each class

Contribution?,Feature
+0.446,edit
+0.262,discussion
+0.259,notable
+0.243,look
+0.213,issue
+0.178,review
+0.160,isnt
+0.145,did
+0.133,matter
+0.107,certainly


## Features By Hashing Vectorizer

In [14]:
from eli5.sklearn import InvertableHashingVectorizer
import numpy as np

In [15]:
ivec = InvertableHashingVectorizer(vec)
sample_size = len(df) // 10
X_sample = np.random.choice(df.comment_text, size=sample_size)
ivec.fit(X_sample)

InvertableHashingVectorizer(vec=HashingVectorizer(ngram_range=(1, 2),
                                                  stop_words='english'))

In [16]:
eli5.show_weights(clf, vec=ivec, top=20,
                  target_names=['clean','toxic'])
#  this shows green words contributed mostly in toxic comments making

Weight?,Feature
+11.934,fuck
+9.772,fucking
+8.450,stupid
+8.309,shit
+7.336,idiot
+6.789,ass …
+5.365,suck
+5.215,bitch
+5.152,asshole
+5.010,hell


# WORD2VEC IS  NOT SUPPORTED


In [17]:
import bs4 as bs
import urllib.request
import re
import nltk

scrapped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence')
article = scrapped_data .read()

parsed_article = bs.BeautifulSoup(article,'lxml')

paragraphs = parsed_article.find_all('p')

article_text = ""

for p in paragraphs:
    article_text += p.text

In [18]:
# Cleaing the text
processed_article = article_text.lower()
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article )
processed_article = re.sub(r'\s+', ' ', processed_article)

# Preparing the dataset
all_sentences = nltk.sent_tokenize(processed_article)

all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

# Removing Stop Words
from nltk.corpus import stopwords
for i in range(len(all_words)):
    all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]

In [19]:
from gensim.models import Word2Vec

word2vec = Word2Vec(all_words, min_count=2)

In [20]:
vocabulary = word2vec.wv.vocab

In [21]:
v1 = word2vec.wv['artificial']
print(v1.shape)

(100,)


In [22]:
word2vec.save("word2vec.model")

In [23]:
import eli5
eli5.show_prediction(word2vec, test_set.comment_text[11], vec=vec,
                     target_names=list(df.target_name.unique()),top=15)