In [None]:
import os
import sys
from importlib import reload
sys.path.append('../src')

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import tqdm
from tqdm import tqdm_notebook
from tqdm import tqdm_pandas

## Study the toxicity label diversity

In [None]:
# Load the toxic-comment training CSV (path is relative to the notebook's working directory).
train = pd.read_csv('../input/jigsaw-toxic-comment-train.csv')

In [None]:
train.head()

In [None]:
train.columns

In [None]:
# Per-label column sum divided by the number of rows, times 100, sorted ascending.
# Labels can co-occur on one row, so the bars are not expected to add up to 100.
toxic_types_tr = ['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
toxic_types_tr.sort()
fig, ax = plt.subplots(facecolor='white')
(train[toxic_types_tr].sum()/len(train)*100).sort_values().plot(kind='barh', ax=ax)
ax.grid(alpha=0.5)
# Horizontal bars: the percentage runs along x, the label names along y.
ax.set_xlabel('Percentage %')
ax.set_ylabel('Toxicity type')
ax.set_title('Distribution of Toxic Comments')
plt.show()

In [None]:
# Load the unintended-bias training CSV (path is relative to the notebook's working directory).
train_ub = pd.read_csv('../input/jigsaw-unintended-bias-train.csv')

In [None]:
train_ub.head()

In [None]:
train_ub.columns

In [None]:
# NOTE(review): only the last expression of a cell is displayed, so the result of
# the next line is computed and then discarded.
train_ub[['toxic', 'rating']].head() # check how toxic relates to rating
# Sum the reaction counters for each distinct `toxic` value.
# NOTE(review): `.query('likes')` passes a bare column name instead of a boolean
# condition — confirm the intended filter (e.g. `likes > 0`) before relying on this.
train_ub[['toxic', 'funny', 'wow','sad', 'likes', 'disagree']].groupby('toxic').sum().query('likes')

In [None]:
# Give the two differently named train_ub label columns the names used in `train`.
column_renames = {'identity_attack': 'identity_hate', 'severe_toxicity': 'severe_toxic'}
train_ub_columns = np.array(
    [column_renames.get(name, name) for name in train_ub.columns],
    dtype=object,
)
train_ub.columns = train_ub_columns

In [None]:
train_ub.columns

In [None]:
# Label and identity columns of train_ub that are analysed below, in alphabetical order.
toxic_types_tr_ub = sorted([
    'severe_toxic', 'obscene',
    'identity_hate', 'insult', 'threat', 'asian', 'atheist', 'bisexual',
    'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu',
    'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability',
    'jewish', 'latino', 'male', 'muslim', 'other_disability',
    'other_gender', 'other_race_or_ethnicity', 'other_religion',
    'other_sexual_orientation', 'physical_disability',
    'psychiatric_or_mental_illness', 'transgender', 'white', 'sexual_explicit',
])

# Columns present in both label lists (order follows toxic_types_tr_ub).
common_cols = [col for col in toxic_types_tr_ub if col in toxic_types_tr]
common_cols

In [None]:
# Replace every missing value in train_ub with 0.
train_ub = train_ub.fillna(0)

In [None]:
train_ub[toxic_types_tr_ub].describe()

In [None]:
# FIXME: this plot is not accurate as the instances have multiple labels.
# Percentage of train_ub rows with a non-zero value in each column of
# toxic_types_tr_ub; columns that also exist in `train` (common_cols) are drawn in red.
fig, ax = plt.subplots(facecolor='white')
df = (train_ub[toxic_types_tr_ub].astype('bool').sum()/len(train_ub)*100).sort_values()
ax = df.plot(kind='barh', figsize=(10,15), ax=ax)
# Relies on the bars being drawn in the order of df.index (patch i <-> df.index[i]).
for bar, tag in zip(ax.patches, df.index):
    if tag in common_cols:
        bar.set_color('red')
ax.grid(alpha=0.5)
# Horizontal bars: the percentage runs along x, the column names along y.
ax.set_xlabel('Percentage %')
ax.set_ylabel('Toxicity type')
ax.set_title('Distribution of Toxic Comments')
plt.show()

In [None]:
# Count rows that carry more than one label. A label is "present" when its value
# is non-zero, matching the astype('bool') convention used for the per-label
# percentages; summing the raw values would treat one high score differently
# from several small ones.
label_present = train_ub[toxic_types_tr_ub].astype('bool')
multilabel_instance_count = (label_present.sum(axis=1) > 1).sum()
print(f"number of multilabel instance: {multilabel_instance_count}")
print(f"total number of instance in the train_ub: {len(train_ub)}")
print(f"percentage of instances with multiple labels: {multilabel_instance_count / len(train_ub) * 100:2.2f}")

In [None]:
# FIXME: this plot is not accurate as the instances have multiple labels.
# Per-label column sums of both training sets side by side on a log scale, with
# each data set's total row count drawn as a dashed reference line.
fig, ax = plt.subplots(facecolor='white')
toxic_types_counts = pd.DataFrame({'train': train[toxic_types_tr].sum(), 'train_ub': train_ub[common_cols].sum()})
toxic_types_counts.plot(kind='bar', ax=ax)
ax.set_yscale('log')
ax.grid(alpha=0.5)
ax.set_ylabel('Count')
ax.set_xlabel('Toxicity type')
ax.set_title('Distribution of Toxic Comments')
ax.hlines(len(train), -10, 10, label='train size', color='blue', linestyle='--')
ax.hlines(len(train_ub), -10, 10, label='train_ub size', color='orange', linestyle='--')
# A log axis cannot start at 0 (matplotlib ignores a non-positive limit), so start at 1.
ax.set_ylim(1, 2.5e6)
# The legend created by the bar plot predates the reference lines; rebuild it so
# their labels are shown as well.
ax.legend()
plt.show()

In [None]:
# observations:
# train_ub has a different proportion of toxic comments than train, which might affect how well we generalize to the test set

In [None]:
# Per label combination: sum of `toxic`, number of rows (toxic_and_non_toxic is a
# constant 1.0 per row) and number of rows with toxic > 0.5.
# assign() broadcasts the constant, so it does not depend on train's index.
train_tmp = train.assign(toxic_and_non_toxic=1.0)
# Vectorised comparison instead of a row-wise apply.
train_tmp['toxic_binary'] = train_tmp['toxic'] > 0.5 # if toxic is binary yields to same value
# numeric_only keeps the text columns out of the aggregation.
train_tmp_count = train_tmp.groupby(toxic_types_tr).sum(numeric_only=True)
train_tmp_count

In [None]:
# For every label combination: summed `toxic` divided by the number of rows in that group.
toxic_share = train_tmp_count['toxic'] / train_tmp_count['toxic_and_non_toxic']
ax = toxic_share.plot(kind='bar', figsize=(17,3))
ax.set_title('proportion of each toxicity type with the total instances with same label') # that might be classified as toxic/non-toxic
ax.grid(alpha=0.25)
plt.show()
# conclusion: some instances carry a toxicity label but were not identified as toxic
# (perhaps their toxicity was lower than 0.5!)

In [None]:
# Sanity check: the per-combination `toxic` sums add up to the number of rows with toxic >= 0.5.
# NOTE(review): this can only hold if `toxic` is strictly 0/1 in `train` — confirm.
assert train_tmp_count['toxic'].sum() / train.query('toxic >= 0.5').shape[0] == 1.0

In [None]:
toxic_types_tr

In [None]:
# Share of the toxic rows (toxic >= 0.5) that falls into each label combination, largest first.
n_toxic_rows = train.query('toxic >= 0.5').shape[0]
combo_share = (train_tmp_count['toxic'] / n_toxic_rows).sort_values(ascending=False)
ax = combo_share.plot(kind='bar', figsize=(17,3))
ax.set_title('Proportion of each toxicity type with the total instance with same label and classified as toxic (>0.5)')
ax.grid(alpha=0.25)
plt.show()

# observation:
#  ~35% of the toxic rows carry none of the sub-labels
#  ~27% insult and obscene
#  ~12% severe_toxic
#  ~8% insult only
#  ~5% insult, obscene and severe_toxic
#  ~5% identity_hate, insult and obscene

In [None]:
# Row counts of the two training sets; the field widths keep the numbers aligned.
print(f"number of instance in train:  {len(train):8}")
print(f"number of instance in train_ub: {len(train_ub):7}")

## Study the language diversity

In [None]:
from polyglot.detect import Detector
import re

def clean_text(text):
    """Return ``str(text)`` with noise removed.

    Applied in order: drop digits and double quotes, drop ``#...`` and ``@...``
    tokens, drop ``http``/``https`` links, then collapse every run of
    whitespace into a single space.
    """
    substitutions = (
        (r'[0-9"]', ''),
        (r'#[\S]+\b', ''),
        (r'@[\S]+\b', ''),
        (r'https?\S+', ''),
        (r'\s+', ' '),
    )
    cleaned = str(text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def lang_detector(text):
    """Return ``(code, confidence)`` of the language polyglot reports for ``text``.

    The text is first passed through ``clean_text`` and stripped of
    non-printable characters; the detector is run with ``quiet=True``.
    """
    printable_only = "".join(filter(str.isprintable, clean_text(text)))
    detected = Detector(printable_only, quiet=True).language
    return detected.code, detected.confidence

In [None]:
# Detect the language of every comment in `train` and attach the result as
# `lang` / `confidence` columns.
# The result is built on train's own index, and any columns left over from an
# earlier run are dropped first, so re-running the cell does not create
# duplicate `lang` / `confidence` columns.
langs = pd.DataFrame(
    train['comment_text'].apply(lang_detector).tolist(),
    columns=['lang', 'confidence'],
    index=train.index,
)
train = train.drop(columns=['lang', 'confidence'], errors='ignore').join(langs)

In [None]:
# Detect the language of every comment in `train_ub` and attach the result as
# `lang` / `confidence` columns.
# The result is built on train_ub's own index, and any columns left over from an
# earlier run are dropped first, so re-running the cell does not create
# duplicate `lang` / `confidence` columns.
langs_ub = pd.DataFrame(
    train_ub['comment_text'].apply(lang_detector).tolist(),
    columns=['lang', 'confidence'],
    index=train_ub.index,
)
train_ub = train_ub.drop(columns=['lang', 'confidence'], errors='ignore').join(langs_ub)

In [None]:
def language_count(train_data):
    """Count how many rows of ``train_data`` carry each language code.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame with a ``lang`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Language`` (codes in ascending order) and ``Count`` (integer
        number of rows per code), with a default integer index. Missing
        ``lang`` values are not counted.
    """
    # value_counts tallies in a single pass and keeps the counts as integers.
    counts = train_data["lang"].value_counts().sort_index()
    return pd.DataFrame({"Language": list(counts.index), "Count": counts.to_numpy()})

In [None]:
def english_vs_non_english(df_count):
    """Draw a horizontal bar chart of English vs. non-English row counts.

    Parameters
    ----------
    df_count : pandas.DataFrame
        Frame with a ``Language`` column of language codes and a numeric
        ``Count`` column (the shape returned by ``language_count``).

    Rows coded ``'un'`` are left out of both bars. If there is no ``'en'``
    row the English bar is 0 instead of the call failing. The chart is drawn
    on a new figure; nothing is returned.
    """
    is_english = df_count["Language"] == "en"
    is_unknown = df_count["Language"] == "un"
    df_en = pd.DataFrame(
        {"Count": [
            df_count.loc[is_english, "Count"].sum(),
            df_count.loc[~(is_english | is_unknown), "Count"].sum(),
        ]},
        index=["English: ", "non-English"],
    )
    df_en.sort_values(by="Count").plot(kind="barh")
    # plt.xscale('log')
    plt.grid(alpha=0.25)

In [None]:
def other_langs(df_count, min_count=50, top_n=8):
    """Draw a horizontal bar chart of the most frequent non-English languages.

    Parameters
    ----------
    df_count : pandas.DataFrame
        Frame with a ``Language`` column of language codes and a numeric
        ``Count`` column (the shape returned by ``language_count``).
    min_count : int, default 50
        Languages with fewer rows than this are left out.
    top_n : int, default 8
        Maximum number of languages shown; the ``top_n`` largest are kept.
        Expected to be a positive integer.

    Rows coded ``'en'`` or ``'un'`` are always excluded. The chart is drawn on
    a new figure; nothing is returned.
    """
    shown = (
        df_count
        .query("Language != 'en' and Language != 'un'")
        .query("Count >= @min_count")
        .set_index('Language')
        .sort_values(by='Count', ascending=True)
        # Ascending order, so the last rows are the largest counts.
        .iloc[-top_n:]
    )
    shown.plot(kind='barh')
    plt.grid(alpha=0.2)

In [None]:
# Language breakdown of `train`: English vs. non-English, then the most frequent other languages.
df_count = language_count(train)
english_vs_non_english(df_count)
other_langs(df_count)

In [None]:
# some cleaning might be useful based on unknown language, or language detected but with low confidence ..
# ..(especially when lang code is used as input to the model)
# 
# train.query("lang == 'en' and confidence < 80")
# train_ub.query("lang == 'en' and confidence < 80 and toxic > 0.5")

In [None]:
# Language breakdown of `train_ub`: English vs. non-English, then the most frequent other languages.
df_count_ub = language_count(train_ub)
english_vs_non_english(df_count_ub)
other_langs(df_count_ub)

In [None]:
# NOTE(review): this cell is identical to the previous one and recomputes the same
# counts and plots — confirm whether a different data set was intended, or remove it.
df_count_ub = language_count(train_ub)
english_vs_non_english(df_count_ub)
other_langs(df_count_ub)

In [None]:
df_count.query("Language == 'un'").Count

In [None]:
# TODO:
# it may be better to remove rows whose language is unknown or detected with low confidence from the train data
# it may also be worth comparing the language detection output with the lang tag in validation and test..
#  .. to understand any potential discrepancies.

In [None]:
train.head()

In [None]:
# Combined training frame (text + toxic label only), stacked row-wise.
# The original row indices are kept, so index values repeat across the three parts.
train_all = pd.concat([
    # every row of train
    train[['comment_text', 'toxic']],
    # train_ub rows whose toxic value is exactly 1
    train_ub[['comment_text', 'toxic']].query('toxic==1'),
    # a reproducible sample of 150000 train_ub rows whose toxic value is exactly 0
    # NOTE(review): rows with 0 < toxic < 1, if any exist, are dropped by these two filters — confirm this is intended.
    train_ub[['comment_text', 'toxic']].query('toxic==0').sample(n=150000, random_state=0)
])

In [None]:
# Run language detection on the combined frame and attach the result
# positionally (both sides get a fresh 0..n-1 index before the concat).
langs = pd.DataFrame(
    train_all['comment_text'].apply(lang_detector).tolist(),
    columns=['lang', 'confidence'],
)
train_all = pd.concat(
    [train_all.reset_index(drop=True), langs.reset_index(drop=True)],
    axis=1,
)

# Language breakdown of the combined frame.
df_count = language_count(train_all)
english_vs_non_english(df_count)
other_langs(df_count)

## Study the validation and test sets

In [None]:
# Load the validation CSV, using its first column as the index, and preview it.
valid = pd.read_csv('../input/validation.csv', index_col=0)
valid.head()

In [None]:
# Load the test CSV, using its first column as the index, and preview it.
test = pd.read_csv('../input/test.csv', index_col=0)
test.head()

In [None]:
# Rows per language in validation and test, indexed by language code.
# language_count reads a `lang` column, so both files must provide one
# (no detection is run on them in this notebook).
df_count_va = language_count(valid).set_index('Language')
df_count_te = language_count(test).set_index('Language')

In [None]:
# Test and validation counts side by side, aligned on the language code and
# ordered by the test count.
df_count_valid_test = pd.concat([df_count_te, df_count_va], axis=1)
df_count_valid_test.columns = ['Test', 'Valid']
ax = df_count_valid_test.sort_values(by='Test').plot(kind='bar')
ax.grid(alpha=0.25)
ax.set_ylabel('Count')
ax.set_xlabel('Language')
ax.set_title('Distribution of Languages in Validation and Test')
plt.show()