## Data Cleaning

In [1]:
import pandas as pd
import numpy as np
import os
import re
import multiprocessing as mp
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from wordcloud import WordCloud
from nltk.corpus import stopwords # Import the stop word list
import sys
from joblib import Parallel, delayed  
from textblob import TextBlob, Word
from nltk.corpus import wordnet as wn
from sklearn.utils import resample

pd.set_option('display.max_rows', 100) # to look at more rows of data later
pd.set_option('display.max_columns', 100) # to expand columns view so that all can be seen later

Steps for cleaning the data:

1. Load dataset
2. Fill all NaN values with 0 (since labels follow float values)
3. Lemmatize and remove stop words from the comments.
4. Store the data together with the labels information in a new dataframe, removing any duplicates and blank comments.

In [2]:
# Load the raw training set; the path is relative to the notebook's working directory.
data = pd.read_csv('../dataset/train.csv')

In [3]:
# Replace every NaN with 0. This applies to all columns, so a missing
# comment_text would also become the integer 0.
data = data.fillna(0)

In [119]:
# Per-column count of non-null values among rows whose target is at least 0.5.
# NaNs were filled with 0 above, so every column reports the same number.
data[data['target'] >= 0.5].count()

id                                     144334
target                                 144334
comment_text                           144334
severe_toxicity                        144334
obscene                                144334
identity_attack                        144334
insult                                 144334
threat                                 144334
asian                                  144334
atheist                                144334
bisexual                               144334
black                                  144334
buddhist                               144334
christian                              144334
female                                 144334
heterosexual                           144334
hindu                                  144334
homosexual_gay_or_lesbian              144334
intellectual_or_learning_disability    144334
jewish                                 144334
latino                                 144334
male                              

In [5]:
# English stop words from NLTK, held in a set for the membership test in stopwords_removal.
stops = set(stopwords.words('english'))

def replaceMultiple(mainString, toBeReplaced, newString):
    '''
    Substitute every listed substring of a text with one replacement string.

    The substitutions are applied one after another in the order given, so a
    later entry also matches text produced by an earlier substitution.

    Parameters
    ----------
    mainString: str
        Text in which the substitutions are made

    toBeReplaced: list
        Substrings to look for in the text

    newString: str
        Replacement inserted wherever one of the substrings occurs

    Returns
    -------
    String
        The text after all substitutions have been applied
    '''
    result = mainString
    for target in toBeReplaced:
        # str.replace leaves the text untouched when the target is absent,
        # so no separate membership check is needed.
        result = result.replace(target, newString)
    return result

# Pre-processing of text

def text_to_words(text):
    """
    Convert a raw comment into a space-separated string of cleaned words.

    Processing order: strip URLs, strip HTML markup, replace every non-letter
    character with a space, lower-case and lemmatize, then drop English stop
    words.

    Parameters
    ----------
    text: str
        Raw comment text. Non-string values (for example the 0 that
        ``data.fillna(0)`` leaves in place of a missing comment) are treated
        as an empty comment.

    Returns
    -------
    String
        The remaining words joined by single spaces; an empty string when
        nothing is left after cleaning.
    """
    if not isinstance(text, str):
        return ""

    # Remove URL links wherever they occur. The pattern is not anchored to the
    # start of a line and stops at the first whitespace, so the words that
    # follow a link are kept. A space is substituted so neighbours do not fuse.
    comment = re.sub(r'https?://\S+', ' ', text)

    # Remove HTML tags. The parser is named explicitly so the result does not
    # depend on which optional parsers happen to be installed.
    comment = BeautifulSoup(comment, 'html.parser').get_text()

    # Remove non-letters; digits and punctuation are not used as features.
    letters_only = re.sub("[^a-zA-Z]", " ", comment)

    # Lemmatize words
    words = lemmatize_with_postag(letters_only.lower())

    # Remove stop words. This is a plain set lookup per word, so it is done
    # in-process; starting a worker pool for every comment costs far more than
    # the lookups themselves.
    meaningful_words = (stopwords_removal(w) for w in words)

    # Join the surviving words back into one string separated by spaces
    return " ".join(w for w in meaningful_words if w)

def stopwords_removal(word):
    """
    Check a single word against the module-level ``stops`` set.

    Parameters
    ----------
    word: str
        Word to check.

    Returns
    -------
    String or None
        The word itself when it is not a stop word, otherwise None.
    """
    return None if word in stops else word
    
def lemmatize_with_postag(sentence):
    """
    Lemmatize every word of a sentence using its part-of-speech tag.

    Parameters
    ----------
    sentence: str
        Text to tag and lemmatize.

    Returns
    -------
    List
        One lemma per tagged word, in the order TextBlob reports them.
    """
    # First letter of the tag reported by TextBlob -> POS code passed to lemmatize().
    pos_by_initial = {"J": 'a',
                      "N": 'n',
                      "V": 'v',
                      "R": 'r'}
    lemmas = []
    for token, pos_tag in TextBlob(sentence).tags:
        # Any tag outside the four mapped groups is lemmatized as a noun.
        lemmas.append(token.lemmatize(pos_by_initial.get(pos_tag[0], 'n')))
    return lemmas
    

In [195]:
## Clean the data in batches (50000 rows per batch by default; 37 batches cover all 1804874 rows)

def clean_data(batch_num, batch_size=50000):
    """
    Clean one batch of comments from the module-level ``data`` frame.

    Parameters
    ----------
    batch_num: int
        1-based batch number. Batch ``k`` covers row positions
        ``(k - 1) * batch_size`` up to, but not including, ``k * batch_size``,
        clipped to the number of rows in ``data``.

    batch_size: int
        Number of rows per batch (defaults to 50000).

    Returns
    -------
    DataFrame
        Two columns: 'comment' (output of ``text_to_words``) and 'target'
        (the matching values of ``data['target']``), with a fresh 0-based index.
        Empty when the batch starts beyond the last row.

    Raises
    ------
    ValueError
        If ``batch_num`` or ``batch_size`` is smaller than 1.
    """
    if batch_num < 1:
        raise ValueError(f'batch_num must be >= 1, got {batch_num}.')
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}.')

    total_text = data.shape[0]
    start_index = (batch_num - 1) * batch_size
    end_index = min(batch_num * batch_size, total_text)

    # Take comments and targets with the same positional slice so the two
    # columns stay in step regardless of how ``data`` is indexed.
    comments = data['comment_text'].iloc[start_index:end_index]
    targets = data['target'].iloc[start_index:end_index]

    cleaned_text = []
    for position, raw_comment in enumerate(comments, start=start_index + 1):
        cleaned_text.append(text_to_words(raw_comment))

        # Report progress every 5000 comments (position is 1-based).
        if position % 5000 == 0:
            print(f'Review {position} of {total_text}.')
    print(f'Batch {batch_num} cleaning completed.')

    cleaned_text_df = pd.DataFrame(cleaned_text, columns=['comment'])
    cleaned_text_df['target'] = targets.tolist()

    return cleaned_text_df


In [62]:
# Batch 1: clean rows 0-49999.
cleaned_batch_1 = clean_data(1)

Review 5000 of 1804874.
Review 10000 of 1804874.
Review 15000 of 1804874.
Review 20000 of 1804874.
Review 25000 of 1804874.
Review 30000 of 1804874.
Review 35000 of 1804874.
Review 40000 of 1804874.
Review 45000 of 1804874.
Review 50000 of 1804874.
Batch 1 cleaning completed.


In [64]:
## Save batch 1 to csv right away to avoid losing the cleaned data
cleaned_batch_1.to_csv('../dataset/cleaned_batch_1.csv',index=False)

In [65]:
# Batch 2: clean rows 50000-99999 and save them to csv.
cleaned_batch_2 = clean_data(2)
cleaned_batch_2.to_csv('../dataset/cleaned_batch_2.csv',index=False)

Review 55000 of 1804874.
Review 60000 of 1804874.
Review 65000 of 1804874.
Review 70000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 75000 of 1804874.
Review 80000 of 1804874.
Review 85000 of 1804874.
Review 90000 of 1804874.
Review 95000 of 1804874.
Review 100000 of 1804874.
Batch 2 cleaning completed.


In [198]:
# Workaround kept from an earlier run: re-assign the targets of rows 50000-99999 by position.
# clean_data as defined above already attaches exactly this slice, so on a fresh
# run this cell leaves the values unchanged.
cleaned_batch_2['target'] = list(data['target'][50000:100000])

In [93]:
# Batch 3: clean rows 100000-149999 and save them to csv.
cleaned_batch_3 = clean_data(3)
cleaned_batch_3.to_csv('../dataset/cleaned_batch_3.csv',index=False)

Review 105000 of 1804874.
Review 110000 of 1804874.
Review 115000 of 1804874.
Review 120000 of 1804874.
Review 125000 of 1804874.
Review 130000 of 1804874.
Review 135000 of 1804874.
Review 140000 of 1804874.
Review 145000 of 1804874.
Review 150000 of 1804874.
Batch 3 cleaning completed.


In [199]:
# Workaround kept from an earlier run: re-assign the targets of rows 100000-149999 by position.
# clean_data as defined above already attaches exactly this slice, so on a fresh
# run this cell leaves the values unchanged.
cleaned_batch_3['target'] = list(data['target'][100000:150000])

In [94]:
# Batch 4: clean rows 150000-199999 and save them to csv.
cleaned_batch_4 = clean_data(4)
cleaned_batch_4.to_csv('../dataset/cleaned_batch_4.csv',index=False)

Review 155000 of 1804874.
Review 160000 of 1804874.
Review 165000 of 1804874.
Review 170000 of 1804874.
Review 175000 of 1804874.
Review 180000 of 1804874.
Review 185000 of 1804874.
Review 190000 of 1804874.
Review 195000 of 1804874.
Review 200000 of 1804874.
Batch 4 cleaning completed.


In [200]:
# Workaround kept from an earlier run: re-assign the targets of rows 150000-199999 by position.
# clean_data as defined above already attaches exactly this slice, so on a fresh
# run this cell leaves the values unchanged.
cleaned_batch_4['target'] = list(data['target'][150000:200000])

In [118]:
# Batch 5: clean rows 200000-249999 and save them to csv.
cleaned_batch_5 = clean_data(5)
cleaned_batch_5.to_csv('../dataset/cleaned_batch_5.csv',index=False)

Review 205000 of 1804874.
Review 210000 of 1804874.
Review 215000 of 1804874.
Review 220000 of 1804874.
Review 225000 of 1804874.
Review 230000 of 1804874.
Review 235000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 240000 of 1804874.
Review 245000 of 1804874.
Review 250000 of 1804874.
Batch 5 cleaning completed.


In [201]:
# Workaround kept from an earlier run: re-assign the targets of rows 200000-249999 by position.
# clean_data as defined above already attaches exactly this slice, so on a fresh
# run this cell leaves the values unchanged.
cleaned_batch_5['target'] = list(data['target'][200000:250000])

In [147]:
# Batch 6: clean rows 250000-299999 and save them to csv.
cleaned_batch_6 = clean_data(6)
cleaned_batch_6.to_csv('../dataset/cleaned_batch_6.csv',index=False)

Review 255000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 260000 of 1804874.
Review 265000 of 1804874.
Review 270000 of 1804874.
Review 275000 of 1804874.
Review 280000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 285000 of 1804874.
Review 290000 of 1804874.
Review 295000 of 1804874.
Review 300000 of 1804874.
Batch 6 cleaning completed.


In [202]:
# Workaround kept from an earlier run: re-assign the targets of rows 250000-299999 by position.
# clean_data as defined above already attaches exactly this slice, so on a fresh
# run this cell leaves the values unchanged.
cleaned_batch_6['target'] = list(data['target'][250000:300000])

In [265]:
# Batch 7: clean rows 300000-349999 and save them to csv.
cleaned_batch_7 = clean_data(7)
cleaned_batch_7.to_csv('../dataset/cleaned_batch_7.csv',index=False)

Review 305000 of 1804874.
Review 310000 of 1804874.
Review 315000 of 1804874.
Review 320000 of 1804874.
Review 325000 of 1804874.
Review 330000 of 1804874.
Review 335000 of 1804874.
Review 340000 of 1804874.
Review 345000 of 1804874.
Review 350000 of 1804874.
Batch 7 cleaning completed.


In [284]:
# Batch 8: clean rows 350000-399999 and save them to csv.
cleaned_batch_8 = clean_data(8)
cleaned_batch_8.to_csv('../dataset/cleaned_batch_8.csv',index=False)

  ' Beautiful Soup.' % markup)


Review 355000 of 1804874.
Review 360000 of 1804874.
Review 365000 of 1804874.
Review 370000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 375000 of 1804874.
Review 380000 of 1804874.
Review 385000 of 1804874.
Review 390000 of 1804874.
Review 395000 of 1804874.
Review 400000 of 1804874.
Batch 8 cleaning completed.


In [304]:
# Batch 9: clean rows 400000-449999 and save them to csv.
cleaned_batch_9 = clean_data(9)
cleaned_batch_9.to_csv('../dataset/cleaned_batch_9.csv',index=False)

Review 405000 of 1804874.
Review 410000 of 1804874.
Review 415000 of 1804874.
Review 420000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 425000 of 1804874.
Review 430000 of 1804874.
Review 435000 of 1804874.
Review 440000 of 1804874.
Review 445000 of 1804874.
Review 450000 of 1804874.
Batch 9 cleaning completed.


In [305]:
# Batch 10: clean rows 450000-499999 and save them to csv.
cleaned_batch_10 = clean_data(10)
cleaned_batch_10.to_csv('../dataset/cleaned_batch_10.csv',index=False)

Review 455000 of 1804874.
Review 460000 of 1804874.
Review 465000 of 1804874.
Review 470000 of 1804874.
Review 475000 of 1804874.
Review 480000 of 1804874.
Review 485000 of 1804874.
Review 490000 of 1804874.
Review 495000 of 1804874.
Review 500000 of 1804874.
Batch 10 cleaning completed.


In [306]:
# Batch 11: clean rows 500000-549999 and save them to csv.
cleaned_batch_11 = clean_data(11)
cleaned_batch_11.to_csv('../dataset/cleaned_batch_11.csv',index=False)

Review 505000 of 1804874.
Review 510000 of 1804874.
Review 515000 of 1804874.
Review 520000 of 1804874.
Review 525000 of 1804874.
Review 530000 of 1804874.
Review 535000 of 1804874.
Review 540000 of 1804874.
Review 545000 of 1804874.
Review 550000 of 1804874.
Batch 11 cleaning completed.


In [307]:
# Batch 12: clean rows 550000-599999 and save them to csv.
cleaned_batch_12 = clean_data(12)
cleaned_batch_12.to_csv('../dataset/cleaned_batch_12.csv',index=False)

Review 555000 of 1804874.
Review 560000 of 1804874.
Review 565000 of 1804874.
Review 570000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 575000 of 1804874.
Review 580000 of 1804874.
Review 585000 of 1804874.
Review 590000 of 1804874.
Review 595000 of 1804874.
Review 600000 of 1804874.
Batch 12 cleaning completed.


In [308]:
# Batch 13: clean rows 600000-649999 and save them to csv.
cleaned_batch_13 = clean_data(13)
cleaned_batch_13.to_csv('../dataset/cleaned_batch_13.csv',index=False)

Review 605000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 610000 of 1804874.
Review 615000 of 1804874.
Review 620000 of 1804874.
Review 625000 of 1804874.
Review 630000 of 1804874.
Review 635000 of 1804874.
Review 640000 of 1804874.
Review 645000 of 1804874.
Review 650000 of 1804874.
Batch 13 cleaning completed.


In [333]:
# Batch 14: clean rows 650000-699999 and save them to csv.
cleaned_batch_14 = clean_data(14)
cleaned_batch_14.to_csv('../dataset/cleaned_batch_14.csv',index=False)

Review 655000 of 1804874.
Review 660000 of 1804874.
Review 665000 of 1804874.
Review 670000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 675000 of 1804874.
Review 680000 of 1804874.
Review 685000 of 1804874.
Review 690000 of 1804874.
Review 695000 of 1804874.
Review 700000 of 1804874.
Batch 14 cleaning completed.


In [334]:
# Batch 15: clean rows 700000-749999 and save them to csv.
cleaned_batch_15 = clean_data(15)
cleaned_batch_15.to_csv('../dataset/cleaned_batch_15.csv',index=False)

Review 705000 of 1804874.
Review 710000 of 1804874.
Review 715000 of 1804874.
Review 720000 of 1804874.
Review 725000 of 1804874.
Review 730000 of 1804874.
Review 735000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 740000 of 1804874.
Review 745000 of 1804874.
Review 750000 of 1804874.
Batch 15 cleaning completed.


In [335]:
# Batch 16: clean rows 750000-799999 and save them to csv.
cleaned_batch_16 = clean_data(16)
cleaned_batch_16.to_csv('../dataset/cleaned_batch_16.csv',index=False)

Review 755000 of 1804874.
Review 760000 of 1804874.
Review 765000 of 1804874.
Review 770000 of 1804874.
Review 775000 of 1804874.
Review 780000 of 1804874.
Review 785000 of 1804874.
Review 790000 of 1804874.
Review 795000 of 1804874.
Review 800000 of 1804874.
Batch 16 cleaning completed.


In [336]:
# Batch 17: clean rows 800000-849999 and save them to csv.
cleaned_batch_17 = clean_data(17)
cleaned_batch_17.to_csv('../dataset/cleaned_batch_17.csv',index=False)

Review 805000 of 1804874.
Review 810000 of 1804874.
Review 815000 of 1804874.
Review 820000 of 1804874.
Review 825000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 830000 of 1804874.
Review 835000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 840000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 845000 of 1804874.
Review 850000 of 1804874.
Batch 17 cleaning completed.


In [361]:
# Batch 18: clean rows 850000-899999 and save them to csv.
cleaned_batch_18 = clean_data(18)
cleaned_batch_18.to_csv('../dataset/cleaned_batch_18.csv',index=False)

  ' Beautiful Soup.' % markup)


Review 855000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 860000 of 1804874.
Review 865000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 870000 of 1804874.
Review 875000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 880000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 885000 of 1804874.
Review 890000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 895000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 900000 of 1804874.
Batch 18 cleaning completed.


In [380]:
# Batch 19: clean rows 900000-949999 and save them to csv.
cleaned_batch_19 = clean_data(19)
cleaned_batch_19.to_csv('../dataset/cleaned_batch_19.csv',index=False)

  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 905000 of 1804874.
Review 910000 of 1804874.
Review 915000 of 1804874.
Review 920000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 925000 of 1804874.
Review 930000 of 1804874.
Review 935000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 940000 of 1804874.
Review 945000 of 1804874.
Review 950000 of 1804874.
Batch 19 cleaning completed.


In [381]:
# Batch 20: clean rows 950000-999999 and save them to csv.
cleaned_batch_20 = clean_data(20)
cleaned_batch_20.to_csv('../dataset/cleaned_batch_20.csv',index=False)

  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 955000 of 1804874.
Review 960000 of 1804874.
Review 965000 of 1804874.
Review 970000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 975000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 980000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 985000 of 1804874.
Review 990000 of 1804874.
Review 995000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 1000000 of 1804874.
Batch 20 cleaning completed.


In [386]:
# Batch 21: clean rows 1000000-1049999 and save them to csv.
cleaned_batch_21 = clean_data(21)
cleaned_batch_21.to_csv('../dataset/cleaned_batch_21.csv',index=False)

Review 1005000 of 1804874.
Review 1010000 of 1804874.
Review 1015000 of 1804874.
Review 1020000 of 1804874.
Review 1025000 of 1804874.
Review 1030000 of 1804874.
Review 1035000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1040000 of 1804874.
Review 1045000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 1050000 of 1804874.
Batch 21 cleaning completed.


In [387]:
# Batch 22: clean rows 1050000-1099999 and save them to csv.
cleaned_batch_22 = clean_data(22)
cleaned_batch_22.to_csv('../dataset/cleaned_batch_22.csv',index=False)

Review 1055000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 1060000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 1065000 of 1804874.
Review 1070000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1075000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1080000 of 1804874.
Review 1085000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 1090000 of 1804874.
Review 1095000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 1100000 of 1804874.
Batch 22 cleaning completed.


In [388]:
# Batch 23: clean rows 1100000-1149999 and save them to csv.
cleaned_batch_23 = clean_data(23)
cleaned_batch_23.to_csv('../dataset/cleaned_batch_23.csv',index=False)

Review 1105000 of 1804874.
Review 1110000 of 1804874.
Review 1115000 of 1804874.
Review 1120000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 1125000 of 1804874.
Review 1130000 of 1804874.
Review 1135000 of 1804874.
Review 1140000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1145000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 1150000 of 1804874.
Batch 23 cleaning completed.


In [389]:
# Batch 24: clean rows 1150000-1199999 and save them to csv.
cleaned_batch_24 = clean_data(24)
cleaned_batch_24.to_csv('../dataset/cleaned_batch_24.csv',index=False)

Review 1155000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 1160000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1165000 of 1804874.
Review 1170000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 1175000 of 1804874.
Review 1180000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 1185000 of 1804874.
Review 1190000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 1195000 of 1804874.
Review 1200000 of 1804874.
Batch 24 cleaning completed.


In [390]:
# Batch 25: clean rows 1200000-1249999 and save them to csv.
cleaned_batch_25 = clean_data(25)
cleaned_batch_25.to_csv('../dataset/cleaned_batch_25.csv',index=False)

Review 1205000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 1210000 of 1804874.
Review 1215000 of 1804874.
Review 1220000 of 1804874.
Review 1225000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 1230000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 1235000 of 1804874.
Review 1240000 of 1804874.
Review 1245000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 1250000 of 1804874.
Batch 25 cleaning completed.


In [391]:
# Batch 26: clean rows 1250000-1299999 and save them to csv.
cleaned_batch_26 = clean_data(26)
cleaned_batch_26.to_csv('../dataset/cleaned_batch_26.csv',index=False)

Review 1255000 of 1804874.
Review 1260000 of 1804874.
Review 1265000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1270000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1275000 of 1804874.
Review 1280000 of 1804874.
Review 1285000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1290000 of 1804874.
Review 1295000 of 1804874.
Review 1300000 of 1804874.
Batch 26 cleaning completed.


In [392]:
# Batch 27: clean rows 1300000-1349999 and save them to csv.
cleaned_batch_27 = clean_data(27)
cleaned_batch_27.to_csv('../dataset/cleaned_batch_27.csv',index=False)

Review 1305000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 1310000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1315000 of 1804874.
Review 1320000 of 1804874.
Review 1325000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1330000 of 1804874.
Review 1335000 of 1804874.
Review 1340000 of 1804874.
Review 1345000 of 1804874.
Review 1350000 of 1804874.
Batch 27 cleaning completed.


In [393]:
# Batch 28: clean rows 1350000-1399999 and save them to csv.
cleaned_batch_28 = clean_data(28)
cleaned_batch_28.to_csv('../dataset/cleaned_batch_28.csv',index=False)

Review 1355000 of 1804874.
Review 1360000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1365000 of 1804874.
Review 1370000 of 1804874.
Review 1375000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 1380000 of 1804874.
Review 1385000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 1390000 of 1804874.
Review 1395000 of 1804874.
Review 1400000 of 1804874.
Batch 28 cleaning completed.


In [394]:
# Batch 29: clean rows 1400000-1449999 and save them to csv.
cleaned_batch_29 = clean_data(29)
cleaned_batch_29.to_csv('../dataset/cleaned_batch_29.csv',index=False)

Review 1405000 of 1804874.
Review 1410000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1415000 of 1804874.
Review 1420000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1425000 of 1804874.
Review 1430000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1435000 of 1804874.
Review 1440000 of 1804874.
Review 1445000 of 1804874.
Review 1450000 of 1804874.
Batch 29 cleaning completed.


In [395]:
# Batch 30: clean rows 1450000-1499999 and save them to csv.
cleaned_batch_30 = clean_data(30)
cleaned_batch_30.to_csv('../dataset/cleaned_batch_30.csv',index=False)

  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 1455000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1460000 of 1804874.
Review 1465000 of 1804874.
Review 1470000 of 1804874.
Review 1475000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1480000 of 1804874.
Review 1485000 of 1804874.
Review 1490000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1495000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1500000 of 1804874.
Batch 30 cleaning completed.


In [396]:
# Batch 31: clean rows 1500000-1549999 and save them to csv.
cleaned_batch_31 = clean_data(31)
cleaned_batch_31.to_csv('../dataset/cleaned_batch_31.csv',index=False)

Review 1505000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 1510000 of 1804874.
Review 1515000 of 1804874.
Review 1520000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 1525000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1530000 of 1804874.
Review 1535000 of 1804874.
Review 1540000 of 1804874.
Review 1545000 of 1804874.
Review 1550000 of 1804874.
Batch 31 cleaning completed.


In [397]:
# Batch 32: clean rows 1550000-1599999 and save them to csv.
cleaned_batch_32 = clean_data(32)
cleaned_batch_32.to_csv('../dataset/cleaned_batch_32.csv',index=False)

Review 1555000 of 1804874.
Review 1560000 of 1804874.
Review 1565000 of 1804874.
Review 1570000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1575000 of 1804874.
Review 1580000 of 1804874.
Review 1585000 of 1804874.
Review 1590000 of 1804874.
Review 1595000 of 1804874.
Review 1600000 of 1804874.
Batch 32 cleaning completed.


In [398]:
# Batch 33: clean rows 1600000-1649999 and save them to csv.
cleaned_batch_33 = clean_data(33)
cleaned_batch_33.to_csv('../dataset/cleaned_batch_33.csv',index=False)

Review 1605000 of 1804874.
Review 1610000 of 1804874.
Review 1615000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1620000 of 1804874.
Review 1625000 of 1804874.
Review 1630000 of 1804874.
Review 1635000 of 1804874.
Review 1640000 of 1804874.
Review 1645000 of 1804874.
Review 1650000 of 1804874.
Batch 33 cleaning completed.


In [399]:
# Batch 34: clean rows 1650000-1699999 and save them to csv.
cleaned_batch_34 = clean_data(34)
cleaned_batch_34.to_csv('../dataset/cleaned_batch_34.csv',index=False)

Review 1655000 of 1804874.
Review 1660000 of 1804874.
Review 1665000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 1670000 of 1804874.
Review 1675000 of 1804874.
Review 1680000 of 1804874.
Review 1685000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1690000 of 1804874.
Review 1695000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1700000 of 1804874.
Batch 34 cleaning completed.


In [400]:
# Batch 35: clean rows 1700000-1749999 and save them to csv.
cleaned_batch_35 = clean_data(35)
cleaned_batch_35.to_csv('../dataset/cleaned_batch_35.csv',index=False)

Review 1705000 of 1804874.
Review 1710000 of 1804874.
Review 1715000 of 1804874.
Review 1720000 of 1804874.
Review 1725000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1730000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1735000 of 1804874.
Review 1740000 of 1804874.
Review 1745000 of 1804874.
Review 1750000 of 1804874.
Batch 35 cleaning completed.


In [401]:
# Batch 36: clean rows 1750000-1799999 and save them to csv.
cleaned_batch_36 = clean_data(36)
cleaned_batch_36.to_csv('../dataset/cleaned_batch_36.csv',index=False)

Review 1755000 of 1804874.
Review 1760000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1765000 of 1804874.
Review 1770000 of 1804874.
Review 1775000 of 1804874.
Review 1780000 of 1804874.


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Review 1785000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1790000 of 1804874.
Review 1795000 of 1804874.


  ' Beautiful Soup.' % markup)


Review 1800000 of 1804874.
Batch 36 cleaning completed.


In [402]:
# Batch 37 (last, partial batch): clean rows from 1800000 to the end of the data and save them to csv.
cleaned_batch_37 = clean_data(37)
cleaned_batch_37.to_csv('../dataset/cleaned_batch_37.csv',index=False)

Batch 37 cleaning completed.


In [403]:
## Merge the cleaned batch dataframes
# The batches are listed in order 1..37 and ignore_index=True renumbers the rows
# from 0, so row i of the merged frame holds the cleaned text of row i of `data`.
# The label-merging cells further down rely on this index correspondence.
df = [cleaned_batch_1,cleaned_batch_2,cleaned_batch_3,cleaned_batch_4,cleaned_batch_5,cleaned_batch_6,
     cleaned_batch_7,cleaned_batch_8,cleaned_batch_9,cleaned_batch_10,cleaned_batch_11,cleaned_batch_12,
     cleaned_batch_13,cleaned_batch_14,cleaned_batch_15,cleaned_batch_16,cleaned_batch_17,cleaned_batch_18,
     cleaned_batch_19,cleaned_batch_20,cleaned_batch_21,cleaned_batch_22,cleaned_batch_23,cleaned_batch_24,
     cleaned_batch_25,cleaned_batch_26,cleaned_batch_27,cleaned_batch_28,cleaned_batch_29,cleaned_batch_30,
     cleaned_batch_31,cleaned_batch_32,cleaned_batch_33,cleaned_batch_34,cleaned_batch_35,cleaned_batch_36,
     cleaned_batch_37]
cleaned_text_df = pd.concat(df,ignore_index=True)
# Sanity check: one row per row of `data`, with the 'comment' and 'target' columns.
cleaned_text_df.shape

(1804874, 2)

In [404]:
# Drop duplicate rows and rows whose cleaned comment is empty.
# drop_duplicates compares whole rows (comment and target together), so the
# same comment text with a different target value is kept.
# The index is not reset here: the label-merging cells below assign columns
# from `data` by index, which only works while the original row labels remain.
cleaned_text_df.drop_duplicates(inplace=True)
cleaned_text_df = cleaned_text_df[cleaned_text_df['comment'] != '']

In [405]:
# Rows remaining after removing duplicates and blank comments.
cleaned_text_df.shape

(1760811, 2)

In [365]:
# Preview the last rows of the cleaned dataframe.
cleaned_text_df.tail()

Unnamed: 0,comment,target
899995,exactly quite apart expert analysis support re...,0.0
899996,funny trevor noah smug guess learn brexit trum...,0.166667
899997,good question three office member embrace week...,0.0
899998,alceste upset mortgage house turn try tell get...,0.0
899999,assertion without definition specifically wron...,0.0


In [407]:
# Attach the toxicity sub-type labels from the raw data to the cleaned dataframe.
# The columns are aligned on the index, so every remaining row receives the
# labels of the raw row that carries the same index label.
toxic_labels = ['severe_toxicity','obscene','threat','insult','identity_attack','sexual_explicit']
cleaned_text_df = cleaned_text_df.assign(**{label: data[label] for label in toxic_labels})

In [408]:
# Attach the identity labels from the raw data to the cleaned dataframe.
# The columns are aligned on the index, so every remaining row receives the
# labels of the raw row that carries the same index label.
identity_labels = ['male','female','transgender','other_gender','heterosexual','homosexual_gay_or_lesbian','bisexual','other_sexual_orientation',
                   'christian','jewish','muslim','hindu','buddhist','atheist','other_religion','black','white','asian','latino','other_race_or_ethnicity',
                   'physical_disability','intellectual_or_learning_disability','psychiatric_or_mental_illness','other_disability']

cleaned_text_df = cleaned_text_df.assign(**{label: data[label] for label in identity_labels})

Store the final dataframe into a csv for later use.

In [406]:
# Write the final dataframe without its index column.
# NOTE(review): the execution counter of this cell (406) is lower than those of the
# two label-merging cells above (407, 408), so the file on disk may have been
# written before the label columns were added - re-run this cell last to confirm.
cleaned_text_df.to_csv('../dataset/cleaned_text_df.csv',index=False)