In [2]:
import pandas as pd
import numpy as np
import re

In [1]:
def determine_final_toxicity(row): 
    labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    for i in labels: 
        if row[i] == 1: 
            return 1
    return 0

# Train Set

In [4]:
train = pd.read_csv('../data/train.csv') 
train['final_toxicity'] = train.apply(determine_final_toxicity, axis=1)
train

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,final_toxicity
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0,0


In [None]:
#export original training set
train.to_csv('train_original_new_column.csv', index=False)

In [4]:
def preprocess(df):
    
    # Lowercase letters
    df['comment_text'] = df['comment_text'].str.lower()
    
    # Remove special characters (i.e. @, $, %)
    df['comment_text'] = df['comment_text'].str.replace("[^a-z0-9!@#\$%\^\&\*_\-,\.' ]", " ") 
    
    # Remove punctuations - if you don't want to remove this, comment below out
    df['comment_text'] = df['comment_text'].str.replace('[^\w\s]', "")
    
    # Remove random numbers
    df['comment_text'] = df['comment_text'].str.replace("[^a-z' ]", "")
    
    # Remove repeating characters and extra spaces 
    # Examples: 'whaaat' --> 'what' OR 'hello   bye' --> 'hello bye'
    # Note: This does change words like "look" to "lok" cuz of the double letters, 
    # if we don't want this, we can remove the \w in the regex pattern below.
    repeat_pattern = re.compile(r'(\w|\s)\1*')
    match_substitution = r'\1'
    df['comment_text'] = df['comment_text'].apply(lambda x: repeat_pattern.sub(match_substitution, x))
    
    return df

In [5]:
clean = preprocess(train) #may take a few minutes to finish running

In [6]:
clean.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,final_toxicity
0,0000997932d777bf,explanation why the edits made under my userna...,0,0,0,0,0,0,0
1,000103f0d9cfb60f,daw he matches this background colour im semin...,0,0,0,0,0,0,0
2,000113f07ec002fd,hey man im realy not trying to edit war its ju...,0,0,0,0,0,0,0
3,0001b41b1c6bb37e,more i cant make any real sugestions on impro...,0,0,0,0,0,0,0
4,0001d958c54c6e35,you sir are my hero any chance you remember wh...,0,0,0,0,0,0,0


In [7]:
#export cleaned training set
clean.to_csv('train_cleaned_new_column.csv', index=False)

# Test Set

In [9]:
test_labels = pd.read_csv('test_labels.csv')
test_labels.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1


In [10]:
test_labels_new = test_labels[test_labels.toxic != -1]
test_labels_new['final_toxicity'] = test_labels_new.apply(determine_final_toxicity, axis=1)
test_labels_new

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_labels_new['final_toxicity'] = test_labels_new.apply(determine_final_toxicity, axis=1)


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,final_toxicity
5,0001ea8717f6de06,0,0,0,0,0,0,0
7,000247e83dcc1211,0,0,0,0,0,0,0
11,0002f87b16116a7f,0,0,0,0,0,0,0
13,0003e1cccfd5a40a,0,0,0,0,0,0,0
14,00059ace3e3e9a53,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
153150,fff8f64043129fa2,0,0,0,0,0,0,0
153151,fff9d70fe0722906,0,0,0,0,0,0,0
153154,fffa8a11c4378854,0,0,0,0,0,0,0
153155,fffac2a094c8e0e2,1,0,1,0,1,0,1


In [11]:
#filter out test set
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [12]:
test_new = test[test.id.isin(test_labels_new.id)]
test_new.head()

Unnamed: 0,id,comment_text
5,0001ea8717f6de06,Thank you for understanding. I think very high...
7,000247e83dcc1211,:Dear god this site is horrible.
11,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig..."
13,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ..."
14,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l..."


In [13]:
#export new column test labels
test_labels_new.to_csv('test_labels_new_column.csv', index=False)

In [14]:
#export new column test set
test_new.to_csv('test_filtered.csv', index=False)