In [4]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [5]:
# Show full cell contents when displaying dataframes (no column-width truncation)
pd.options.display.max_colwidth = None

In [6]:
# Load the raw cyberbullying tweets from the raw-data folder into a dataframe
df_cyberbullying = pd.read_csv(filepath_or_buffer='../Raw Data/cyberbullying_tweets.csv')

In [7]:
# Load the raw toxic tweets from the raw-data folder into a dataframe
df_abuse = pd.read_csv(filepath_or_buffer='../Raw Data/Toxic_tweets.csv')

In [8]:
# Map the numeric labels of both datasets onto the shared
# 'toxic' / 'not_toxic' vocabulary so the frames can be combined later.
# Cyberbullying data: -1 -> 'toxic', 0 -> 'not_toxic'
df_cyberbullying['label'] = (
    df_cyberbullying['label']
    .replace(-1, 'toxic')
    .replace(0, 'not_toxic')
)

# Toxic tweets data: 0 -> 'not_toxic', 1 -> 'toxic'
df_abuse['Toxicity'] = (
    df_abuse['Toxicity']
    .replace(0, 'not_toxic')
    .replace(1, 'toxic')
)

In [9]:
# Remove the leftover 'Unnamed: 0' column from df_abuse
df_abuse = df_abuse.drop(columns='Unnamed: 0')

In [10]:
# Reorder df_abuse so the text column comes first and the label second
df_abuse = df_abuse.loc[:, ['tweet', 'Toxicity']]

In [11]:
# Give both dataframes the same column names: 'comment' (text) and 'label'
df_abuse = df_abuse.rename({'tweet': 'comment', 'Toxicity': 'label'}, axis='columns')
df_cyberbullying = df_cyberbullying.rename({'headline': 'comment', 'label': 'label'}, axis='columns')

In [12]:
# Remove some non-English entries in the dataset.
# The slice is positional (rows 15307 up to, but not including, 18148);
# the index labels found at those positions are the rows that get dropped.
df_cyberbullying = df_cyberbullying.drop(index=df_cyberbullying.index[15307:18148])

In [13]:
# Stack the two dataframes vertically into a single dataframe
df = pd.concat(objs=[df_abuse, df_cyberbullying], axis=0)

In [14]:
# Rebuild a fresh 0..n-1 index; the previous index is kept as an 'index' column
df = df.reset_index()

In [15]:
# Discard the 'index' column, which is not needed in the combined dataframe
df = df.drop(columns='index')

In [16]:
# Normalise the raw comment text.
#
# Every pattern below is a regular expression, so regex=True is passed
# explicitly: since pandas 2.0 Series.str.replace defaults to literal
# matching, which would silently leave the text unchanged.

# Remove all twitter handles and hashtags from the dataset
df['comment'] = df['comment'].str.replace(r'@([A-Za-z0-9_]+)', '', regex=True)
df['comment'] = df['comment'].str.replace(r'#([A-Za-z0-9_]+)', '', regex=True)

# Remove all punctuation and digits from dataset
df['comment'] = df['comment'].str.replace(r'[^\w\s]+', '', regex=True)
df['comment'] = df['comment'].str.replace(r'\d+', '', regex=True)

# Lowercase all comments
df['comment'] = df['comment'].str.lower()

# Remove all non-ASCII characters in the dataset
df['comment'] = df['comment'].str.replace(r'[^\x00-\x7F]+', '', regex=True)

# Trim excess whitespace around each entry
df['comment'] = df['comment'].str.strip()

  
  This is separate from the ipykernel package so we can avoid doing imports until
  
  import sys
  del sys.path[0]


In [17]:
# Lemmatize the corpus and remove stopwords
# Fetch both NLTK resources used below: 'wordnet' backs the lemmatizer and
# 'stopwords' backs stopwords.words('english'), which raises LookupError
# when the corpus has not been downloaded.
nltk.download('wordnet');
nltk.download('stopwords');
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def lemmatize_text(text):
    """Drop stopwords from ``text`` and lemmatize the remaining words.

    Args:
        text: The comment to clean. Non-string values (e.g. NaN from a
            missing comment) are returned unchanged so that they can be
            removed by a later ``dropna``.

    Returns:
        The kept words, lemmatized and joined by single spaces. The result
        is an empty string when every word is a stopword.
    """
    if not isinstance(text, str):
        return text

    # split() with no argument splits on any run of whitespace (spaces, tabs,
    # newlines) and never yields empty tokens, so no length check is needed.
    lemmatized = [
        lemmatizer.lemmatize(w)
        for w in text.split()
        if w not in stop_words
    ]
    return ' '.join(lemmatized)

# Run the lemmatizer / stopword filter over every comment
df['comment'] = df['comment'].apply(lemmatize_text)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nikhildixit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
# Replace all blank comments with NaN.
# r'^\s*$' matches empty and whitespace-only strings. The backslash matters:
# without it the pattern matches strings made only of the letter "s".
df = df.replace(r'^\s*$', float('NaN'), regex=True)

In [19]:
# Remove every row that contains at least one NaN value
df = df.dropna(axis='index')

In [20]:
# Randomly shuffle the dataframe.
# A fixed random_state makes the shuffle, and therefore any subset taken
# from it afterwards, reproducible across kernel restarts.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [21]:
# Keep at most the first 30,000 rows of the dataframe
df = df.iloc[:30000]

In [22]:
# Display the cleaned dataframe as this cell's output
df

Unnamed: 0,comment,label
0,life make let make,not_toxic
1,pete make think go around spread insult personal attack make think call arrogant rude kick would report ongoing behavior block maybe someone see page recruiting editor irrelevant read post mine yahoo see told everyone regardless position appreciate presence trying convince people come post pov mind fact someone come post pov completely beyond control finally discus edits discussed talk page week four editor edit make original proposal deleting entire section however preserved critical point paragraph moved reception steiner get rid problem undue weight,not_toxic
2,hot chocolate famous amos bomb right,not_toxic
3,homo sex ity french fried faggot going cut ball serve platter mushroom shoving dick as mutilating boyfriend ball front kill,toxic
4,whew productive,not_toxic
...,...,...
29995,rt much wife wet pussy want another dick head,toxic
29996,let say plain simple two,not_toxic
29997,orr thts lor bae tho wyd u fucking hoe,toxic
29998,hoe,toxic


In [23]:
# Write the cleaned dataset to disk.
# index=True (the pandas default) also writes the row index as a first column
# with an empty header; it is spelled out here to make that explicit.
# NOTE(review): confirm downstream readers expect that extra column.
df.to_csv('../Clean Data/clean_data.csv', index=True)