In [125]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [126]:
# Show full cell contents instead of truncating long text columns
pd.options.display.max_colwidth = None

In [127]:
# Load the raw cyberbullying tweets from disk
cyberbullying_csv = './Raw Data/cyberbullying_tweets.csv'
df_cyberbullying = pd.read_csv(cyberbullying_csv)

In [128]:
# Load the raw toxic tweets from disk
toxic_tweets_csv = './Raw Data/Toxic_tweets.csv'
df_abuse = pd.read_csv(toxic_tweets_csv)

In [129]:
# Map each dataset's numeric labels onto the shared 'toxic' / 'not_toxic' vocabulary.
# Cyberbullying data: -1 -> toxic, 0 -> not_toxic
df_cyberbullying['label'] = (
    df_cyberbullying['label']
    .replace(-1, 'toxic')
    .replace(0, 'not_toxic')
)

# Toxic tweets data: 0 -> not_toxic, 1 -> toxic
df_abuse['Toxicity'] = (
    df_abuse['Toxicity']
    .replace(0, 'not_toxic')
    .replace(1, 'toxic')
)

In [130]:
# Discard the leftover 'Unnamed: 0' column from df_abuse
df_abuse = df_abuse.drop(columns=['Unnamed: 0'])

In [131]:
# Put the text column first and the label column second in df_abuse
abuse_column_order = ['tweet', 'Toxicity']
df_abuse = df_abuse[abuse_column_order]

In [132]:
# Give both dataframes the same column names: 'comment' and 'label'
abuse_column_names = {'tweet': 'comment', 'Toxicity': 'label'}
cyberbullying_column_names = {'headline': 'comment', 'label': 'label'}

df_abuse = df_abuse.rename(columns=abuse_column_names)
df_cyberbullying = df_cyberbullying.rename(columns=cyberbullying_column_names)

In [133]:
# Drop the run of non-English entries (positions 15307 up to, but excluding, 18148)
non_english_rows = df_cyberbullying.index[15307:18148]
df_cyberbullying = df_cyberbullying.drop(index=non_english_rows)

In [134]:
# Stack the two datasets on top of each other (df_abuse rows first)
frames_to_combine = [df_abuse, df_cyberbullying]
df = pd.concat(frames_to_combine)

In [135]:
# Renumber the rows from 0; the previous index is kept as an 'index' column
df = df.reset_index()

In [136]:
# Remove the 'index' column left behind by reset_index
df = df.drop(columns=['index'])

In [137]:
# Clean the raw comment text.
# Every pattern below is a regular expression, so regex=True is passed
# explicitly: pandas >= 2.0 treats the pattern as a literal string by default,
# which would silently leave the text unchanged.

# Remove all twitter handles and hashtags from the dataset
df['comment'] = df['comment'].str.replace(r'@([A-Za-z0-9_]+)', '', regex=True)
df['comment'] = df['comment'].str.replace(r'#([A-Za-z0-9_]+)', '', regex=True)

# Remove all punctuation from dataset (anything that is neither a word
# character nor whitespace)
df['comment'] = df['comment'].str.replace(r'[^\w\s]+', '', regex=True)

# Remove all digits from dataset
df['comment'] = df['comment'].str.replace(r'\d+', '', regex=True)

# Lowercase all comments
df['comment'] = df['comment'].str.lower()

# Remove all non-ASCII characters in the dataset
df['comment'] = df['comment'].str.replace(r'[^\x00-\x7F]+', '', regex=True)

# Trim excess whitespace around each entry
df['comment'] = df['comment'].str.strip()

  
  This is separate from the ipykernel package so we can avoid doing imports until
  
  import sys
  del sys.path[0]


In [138]:
# Lemmatize the corpus and remove stopwords
# Both corpora must be available locally: 'wordnet' backs WordNetLemmatizer and
# 'stopwords' backs stopwords.words(). Without the 'stopwords' download a fresh
# environment raises LookupError when building stop_words below.
nltk.download('wordnet');
nltk.download('stopwords');
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def lemmatize_text(text):
    """Lemmatize a comment and drop its stopwords.

    The text is split on any run of whitespace (spaces, tabs, newlines), each
    token found in the module-level ``stop_words`` set is discarded, and the
    remaining tokens are passed through the module-level ``lemmatizer``.

    Parameters
    ----------
    text : str
        The comment to process. Non-string values (e.g. NaN for a missing
        comment) are returned unchanged so a later ``dropna`` can remove them.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces; an empty
        string when no token survives.
    """
    # Missing comments arrive as NaN (a float) and have no .split()
    if not isinstance(text, str):
        return text

    # split() with no argument breaks on every kind of whitespace and never
    # yields empty tokens, so tab/newline-separated stopwords are caught too.
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in text.split()
        if token not in stop_words
    )

# Run every comment through the lemmatizer / stopword filter
lemmatized_comments = df['comment'].apply(lemmatize_text)
df['comment'] = lemmatized_comments

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nikhildixit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [139]:
# Replace all blank (empty or whitespace-only) entries with NaN.
# The pattern needs the escaped \s (whitespace class); an unescaped 's' would
# instead match entries made up solely of the letter "s".
df = df.replace(r'^\s*$', float('NaN'), regex=True)

In [140]:
# Discard every row that contains a NaN value
df = df.dropna(axis=0)

In [141]:
# Randomly shuffle the dataframe and renumber the rows from 0.
# A fixed random_state makes the shuffle (and therefore the saved CSV)
# reproducible across kernel restarts.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [142]:
# Display the cleaned, shuffled dataframe as this cell's output
df

Unnamed: 0,comment,label
0,went sister makeup cant find sample lipstick got looking forward using,not_toxic
1,car selfie,not_toxic
2,im finally get get,not_toxic
3,rt k michelle apart baltimore twitter shed one hoe saying shes celibate smh,toxic
4,rt really get excited see bad bitch association lunch everyday always make laugh,toxic
...,...,...
71584,he lying faggot used inspect element change number dollar account,toxic
71585,redirect talk g u district,not_toxic
71586,victim adam saleh refers black people abeed slave one sketch,toxic
71587,ive year mostly amp even leave die,toxic


In [143]:
# Persist the cleaned dataset; the row index is written as the first column
clean_data_csv = './Clean Data/clean_data.csv'
df.to_csv(clean_data_csv)