In [261]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [262]:
# Show full cell contents when displaying dataframes (no column-width truncation)
pd.options.display.max_colwidth = None

In [263]:
# Load the raw cyberbullying tweets from disk
cyberbullying_csv_path = './Raw Data/cyberbullying_tweets.csv'
df_cyberbullying = pd.read_csv(cyberbullying_csv_path)

In [264]:
# Load the raw toxic tweets from disk
toxic_tweets_csv_path = './Raw Data/Toxic_tweets.csv'
df_abuse = pd.read_csv(toxic_tweets_csv_path)

In [265]:
# Map each dataset's numeric labels onto the shared 'toxic' / 'not_toxic' vocabulary.
# The cyberbullying data marks toxic rows with -1, the toxic-tweets data with 1;
# both use 0 for non-toxic rows.
df_cyberbullying['label'] = df_cyberbullying['label'].replace({-1: 'toxic', 0: 'not_toxic'})
df_abuse['Toxicity'] = df_abuse['Toxicity'].replace({0: 'not_toxic', 1: 'toxic'})

In [266]:
# Discard the leftover 'Unnamed: 0' column from df_abuse
df_abuse = df_abuse.drop(columns=['Unnamed: 0'])

In [267]:
# Put the text column first and the label column second in df_abuse
df_abuse = df_abuse.loc[:, ['tweet', 'Toxicity']]

In [268]:
# Give both dataframes the same column names: 'comment' for the text, 'label' for the class.
# df_cyberbullying already calls its class column 'label', so only its text column is renamed.
abuse_column_names = {'tweet': 'comment', 'Toxicity': 'label'}
cyberbullying_column_names = {'headline': 'comment'}

df_abuse = df_abuse.rename(columns=abuse_column_names)
df_cyberbullying = df_cyberbullying.rename(columns=cyberbullying_column_names)

In [269]:
# Remove some non-English entries in the dataset.
# The slice is positional, so re-running this cell without reloading the data
# would drop a different set of rows.
non_english_rows = df_cyberbullying.index[15307:18148]
df_cyberbullying = df_cyberbullying.drop(index=non_english_rows)

In [270]:
# Stack the abuse rows on top of the cyberbullying rows in a single dataframe
frames_to_merge = [df_abuse, df_cyberbullying]
df = pd.concat(frames_to_merge)

In [271]:
# Renumber the rows from 0; the previous index is kept as a new 'index' column
df = df.reset_index()

In [272]:
# Discard the 'index' column that reset_index() added
df = df.drop(columns=['index'])

In [273]:
# Clean the raw comment text in place, one transformation per statement.
#
# Every pattern below is a regular expression, so regex=True is passed
# explicitly: pandas >= 2.0 treats the pattern as a literal string by default,
# which would silently leave handles, hashtags, punctuation and digits in place.

# Remove all twitter handles and hashtags from the dataset
df['comment'] = df['comment'].str.replace(r'@([A-Za-z0-9_]+)', '', regex=True)
df['comment'] = df['comment'].str.replace(r'#([A-Za-z0-9_]+)', '', regex=True)

# Remove all punctuation and digits from the dataset
df['comment'] = df['comment'].str.replace(r'[^\w\s]+', '', regex=True)
df['comment'] = df['comment'].str.replace(r'\d+', '', regex=True)

# Lowercase all comments
df['comment'] = df['comment'].str.lower()

# Remove all non-ASCII characters in the dataset
df['comment'] = df['comment'].str.replace(r'[^\x00-\x7F]+', '', regex=True)

# Trim excess whitespace around each entry
df['comment'] = df['comment'].str.strip()

  
  This is separate from the ipykernel package so we can avoid doing imports until
  
  import sys
  del sys.path[0]


In [274]:
# Lemmatize the corpus and remove stopwords.
# Both NLTK corpora used below must be present locally: 'wordnet' backs the
# lemmatizer and 'stopwords' backs stopwords.words(). Downloading is a no-op
# when the package is already up to date.
nltk.download('wordnet')
nltk.download('stopwords')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def lemmatize_text(text):
    """Drop stopwords from ``text`` and lemmatize the remaining words.

    The text is split on any run of whitespace (spaces, tabs, newlines), so
    tokens are never left with embedded whitespace that would make them miss
    the stopword filter or the lemmatizer.

    Relies on the module-level ``stop_words`` set and ``lemmatizer`` object.

    Args:
        text: The comment to process. Non-string values (e.g. the NaN pandas
            uses for missing text) are treated as an empty comment.

    Returns:
        The surviving lemmatized words joined by single spaces; an empty
        string when nothing survives.
    """
    if not isinstance(text, str):
        return ''

    # str.split() with no argument never yields empty tokens, so no length check is needed.
    kept_words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    return ' '.join(kept_words)

# Replace every comment with its stopword-free, lemmatized form
df['comment'] = df['comment'].apply(lemmatize_text)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nikhildixit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [275]:
# Randomly shuffle the dataframe and renumber the rows.
# A fixed seed keeps the row order (and therefore the exported CSV) identical
# every time the notebook is re-run from a fresh kernel.
SHUFFLE_SEED = 42
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [276]:
# Display the dataframe (pandas elides the middle rows of a large frame)
df

Unnamed: 0,comment,label
0,u see voice singer christina grimmie dy shooting,not_toxic
1,actually dude try edit anything page get whole user block page explaining etc,not_toxic
2,herpes vagina,toxic
3,cant treat hoe like lady,toxic
4,ahhhh might guessed,not_toxic
...,...,...
72047,fair use rationale image z butterfinger jpg thanks uploading contributing image z butterfinger jpg notice image page specifies image used fair use explanation rationale use wikipedia article constitutes fair use addition boilerplate fair use template must also write image description page specific explanation rationale using image article consistent fair use suggestion found please go image description page edit include fair use rationale using one template wikipedia fair use rationale guideline easy way ensure image compliance wikipedia policy remember must complete template simply insert blank template image page uploaded fair use medium consider checking specified fair use rationale page find list image page edited clicking contribution link located top wikipedia page logged selecting image dropdown box note non free medium lacking explanation deleted one week uploaded described criterion speedy deletion question please ask medium copyright question page thank,not_toxic
72048,put single hello joined wikipedia usage learning site try add input inprove page knew missing information noticed removed added put used single list although made video also released single artwork cover recieved airplay radio across usa make proper single see information http w x com page newmusic used put blockid feedid much information confirming single simple google search used put single believe fully make legit added list unless may miss interpeting something thank fez help,not_toxic
72049,article piece shit read weep jw fuck ing little high school piece shit one give crap opinion citation piece shit article going delete crap,toxic
72050,litter act sect shit try qld act amendment get shit letter box,toxic


In [277]:
# Write the cleaned dataframe to disk.
# NOTE(review): to_csv writes the row index by default, so the file gets an
# extra unnamed first column (read back by pandas as 'Unnamed: 0'). Pass
# index=False if downstream readers do not expect it - confirm before changing.
df.to_csv('./Clean Data/clean_data.csv')