In [82]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [83]:
# Disable per-cell text truncation so the full text of every column value is
# shown when a dataframe is displayed (this does not change how many rows are shown)
pd.set_option('display.max_colwidth', None)

In [84]:
# Load the raw cyberbullying tweets from disk
cyberbullying_csv_path = '../Raw Data/cyberbullying_tweets.csv'
df_cyberbullying = pd.read_csv(cyberbullying_csv_path)

In [85]:
# Load the raw toxic tweets from disk
toxic_tweets_csv_path = '../Raw Data/Toxic_tweets.csv'
df_abuse = pd.read_csv(toxic_tweets_csv_path)

In [86]:
# Translate each dataset's numeric class codes into the shared
# 'toxic' / 'not_toxic' string labels
cyberbullying_label_names = {-1: 'toxic', 0: 'not_toxic'}
abuse_label_names = {0: 'not_toxic', 1: 'toxic'}

df_cyberbullying['label'] = df_cyberbullying['label'].replace(cyberbullying_label_names)
df_abuse['Toxicity'] = df_abuse['Toxicity'].replace(abuse_label_names)

In [87]:
# Discard the leftover 'Unnamed: 0' column from df_abuse
df_abuse = df_abuse.drop(columns=['Unnamed: 0'])

In [88]:
# Put the tweet text first and its label second in df_abuse
abuse_column_order = ['tweet', 'Toxicity']
df_abuse = df_abuse.loc[:, abuse_column_order]

In [89]:
# Give both dataframes the same column names: 'comment' and 'label'
abuse_renames = {'tweet': 'comment', 'Toxicity': 'label'}
cyberbullying_renames = {'headline': 'comment', 'label': 'label'}

df_abuse = df_abuse.rename(columns=abuse_renames)
df_cyberbullying = df_cyberbullying.rename(columns=cyberbullying_renames)

In [90]:
# Drop the run of rows at positions 15307 up to (not including) 18148,
# which holds non-English entries
non_english_rows = df_cyberbullying.index[15307:18148]
df_cyberbullying = df_cyberbullying.drop(non_english_rows)

In [91]:
# Randomly shuffle both dataframes.
# A fixed random_state makes the shuffle repeatable, so re-running the notebook
# keeps the same rows when df_abuse is truncated afterwards.
df_abuse = df_abuse.sample(frac=1, random_state=42).reset_index(drop=True)
df_cyberbullying = df_cyberbullying.sample(frac=1, random_state=42).reset_index(drop=True)

In [92]:
# Cap df_abuse at its first 15000 rows (the dataset is too large otherwise)
df_abuse = df_abuse.iloc[:15000]

In [93]:
# Stack the two dataframes row-wise into a single dataframe
frames_to_merge = [df_abuse, df_cyberbullying]
df = pd.concat(frames_to_merge)

In [94]:
# Randomly shuffle the combined dataframe so the two sources are interleaved.
# A fixed random_state keeps the resulting row order identical across runs.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [95]:
# NOTE: every pattern below is a regular expression, so regex=True is passed
# explicitly. pandas >= 2.0 treats the pattern as a literal string by default,
# which would silently turn each of these replacements into a no-op.

# Remove all twitter handles and hashtags from the dataset
df['comment'] = df['comment'].str.replace(r'@([A-Za-z0-9_]+)', '', regex=True)
df['comment'] = df['comment'].str.replace(r'#([A-Za-z0-9_]+)', '', regex=True)

# Remove all punctuation (anything that is neither a word character nor
# whitespace) and all digits from the dataset
df['comment'] = df['comment'].str.replace(r'[^\w\s]+', '', regex=True)
df['comment'] = df['comment'].str.replace(r'\d+', '', regex=True)

# Lowercase all comments
df['comment'] = df['comment'].str.lower()

# Remove all non-ASCII characters in the dataset
df['comment'] = df['comment'].str.replace(r'[^\x00-\x7F]+', '', regex=True)

# Trim excess whitespace around each entry
df['comment'] = df['comment'].str.strip()

  
  This is separate from the ipykernel package so we can avoid doing imports until
  
  import sys
  del sys.path[0]


In [96]:
# Lemmatize the corpus and remove stopwords
# Fetch both NLTK corpora used below. The stopwords corpus must be present or
# stopwords.words() raises LookupError; nltk.download leaves packages that are
# already up to date untouched.
nltk.download('wordnet')
nltk.download('stopwords')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def lemmatize_text(text):
    """Lemmatize ``text`` word by word, skipping words found in ``stop_words``.

    Args:
        text: A comment string. Non-string values (e.g. NaN for a missing
            comment) are returned unchanged instead of raising.

    Returns:
        The kept words, each passed through ``lemmatizer.lemmatize`` and
        joined by single spaces; an empty string when no word is kept.
    """
    if not isinstance(text, str):
        return text

    # split() with no argument breaks on any run of whitespace (spaces, tabs,
    # newlines) and never yields empty tokens, so words separated by a newline
    # are no longer glued together into a single token.
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

# Lemmatize every comment and strip its stopwords
df['comment'] = df['comment'].apply(lemmatize_text)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nikhildixit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [97]:
# Replace all blank (empty or whitespace-only) values with NaN.
# The pattern must be \s (whitespace); a bare "s" would instead match values
# made up solely of the letter "s" and wrongly discard them.
df = df.replace(r'^\s*$', float('NaN'), regex=True)

In [98]:
# Discard every row that contains a NaN value
df = df.dropna(axis=0)

In [99]:
# Renumber the rows from 0; the previous index is kept as an 'index' column
df = df.reset_index()

In [100]:
# Remove the 'index' column that holds the old row numbers
df = df.drop(columns=['index'])

In [101]:
df

Unnamed: 0,comment,label
0,dude dude stop busting ball,toxic
1,offended birther analogy consider analogy including title like dark moon apollo whistle blower bill kaysing see also section neil armstrong buzz aldrin fortunately book article yet stand amazon best seller list compared montford obviously moon landing hoax conspiracy theory quite bit obvious piece fringe nonsense yet sale figure would indicate taken seriously many except nasa partisan namely almost everybody,not_toxic
2,merge suggested article branding iron merged one,not_toxic
3,haha fuck edit page midarme cant mdiarme cant cause bitch wait,toxic
4,intended go page,not_toxic
...,...,...
30183,pansy multiple source reporting shot french warplane stop censoring wikipedia as hole claim libya manipulating report concidering coalition also taking account manipulating report cencoring information suck agsman,toxic
30184,sunday mood allanxreyes,not_toxic
30185,panel antitrump pundit felt like obviously pathetic attempt sway undecided voter suppoing trump,not_toxic
30186,bitch stop spreading mongol propaganda anything make bulgars look like bad guy must remove order spread propaganda like macedonian forum may add end getting owned would like take look ethnic map yugoslavia created nazi macedonian included seperate ethnic group right bulgaren mazedonier let closer look shall full cleaned nob article removed irrelevant information like stupid shit imro involvement talked war word count included bulgaria time macedonia entire article said take mongol propaganda shove as nazi,toxic


In [102]:
df.to_csv('../Clean Data/clean_data.csv')