In [1]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
from sentence_transformers import SentenceTransformer
from sent2vec.vectorizer import Vectorizer

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
from sklearn import preprocessing
from collections import Counter
from nltk.tokenize import word_tokenize

In [4]:
# Lift the column-width limit so long comments are shown untruncated
pd.options.display.max_colwidth = None

In [5]:
# Load the raw cyberbullying tweets
df_cyberbullying = pd.read_csv(
    filepath_or_buffer='../Raw Data/cyberbullying_tweets.csv',
)

In [6]:
# Load the raw toxic tweets
df_abuse = pd.read_csv(
    filepath_or_buffer='../Raw Data/Toxic_tweets.csv',
)

In [7]:
# Map each dataset's numeric class codes onto the shared string labels
# (cyberbullying data: -1 / 0, toxic-tweets data: 1 / 0)
df_cyberbullying['label'] = df_cyberbullying['label'].replace(
    {-1: 'toxic', 0: 'not_toxic'}
)

df_abuse['Toxicity'] = df_abuse['Toxicity'].replace(
    {0: 'not_toxic', 1: 'toxic'}
)

In [8]:
# Discard the leftover 'Unnamed: 0' column from df_abuse
df_abuse = df_abuse.drop(columns=['Unnamed: 0'])

In [9]:
# Put the text column first and the label column second in df_abuse
df_abuse = df_abuse.loc[:, ['tweet', 'Toxicity']]

In [10]:
# Give both dataframes the same column names: 'comment' and 'label'
df_abuse = df_abuse.rename(
    {'tweet': 'comment', 'Toxicity': 'label'}, axis='columns'
)
df_cyberbullying = df_cyberbullying.rename(
    {'headline': 'comment', 'label': 'label'}, axis='columns'
)

In [11]:
# Drop the rows at positions 15307..18147, which hold non-English entries
df_cyberbullying = df_cyberbullying.drop(
    index=df_cyberbullying.index[15307:18148]
)

In [12]:
# Randomly shuffle both dataframes.
# A fixed random_state makes the shuffle (and therefore the 15000-row subset
# taken from df_abuse afterwards) identical on every Restart & Run All.
df_abuse = df_abuse.sample(frac=1, random_state=42).reset_index(drop=True)
df_cyberbullying = df_cyberbullying.sample(frac=1, random_state=42).reset_index(drop=True)

In [13]:
# Keep just the first 15000 rows of df_abuse (the full dataset is too large)
df_abuse = df_abuse.iloc[:15000]

In [14]:
# Stack the two dataframes row-wise into a single dataframe
df = pd.concat((df_abuse, df_cyberbullying), axis=0)

In [15]:
# Randomly shuffle the combined dataframe and rebuild a clean 0..n-1 index.
# A fixed random_state keeps the row order reproducible across runs.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [16]:
# NOTE: every pattern below is a regular expression, so regex=True is passed
# explicitly. pandas >= 2.0 defaults Series.str.replace to literal matching,
# which would silently leave all of these substitutions as no-ops.

# Remove all twitter handles and hashtags from the dataset
df['comment'] = df['comment'].str.replace(r'@([A-Za-z0-9_]+)', '', regex=True)
df['comment'] = df['comment'].str.replace(r'#([A-Za-z0-9_]+)', '', regex=True)

# Remove all punctuation and digits from dataset
df['comment'] = df['comment'].str.replace(r'[^\w\s]+', '', regex=True)
df['comment'] = df['comment'].str.replace(r'\d+', '', regex=True)

# Lowercase all comments
df['comment'] = df['comment'].str.lower()

# Remove all non-ASCII characters in the dataset
df['comment'] = df['comment'].str.replace(r'[^\x00-\x7F]+', '', regex=True)

# Trim excess whitespace around each entry
df['comment'] = df['comment'].str.strip()

  
  This is separate from the ipykernel package so we can avoid doing imports until
  
  import sys
  del sys.path[0]


In [17]:
# Lemmatize the corpus and remove stopwords
# Both corpora are fetched up front: the lemmatizer needs 'wordnet' and
# stopwords.words() raises LookupError when 'stopwords' is not installed.
nltk.download('wordnet');
nltk.download('stopwords');
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def lemmatize_text(text):
    """Return `text` with stop words removed and remaining words lemmatized.

    The text is split on any run of whitespace (spaces, tabs, newlines), so
    words separated by a line break are treated as separate tokens. Tokens
    found in the notebook-level `stop_words` set are dropped; every other
    token is passed through the notebook-level `lemmatizer`. The surviving
    tokens are joined back together with single spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty if no token survives.
    """
    # str.split() with no argument never yields empty tokens, so no extra
    # length check is needed.
    kept = [lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words]
    return ' '.join(kept)

# Strip stop words from, and lemmatize, every comment
df['comment'] = df['comment'].apply(lemmatize_text)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nikhildixit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
# Replace all blank comments with NaN.
# The pattern must be ^\s*$ (empty or whitespace-only); without the backslash
# it would instead match strings made up solely of the letter "s".
df = df.replace(r'^\s*$', float('NaN'), regex=True)

In [19]:
# Remove every row that contains a NaN value
df = df.dropna(axis='index')

In [20]:
# Rebuild the index; the old index is kept as a new 'index' column
df = df.reset_index()

In [21]:
# Remove the 'index' column left behind by the index reset
df = df.drop(columns='index')

In [22]:
# Display the cleaned dataframe (comment and label columns)
df

Unnamed: 0,comment,label
0,still stub,not_toxic
1,shut going rape sister,toxic
2,address said easily visible mean contact wrong provide link suggest every single one involved read statement improving page instead reverting whenever possible nice people assuming good faith trying resolve dispute issuing warning blocking paid attention,not_toxic
3,want girl whose pussy smell like tuna,toxic
4,haha still little bitch though,toxic
...,...,...
30183,rt arent bitch,toxic
30184,us song called warm kane beginning rapper big daddy kane big daddy thing album,not_toxic
30185,v shooting amp co tonight thanks gonna good gonna great,not_toxic
30186,one objected proposed section circumcision fetish gilgal fellow assume consensus delegate research get back asap,not_toxic


In [23]:
# Collect the cleaned comments into a plain Python list for the encoder
sentences = df['comment'].to_list()

In [24]:
# Build the sentence-embedding model from the 'all-MiniLM-L6-v2' checkpoint
# name, then encode every cleaned comment. The dataframe shown in the next
# cell has columns 0..383, i.e. one 384-value row per comment.
model = SentenceTransformer('all-MiniLM-L6-v2') # Create the embedding model
input_embeddings = model.encode(sentences) # Embed the sentences

In [25]:
# Wrap the embeddings in a dataframe (one row per comment) and display it
df_embeddings = pd.DataFrame(data=input_embeddings)
df_embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.057932,-0.050699,-0.018912,-0.009826,-0.010070,-0.050693,-0.014126,0.064496,-0.031303,-0.037353,...,0.017861,-0.008970,-0.012532,-0.057388,-0.072814,0.015746,0.123562,-0.112824,0.043558,0.030104
1,0.061240,0.072413,-0.055894,0.004737,0.027783,-0.033057,0.077130,-0.026322,0.099640,0.017140,...,0.091248,0.018495,0.007123,0.000334,-0.003742,0.036440,0.020731,-0.047870,0.023361,-0.085709
2,-0.095022,0.027380,0.042781,0.051248,-0.008985,0.010304,0.033868,-0.028216,-0.034435,-0.028091,...,-0.042843,0.000127,0.059880,0.025842,0.020706,0.011508,0.055526,-0.001586,-0.015047,0.081791
3,-0.090755,-0.069451,0.069232,-0.016478,-0.062734,-0.048155,0.126363,-0.028282,0.024995,-0.050577,...,0.025488,0.007893,-0.046482,0.016336,0.001823,0.008136,0.107453,0.059832,0.015946,-0.049930
4,0.041692,-0.052663,0.038262,0.043602,0.017715,-0.049124,0.066899,-0.002028,0.050640,0.008004,...,0.088411,-0.029898,0.020716,-0.068933,0.028030,0.010374,0.029054,-0.032414,0.009079,0.025800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30183,-0.029536,-0.061671,0.015002,0.061124,-0.053683,-0.000559,0.129367,0.025699,0.019967,0.049022,...,0.052163,0.020884,0.001558,-0.074392,-0.034704,0.012334,-0.005164,-0.027775,0.042330,-0.080739
30184,-0.056714,0.022325,-0.055365,0.043365,0.008140,-0.030472,0.053346,-0.009147,-0.096120,0.007934,...,-0.042529,-0.030869,-0.030660,0.021119,0.011152,-0.035333,0.085111,-0.024608,0.008348,-0.019476
30185,-0.045423,-0.001571,0.033592,-0.023650,-0.077973,0.098673,0.029494,-0.002361,-0.028838,0.045078,...,0.078066,0.044105,-0.021349,0.030237,-0.003708,0.013246,0.054398,0.004014,-0.015701,-0.044503
30186,-0.037155,0.060574,0.040728,0.037096,0.028945,-0.056856,0.008648,-0.014302,-0.035258,0.048966,...,0.055452,0.009304,0.080825,-0.002116,-0.097484,0.021749,0.082275,-0.060607,-0.045381,0.017792


In [26]:
# Persist the cleaned comments and their embeddings as CSV files
df.to_csv(path_or_buf='../Clean Data/clean_data.csv')
df_embeddings.to_csv(path_or_buf='../Clean Data/embeddings.csv')