In [1]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
from sentence_transformers import SentenceTransformer
from sent2vec.vectorizer import Vectorizer

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
from sklearn import preprocessing
from collections import Counter
from nltk.tokenize import word_tokenize

In [4]:
# Show the full contents of every cell instead of truncating long text columns
pd.options.display.max_colwidth = None

In [5]:
# Load the raw cyberbullying tweets from disk
df_cyberbullying = pd.read_csv(filepath_or_buffer='../Raw Data/cyberbullying_tweets.csv')

In [6]:
# Load the raw toxic tweets from disk
df_abuse = pd.read_csv(filepath_or_buffer='../Raw Data/Toxic_tweets.csv')

In [7]:
# Map the numeric labels of both datasets onto the shared
# 'toxic' / 'not_toxic' vocabulary so the frames can be combined later.
df_cyberbullying['label'] = df_cyberbullying['label'].replace({-1: 'toxic', 0: 'not_toxic'})

df_abuse['Toxicity'] = df_abuse['Toxicity'].replace({0: 'not_toxic', 1: 'toxic'})

In [8]:
# Discard the leftover 'Unnamed: 0' column from df_abuse
df_abuse = df_abuse.drop(columns=['Unnamed: 0'])

In [9]:
# Put the text column first and the label column second in df_abuse
df_abuse = df_abuse.loc[:, ['tweet', 'Toxicity']]

In [10]:
# Give both dataframes the same column names: 'comment' and 'label'
df_abuse = df_abuse.rename({'tweet': 'comment', 'Toxicity': 'label'}, axis='columns')
df_cyberbullying = df_cyberbullying.rename({'headline': 'comment', 'label': 'label'}, axis='columns')

In [11]:
# Remove some non-English entries in the dataset.
# The rows are selected by position (15307 up to, but not including, 18148),
# so this relies on the row order of the file as loaded.
non_english_rows = df_cyberbullying.index[15307:18148]
df_cyberbullying = df_cyberbullying.drop(index=non_english_rows)

In [12]:
# Randomly shuffle both dataframes.
# A fixed random_state makes the shuffle identical on every run, so any
# subset taken from the shuffled frames afterwards is reproducible.
df_abuse = df_abuse.sample(frac=1, random_state=42).reset_index(drop=True)
df_cyberbullying = df_cyberbullying.sample(frac=1, random_state=42).reset_index(drop=True)

In [13]:
# Keep just the first 15000 rows of df_abuse (dataset is too large otherwise)
df_abuse = df_abuse.iloc[:15000]

In [14]:
# Stack the two source dataframes on top of each other
frames_to_merge = [df_abuse, df_cyberbullying]
df = pd.concat(frames_to_merge)

In [15]:
# Randomly shuffle the dataframe.
# Seeded so the resulting row order is the same on every run.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [16]:
# Normalise the raw comment text in a single pass.
#
# regex=True is passed explicitly on every replace: the default for
# Series.str.replace changed to regex=False in pandas 2.0, which would make
# these patterns be treated as literal strings and silently clean nothing.
df['comment'] = (
    df['comment']
    # Remove all twitter handles and hashtags from the dataset
    .str.replace(r'@([A-Za-z0-9_]+)', '', regex=True)
    .str.replace(r'#([A-Za-z0-9_]+)', '', regex=True)
    # Remove all punctuation, then all digits
    .str.replace(r'[^\w\s]+', '', regex=True)
    .str.replace(r'\d+', '', regex=True)
    # Lowercase all comments
    .str.lower()
    # Remove all non-ASCII characters in the dataset
    .str.replace(r'[^\x00-\x7F]+', '', regex=True)
    # Trim excess whitespace around each entry
    .str.strip()
)

  
  This is separate from the ipykernel package so we can avoid doing imports until
  
  import sys
  del sys.path[0]


In [17]:
# Lemmatize the corpus and remove stopwords.
# Both NLTK resources are fetched up front: WordNet backs the lemmatizer and
# the stopwords corpus backs stopwords.words(), which raises LookupError when
# the corpus has not been downloaded yet.
nltk.download('wordnet')
nltk.download('stopwords')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def lemmatize_text(text):
    """Return `text` with English stopwords removed and remaining words lemmatized.

    The text is split on any run of whitespace (spaces, tabs, newlines), so
    words separated by a line break are treated as separate tokens rather than
    one fused token. The surviving lemmas are re-joined with single spaces;
    an input with no surviving words yields an empty string.
    """
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

df['comment'] = df['comment'].apply(lemmatize_text)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nikhildixit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
# Replace all blank comments with NaN.
# The pattern is ^\s*$ (empty or whitespace-only). Without the backslash it
# would instead match strings made up solely of the letter "s".
df = df.replace(r'^\s*$', float('NaN'), regex=True)

In [19]:
# Discard every row that contains a NaN value
df = df.dropna(axis='index')

In [20]:
# Renumber the rows from 0; the previous index is kept as a new 'index' column
df = df.reset_index()

In [21]:
# Remove the 'index' column from the dataframe
df = df.drop(labels='index', axis='columns')

In [22]:
# Display the dataframe as the cell output to inspect the result of the steps above
df

Unnamed: 0,comment,label
0,buddy fruit brandon said dirty as coon,toxic
1,follow lineup announcement today,not_toxic
2,like said vacuous drivel patent nonsense skill talking as certainly mastered izak,toxic
3,huh threaten as hole content blank remove added content,toxic
4,fuck u fuck vandal,toxic
...,...,...
30160,notice polka dick course beautiful send pat comic deem worthy love reading,toxic
30161,hello go fuck kid adult talking,toxic
30162,rt video derek jeter hit walkoff single final atbat yankee stadium httptcojmbiagxu httptcocij,not_toxic
30163,know talking two primary source ambrosius gildas main source pseudo nennius make use gildas plus add snippet chronicle material folklore cited previous version article along secondary source bede geoffrey reliable modern scholarly source chadwick woolf gidlow fleuriot respectable scholar need leave article alone unclear source,not_toxic


In [23]:
# Extract the comment column as a plain Python list
sentences = df.loc[:, 'comment'].to_list()

In [24]:
# Create the embedding model
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')
# Embed the sentences
input_embeddings = model.encode(sentences=sentences)

In [25]:
# Wrap the embeddings in a dataframe and display it
df_embeddings = pd.DataFrame(data=input_embeddings)
df_embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.061659,0.042924,-0.027909,-0.094398,-0.005569,-0.060932,0.148430,0.000449,-0.032557,-0.025835,...,0.056826,-0.018621,-0.008933,0.019845,0.003539,0.004085,0.044936,0.043737,0.035842,-0.047291
1,-0.078584,-0.017030,0.012841,0.042564,0.011090,0.093269,0.005907,-0.015763,0.011048,0.052986,...,0.050060,0.085719,-0.048181,0.000932,0.033048,-0.052803,0.060913,-0.099276,0.015163,0.015926
2,-0.018492,0.012215,0.039271,-0.066744,-0.140989,-0.005288,0.044681,0.044251,0.012833,-0.001225,...,0.091907,0.055456,-0.060903,-0.009517,-0.064932,-0.026313,0.098399,0.038947,0.070080,-0.012449
3,-0.028316,0.073130,-0.026052,0.070508,-0.013783,-0.002817,-0.028664,-0.081183,0.103029,-0.003240,...,0.021038,-0.029412,0.114542,0.026422,0.009733,0.062683,0.031651,-0.008130,0.117193,-0.024545
4,0.075620,0.070810,0.005195,-0.102053,0.062714,-0.024888,0.065374,-0.006156,0.008878,-0.009752,...,0.030477,-0.046901,-0.055749,-0.011700,-0.008204,0.015479,-0.013746,0.034643,0.087525,-0.085699
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30160,0.037771,-0.038717,-0.012030,-0.023955,-0.041778,0.007615,0.030302,-0.056470,-0.003735,-0.004116,...,0.018473,-0.088215,-0.032296,0.050388,-0.036271,0.108113,0.024633,0.000704,0.125703,-0.134695
30161,0.002563,0.072721,0.035123,0.020931,-0.025798,-0.079267,0.068680,-0.035519,0.053660,-0.033886,...,0.062581,0.022047,-0.020913,0.010086,-0.067338,0.001637,0.018102,0.053420,0.033898,-0.032134
30162,-0.062721,-0.001441,0.010116,-0.027680,0.058150,0.072341,-0.006970,0.090408,0.076522,-0.018395,...,0.025699,0.005768,-0.036862,0.026968,-0.112689,-0.041223,0.025079,0.049279,-0.032683,0.024724
30163,-0.003122,0.003426,-0.031749,0.018705,-0.028545,-0.054575,0.010559,-0.014771,0.020306,0.069420,...,-0.018666,-0.074635,0.024131,0.054823,0.033122,-0.046527,0.087907,0.004902,0.017821,0.073322


In [26]:
# Write the cleaned comments and their embeddings to disk as CSV files
for frame, destination in (
    (df, '../Clean Data/clean_data.csv'),
    (df_embeddings, '../Clean Data/embeddings.csv'),
):
    frame.to_csv(destination)