# Data fetcher and tokenizer

## Hardware setup

In [1]:
import torch
from torch import nn, cuda
import pandas as pd
import numpy as np

In [2]:
# print hardware
print("Devices:", torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
  print("- ", torch.cuda.get_device_name(i))

Devices: 1
-  Tesla T4


## Data fetching

In [6]:
# by-source datasets
youtube_url = 'https://s3-eu-west-1.amazonaws.com/pstorage-mendeley-9030361288/22895576/youtube_parsed_dataset.csv'
twitter_url = 'https://s3-eu-west-1.amazonaws.com/pstorage-mendeley-9030361288/22895537/twitter_parsed_dataset.csv'
kaggle_url = 'https://s3-eu-west-1.amazonaws.com/pstorage-mendeley-9030361288/22895477/kaggle_parsed_dataset.csv'

# sentiment datasets
aggression_url = 'https://s3-eu-west-1.amazonaws.com/pstorage-mendeley-9030361288/22895468/aggression_parsed_dataset.csv'
attack_url = 'https://s3-eu-west-1.amazonaws.com/pstorage-mendeley-9030361288/22895471/attack_parsed_dataset.csv'
toxicity_url = 'https://s3-eu-west-1.amazonaws.com/pstorage-mendeley-9030361288/22895489/toxicity_parsed_dataset.csv'
twitter_racism_url = 'https://s3-eu-west-1.amazonaws.com/pstorage-mendeley-9030361288/22895549/twitter_racism_parsed_dataset.csv'
twitter_sexism_url = 'https://s3-eu-west-1.amazonaws.com/pstorage-mendeley-9030361288/22895561/twitter_sexism_parsed_dataset.csv'

In [7]:
def parse_figshare_dataset(url, sentiment = None, source = None, columns = {}):
  """FigShare dataset downloading and parsing."""
  # override columns
  columns = pd.Series({'Text': 'text', 'ed_label_1': 'score','oh_label': 'label', **columns})
  columns = columns[~columns.isna()].to_dict()
  
  # download
  x = pd.read_csv(url)\
    .rename(columns = columns) # rename columns
  # add sentiment columns
  if sentiment is not None:
    x['sentiment'] = x[columns['oh_label']].apply(
        lambda v: sentiment if bool(v) else False
    )
    columns['sentiment'] = 'sentiment'
  # add source
  x['source'] = source if source is not None else 'unknown'
  columns['source'] = 'source'
  # project only wanted columns
  return x[[c for c in columns.values()]]

# parse datasets
aggression_df = parse_figshare_dataset(aggression_url, 'aggression')
attack_df     = parse_figshare_dataset(attack_url, 'attack')
toxicity_df   = parse_figshare_dataset(toxicity_url, 'toxicity')

# drop score column
aggression_df = aggression_df.drop('score', axis = 1)
attack_df     = attack_df.drop('score', axis = 1)
toxicity_df   = toxicity_df.drop('score', axis = 1)

In [8]:
def parse_figshare_twitter_dataset(*args, **kwargs):
  """FigShare Twitter dataset downloading and parsing."""
  cols = {'ed_label_1': None, 'Annotation': 'sentiment'}
  x = parse_figshare_dataset(*args, **kwargs, source = 'twitter', columns = cols)
  x['sentiment'] = x.sentiment.apply(lambda s: s if s != "none" else False)

# parse datasets
twitter_sexism_df = parse_figshare_twitter_dataset(twitter_sexism_url)
twitter_racism_df = parse_figshare_twitter_dataset(twitter_racism_url)
twitter_df        = parse_figshare_twitter_dataset(twitter_url)

In the sentiment column, *bully* is general type of negative sentiment.

In [9]:
def parse_figshare_youtube_kaggle_dataset(*args, **kwargs):
  """FigShare YouTube dataset downloading and parsing."""
  x = parse_figshare_dataset(*args, **kwargs,
                             sentiment = 'bully', columns = {'ed_label_1': None})

# parse dataset - bully is a general type of negative sentiment
youtube_df = parse_figshare_youtube_kaggle_dataset(youtube_url, source = 'youtube')
kaggle_df = parse_figshare_youtube_kaggle_dataset(kaggle_url, source = 'kaggle')

## Merging of data sets

In [10]:
# concatenate the datasets
df = pd.concat([aggression_df, attack_df, toxicity_df,
                twitter_sexism_df, twitter_racism_df,
                twitter_df, youtube_df, kaggle_df])
print(df.shape)

(391414, 4)


In [11]:
# drop duplicated comments and NA rows
df = df.drop_duplicates(subset = "text") # drops sentiment from many
print("After deduplicating:", df.shape)
df = df.dropna()
print("After removing empty:", df.shape)
# drop sentiment totally (for now)
try: # if ran several times
  df = df.drop('sentiment', axis = 1)
  print("After removing sentiment:", df.shape)
except: pass

After deduplicating: (196292, 4)
After removing empty: (196292, 4)
After removing sentiment: (196292, 3)


## Tokenization

Lemmatization is not done, because the particular form of the word in case of generating is important.

In [17]:
# load English tokenizer
import spacy
nlp = spacy.load("en_core_web_sm",
                 disable=["tagger","parser","ner","textcat"]) # to speed up

# implement tokenizer
def any_alnum(x):
  return any([c.isalnum() for c in x])
def preprocess_words(text):
  return [tok.text for tok in nlp(text) if any_alnum(tok.text)]

If tagging is done, the task is very computationally intensive. Hence only tokenization is used (for now).

In [14]:
# install and setyp PySpark (parallel processing)
#!pip install pyspark
#from pyspark import SparkContext,SparkConf

# turn on logs
#import logging
#logging.basicConfig(level = logging.INFO)

# create spark config
#conf = SparkConf().set("spark.ui.showConsoleProgress", "true")

# create spark context
#try: # for reruning
#  sc = SparkContext("local", "First App", conf = conf)
#except Exception as e: print(e)



In [None]:
def preprocess_words_tags(text): # to do this, load tagger from spacy
  return [(tok.text, tok.tag_,tok.tag) for tok in nlp(text) if any_alnum(tok.text)]

In [16]:
# tokenize in spark
#words = sc\
#  .parallelize(df.text)\
#  .map(preprocess_words_tags)

#output = "/content/gdrive/words.csv"

# remove output file (overwrite)
#import os, shutil
#try: shutil.rmtree(output)
#except Exception as e: print(e)

# save
#def toCSVLine(data):
#  return ','.join(str(d) for d in data)
#words\
#  .sample(withReplacement = False, fraction = 10**-4, seed = 12345)\
#  .map(toCSVLine)\
#  .coalesce(1)\
#  .saveAsTextFile(output)

# flush file to drive
#from google.colab import drive
#drive.flush_and_unmount()

#print(words.take(5))

KeyboardInterrupt: ignored

In [18]:
# tokenization 
progress_bar = True

if progress_bar:
  from tqdm.notebook import tqdm_notebook
  tqdm_notebook.pandas()
  df['text'] = df.text.progress_apply(preprocess_words)
else:
  df['text'] = df.text.apply(preprocess_words)

HBox(children=(FloatProgress(value=0.0, max=196292.0), HTML(value='')))




In [25]:
# text to str
df['text'] = df.text.apply(str)

# mount drive
from google.colab import drive
drive.mount('/drive', force_remount=True)

# output to csv (to drive)
df.to_csv('/drive/My Drive/Colab Notebooks/data/words.csv', index = False)

Mounted at /drive
