In [1]:
import h2o
from h2o.estimators.word2vec import H2OWord2vecEstimator
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
import pandas as pd
import numpy as np
import re
import textstat
from functools import partial


In [2]:
# Preparations
# Load the slang dictionary: every non-blank line is "<slang><TAB><expansion>".
with open('/home/mackenzie/Downloads/slang.txt') as file:
    slang_entries = (line.partition('\t') for line in file if line.strip())
    slang_map = {slang.strip(): expansion.strip() for slang, _tab, expansion in slang_entries}

# Longest entries first, so the regex alternation tries longer slang terms before shorter ones.
slang_words = sorted(slang_map, key=len, reverse=True)
slang_alternation = "|".join(re.escape(word) for word in slang_words)
regex = re.compile(r"\b({})\b".format(slang_alternation))
# replaceSlang(text) substitutes every whole-word slang match with its expansion from slang_map.
replaceSlang = partial(regex.sub, lambda m: slang_map[m.group(1)])

# NLTK English stop words, extended with the lower-cased retweet marker.
new_stopwords = ['rt']
STOP_WORDS = set(stopwords.words('english')).union(new_stopwords)

FileNotFoundError: [Errno 2] No such file or directory: '/home/mackenzie/Downloads/slang.txt'

In [3]:
# All Feature Extraction Functions

# functions for H2o Word2Vec
def tokenizeFunc(sentences, stop_word = STOP_WORDS):
    """Split an H2O text column into a frame of lower-cased, filtered tokens.

    Parameters
    ----------
    sentences : H2OFrame
        Frame holding the text to tokenize (the notebook passes a single column).
    stop_word : collection of str, optional
        Tokens to drop. Defaults to the module-level ``STOP_WORDS``.
        Previously this argument was accepted but silently ignored in favour of
        the global; it is now honoured.

    Returns
    -------
    H2OFrame
        The tokenized frame after removing tokens shorter than two characters,
        tokens containing a digit, and stop words. NA rows are explicitly kept
        by the length and stop-word filters.
    """
    # Round-trip through pandas so every value is a str before re-uploading
    # it to H2O as a string column.
    text_df = sentences.as_data_frame().astype(str)
    string_frame = h2o.H2OFrame(python_obj=text_df, column_types=["string"])

    # Split on runs of non-word characters and normalise case.
    tokens = string_frame.tokenize("\\W+").tolower()

    # Drop one-character tokens (NA rows pass through).
    tokens = tokens[(tokens.nchar() >= 2) | (tokens.isna()), :]
    # Drop any token that contains a digit.
    tokens = tokens[tokens.grep("[0-9]", invert=True, output_logical=True), :]
    # Drop stop words supplied by the caller (NA rows pass through).
    tokens = tokens[(tokens.isna()) | (~ tokens.isin(stop_word)), :]
    return tokens

def h2o_w2vec(data, str):
    """Train a word2vec model on `data` and return its averaged vectors.

    `data` is tokenized with `tokenizeFunc`, an H2OWord2vecEstimator is fitted
    on the resulting tokens, and the same tokens are transformed with the
    "AVERAGE" aggregate method. `str` is a label used only in the progress
    messages.

    NOTE(review): the parameter name `str` shadows the builtin inside this
    function; it is kept because it is part of the existing signature.
    NOTE(review): each call fits its own model, so vectors produced by two
    separate calls (e.g. train and test) come from different models.
    """
    label = str
    print("Break " + label + " into sequence of words")
    token_frame = tokenizeFunc(data)

    print("Build word2vec model for " + label)
    model = H2OWord2vecEstimator(sent_sample_rate=0.0, epochs=10)
    model.train(training_frame=token_frame)

    return model.transform(token_frame, aggregate_method="AVERAGE")

# NGrams Freq function
def ngrams(text, min_n, max_n, str):
    """Return word n-gram counts for `text` as a pandas DataFrame.

    Parameters
    ----------
    text : iterable of str
        Documents to vectorize.
    min_n, max_n : int
        Inclusive n-gram range passed to CountVectorizer.
    str : str
        Label used only in the progress message (shadows the builtin locally).

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per n-gram (at most 1000 columns,
        as limited by `max_features`).
    """
    print("Completing ngram generation for " + str)
    vectorizer = CountVectorizer(ngram_range=(min_n, max_n), max_features=1000)
    counts = vectorizer.fit_transform(text).toarray()
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back only on old releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()
    return pd.DataFrame(counts, columns=vocabulary)

# Char-NGrams Freq function
def char_ngrams(text, min_n, max_n, str):
    """Return character n-gram counts for `text` as a pandas DataFrame.

    Parameters
    ----------
    text : iterable of str
        Documents to vectorize.
    min_n, max_n : int
        Inclusive n-gram range passed to CountVectorizer.
    str : str
        Label used only in the progress message (shadows the builtin locally).

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per character n-gram produced by
        the 'char_wb' analyzer (at most 1000 columns, as limited by
        `max_features`).
    """
    print("Completing ngram generation for " + str)
    vectorizer = CountVectorizer(ngram_range=(min_n, max_n), max_features=1000, analyzer='char_wb')
    counts = vectorizer.fit_transform(text).toarray()
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back only on old releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()
    return pd.DataFrame(counts, columns=vocabulary)

# TFIDF function
def tfidf(text, min_n, max_n, str):
    """Return word n-gram TF-IDF weights for `text` as a pandas DataFrame.

    Parameters
    ----------
    text : iterable of str
        Documents to vectorize.
    min_n, max_n : int
        Inclusive n-gram range passed to TfidfVectorizer.
    str : str
        Label used only in the progress message (shadows the builtin locally).

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per n-gram (at most 1000 columns,
        as limited by `max_features`); weights are rounded to two decimals.
    """
    print("Completing tfidf+ngram generation for " + str)
    vectorizer = TfidfVectorizer(ngram_range=(min_n, max_n), max_features=1000)
    weights = vectorizer.fit_transform(text).toarray()
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back only on old releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()
    return pd.DataFrame(np.round(weights, 2), columns=vocabulary)

# Char-TFIDF function
def char_tfidf(text, min_n, max_n, str):
    """Return character n-gram TF-IDF weights for `text` as a pandas DataFrame.

    Parameters
    ----------
    text : iterable of str
        Documents to vectorize.
    min_n, max_n : int
        Inclusive n-gram range passed to TfidfVectorizer.
    str : str
        Label used only in the progress message (shadows the builtin locally).

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per character n-gram produced by
        the 'char_wb' analyzer (at most 1000 columns, as limited by
        `max_features`); weights are rounded to two decimals.
    """
    print("Completing tfidf+ngram generation for " + str)
    vectorizer = TfidfVectorizer(ngram_range=(min_n, max_n), max_features=1000, analyzer='char_wb')
    weights = vectorizer.fit_transform(text).toarray()
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back only on old releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()
    return pd.DataFrame(np.round(weights, 2), columns=vocabulary)


# Sentiment Analysis function
def sentimentAnalyzer(tweets, str):
    """Score every tweet with VADER and return the compound sentiment values.

    Parameters
    ----------
    tweets : iterable of str
        Tweets to score. They are iterated positionally, so a pandas Series
        with a non-default index (e.g. after filtering) is handled correctly;
        the previous ``tweets[i]`` lookup was label-based and failed or picked
        the wrong rows in that case.
    str : str
        Label used only in the progress message (shadows the builtin locally).

    Returns
    -------
    pandas.DataFrame
        A single column ``'sentiment'`` holding the 'compound' entry of
        ``polarity_scores`` for each tweet, indexed 0..n-1. The column is built
        in one step instead of cell by cell, so it gets a numeric dtype rather
        than ``object``.
    """
    sid = SentimentIntensityAnalyzer()
    print("Completing the sentiment analysis for " + str)
    compound_scores = [sid.polarity_scores(tweet).get('compound') for tweet in tweets]
    return pd.DataFrame({'sentiment': compound_scores})

# Linguistic Feature Extraction 
def text_length(text):
    """Return the length of `text` as reported by `len`."""
    n_chars = len(text)
    return n_chars

def number_of_tokens(text):
    """Return how many tokens `nltk.word_tokenize` produces for `text`."""
    return len(nltk.word_tokenize(text))

def is_retweet(text):
    """Return 1 if the exact token 'RT' occurs in `text`, otherwise 0.

    The check is case-sensitive and is done on the `nltk.word_tokenize`
    tokens, not on raw substrings.
    """
    return 1 if 'RT' in nltk.word_tokenize(text) else 0

def number_of_mentions(text):
    """Count occurrences of '@' followed by at least one non-space character."""
    mention_pattern = r"@\S+"
    return sum(1 for _match in re.finditer(mention_pattern, text))

def number_of_hashtags(text):
    """Count hashtags: '#' followed by at least one non-space character.

    A '#' directly preceded by '&' is skipped. Such sequences are HTML numeric
    character references (e.g. ``&#128514;``), which `number_of_emoticons`
    already counts; the previous pattern ``#\\S+`` counted them as hashtags too.
    """
    # The negative lookbehind excludes the '#' that belongs to an '&#...' entity.
    return len(re.findall(r"(?<!&)#\S+", text))

def number_of_links(text):
    """Count occurrences of 'http' followed by at least one non-space character."""
    link_pattern = r"http\S+"
    return sum(1 for _match in re.finditer(link_pattern, text))

def number_of_elongated(text):
    """Count whitespace-separated words containing a character repeated three times in a row."""
    triple_repeat = re.compile(r"(.)\1{2}")
    elongated = 0
    for word in text.split():
        if triple_repeat.search(word) is not None:
            elongated += 1
    return elongated

def number_of_slangs(text):
    """Count the tokens of `text` that are entries of the slang dictionary.

    `text` is tokenized with `nltk.word_tokenize`; a token counts when it is a
    key of the module-level `slang_map`. `slang_words` is just the sorted list
    of those same keys, so the result is unchanged, but the dict lookup is
    constant-time instead of a linear scan of the list for every token.
    """
    return sum(1 for word in nltk.word_tokenize(text) if word in slang_map)

def number_of_emoticons(text):
    """Count occurrences of '&#' followed by at least one non-space character."""
    entity_pattern = r"&#\S+"
    return sum(1 for _match in re.finditer(entity_pattern, text))

def linguisticFeatures(tweets, str):
    """Compute the per-tweet linguistic features as a pandas DataFrame.

    Parameters
    ----------
    tweets : iterable of str
        Tweets to analyse. They are iterated positionally, so a pandas Series
        with a non-default index (e.g. after filtering) is handled correctly;
        the previous ``tweets[i]`` lookup was label-based and failed or picked
        the wrong rows in that case.
    str : str
        Label used only in the progress message (shadows the builtin locally).

    Returns
    -------
    pandas.DataFrame
        One row per tweet (indexed 0..n-1) with the columns listed in
        `feature_extractors` below, in that order.
    """
    print("Completing the liguistic feature extraction for " + str)
    # (output column name, extractor) pairs; the order defines the column order.
    feature_extractors = [
        ('text length', text_length),
        ('number of words', number_of_tokens),
        ('retweet', is_retweet),
        ('number of mentions', number_of_mentions),
        ('number of hashtags', number_of_hashtags),
        ('number of links', number_of_links),
        ('number of elongated', number_of_elongated),
        ('number of slangs', number_of_slangs),
        ('number of emoticons', number_of_emoticons),
    ]
    rows = [[extract(tweet) for _name, extract in feature_extractors] for tweet in tweets]
    column_names = [name for name, _extract in feature_extractors]
    # Passing `columns` keeps all feature columns present even for empty input.
    return pd.DataFrame(rows, columns=column_names)

# Textstat score extraction
def remove_user_names(text):
    """Return `text` with every '@' + non-space run deleted."""
    mention_pattern = re.compile(r'@\S+')
    return mention_pattern.sub('', text)

def remove_hashtags(text):
    """Return `text` with every '#' + non-space run deleted."""
    hashtag_pattern = re.compile(r'#\S+')
    return hashtag_pattern.sub('', text)

def remove_links(text):
    """Return `text` with every 'http' + non-space run deleted."""
    link_pattern = re.compile(r'http\S+')
    return link_pattern.sub('', text)

def remove_underscore(text):
    """Return `text` with all underscore characters removed."""
    return ''.join(text.split('_'))

def textstatScores(tweets, str):
    """Compute textstat readability scores for every tweet.

    Before scoring, mentions, hashtags, links and underscores are stripped from
    each tweet with the `remove_*` helpers.

    Parameters
    ----------
    tweets : iterable of str
        Tweets to score. They are iterated positionally, so a pandas Series
        with a non-default index (e.g. after filtering) is handled correctly;
        the previous ``tweets[i]`` lookup was label-based and failed or picked
        the wrong rows in that case.
    str : str
        Label used only in the progress message (shadows the builtin locally).

    Returns
    -------
    pandas.DataFrame
        One row per tweet (indexed 0..n-1) with the columns listed in
        `scorers` below, in that order.
    """
    print("Completing the text stat scores extraction for " + str)
    # (output column name, textstat scorer) pairs; the order defines the column order.
    scorers = [
        ('flesch reading ease', textstat.flesch_reading_ease),
        ('flesch kincaid grade', textstat.flesch_kincaid_grade),
        ('gunning fog', textstat.gunning_fog),
        ('smog index', textstat.smog_index),
        ('automated readability index', textstat.automated_readability_index),
        ('coleman liau index', textstat.coleman_liau_index),
        ('linsear write formula', textstat.linsear_write_formula),
        ('dale chall readability score', textstat.dale_chall_readability_score),
    ]
    rows = []
    for tweet in tweets:
        cleaned = remove_user_names(tweet)
        cleaned = remove_hashtags(cleaned)
        cleaned = remove_links(cleaned)
        cleaned = remove_underscore(cleaned)
        rows.append([score(cleaned) for _name, score in scorers])
    column_names = [name for name, _score in scorers]
    # Passing `columns` keeps all score columns present even for empty input.
    return pd.DataFrame(rows, columns=column_names)

In [4]:
# Connect to an H2O instance; the captured output below shows a local server
# being started when none is already running.
h2o.init()
# Uncomment on first use if the NLTK stop-word corpus has not been downloaded yet.
# NOTE(review): stopwords are already loaded in the preparations cell above, so
# the download would have to happen before that cell runs.
#nltk.download('stopwords')

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "11.0.2" 2019-01-15 LTS; Java(TM) SE Runtime Environment 18.9 (build 11.0.2+9-LTS); Java HotSpot(TM) 64-Bit Server VM 18.9 (build 11.0.2+9-LTS, mixed mode)
  Starting server from /home/mackenzie/anaconda3/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpu5tl7jg8
  JVM stdout: /tmp/tmpu5tl7jg8/h2o_mackenzie_started_from_python.out
  JVM stderr: /tmp/tmpu5tl7jg8/h2o_mackenzie_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,Europe/Vienna
H2O data parsing timezone:,UTC
H2O cluster version:,3.24.0.5
H2O cluster version age:,23 days
H2O cluster name:,H2O_from_python_mackenzie_79hb59
H2O cluster total nodes:,1
H2O cluster free memory:,1.922 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [5]:
# prepare data for feature extraction, check if paths are correct
filepath_train = "/home/mackenzie/workspace/PycharmProjects/DAADRISE_AbusiveLangProject/featureExtraction/EnglishCleanedTrainingData (1).csv"
filepath_test = "/home/mackenzie/workspace/PycharmProjects/DAADRISE_AbusiveLangProject/featureExtraction/EnglishCleanedTestingData (1).csv"

# H2O copies of both splits (consumed by the word2vec step).
train_data_h2o = h2o.upload_file(filepath_train)
test_data_h2o = h2o.upload_file(filepath_test)

# pandas copies of the same files (consumed by every other feature extractor).
train_data = pd.read_csv(filepath_train)
test_data = pd.read_csv(filepath_test)

# Target column of each split.
train_labels = train_data["labels"]
test_labels = test_data["labels"]

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [6]:
# Word2Vec features for both splits, converted from H2O frames to pandas data frames
vecs_train = h2o_w2vec(train_data_h2o['cleaned_tweet'], 'train').as_data_frame()
vecs_test = h2o_w2vec(test_data_h2o['cleaned_tweet'], 'test').as_data_frame()

Break train into sequence of words
Parse progress: |█████████████████████████████████████████████████████████| 100%
Build word2vec model for train
word2vec Model Build progress: |██████████████████████████████████████████| 100%
Break test into sequence of words
Parse progress: |█████████████████████████████████████████████████████████| 100%
Build word2vec model for test
word2vec Model Build progress: |██████████████████████████████████████████| 100%


In [7]:
# Word n-gram (1 to 3) frequency features -- NOTE no header column
ngram_train, ngram_test = (
    ngrams(train_data['cleaned_tweet'], 1, 3, 'train'),
    ngrams(test_data['cleaned_tweet'], 1, 3, 'test'),
)
# Character n-gram (2 to 5) frequency features -- NOTE no header column
char_ngram_train, char_ngram_test = (
    char_ngrams(train_data['cleaned_tweet'], 2, 5, 'train'),
    char_ngrams(test_data['cleaned_tweet'], 2, 5, 'test'),
)

Completing ngram generation for train
Completing ngram generation for test
Completing ngram generation for train
Completing ngram generation for test


In [8]:
# Word n-gram (1 to 3) TF-IDF features -- NOTE no header column
tfidf_train, tfidf_test = (
    tfidf(train_data['cleaned_tweet'], 1, 3, 'train'),
    tfidf(test_data['cleaned_tweet'], 1, 3, 'test'),
)
# Character n-gram (2 to 5) TF-IDF features -- NOTE no header column
char_tfidf_train, char_tfidf_test = (
    char_tfidf(train_data['cleaned_tweet'], 2, 5, 'train'),
    char_tfidf(test_data['cleaned_tweet'], 2, 5, 'test'),
)

Completing tfidf+ngram generation for train
Completing tfidf+ngram generation for test
Completing tfidf+ngram generation for train
Completing tfidf+ngram generation for test


In [9]:
# Sentiment scores of the cleaned tweets, as pandas dataframes
sentiment_train, sentiment_test = (
    sentimentAnalyzer(train_data['cleaned_tweet'], 'train'),
    sentimentAnalyzer(test_data['cleaned_tweet'], 'test'),
)

Completing the sentiment analysis for train
Completing the sentiment analysis for test


In [10]:
# Linguistic feature extraction (uses the 'tweet' column, not 'cleaned_tweet')
linguistic_train, linguistic_test = (
    linguisticFeatures(train_data['tweet'], 'train'),
    linguisticFeatures(test_data['tweet'], 'test'),
)

Completing the liguistic feature extraction for train
Completing the liguistic feature extraction for test


In [11]:
# Textstat readability scores (uses the 'tweet' column, not 'cleaned_tweet')
textstat_train, textstat_test = (
    textstatScores(train_data['tweet'], 'train'),
    textstatScores(test_data['tweet'], 'test'),
)

Completing the text stat scores extraction for train
Completing the text stat scores extraction for test


In [12]:
# Combine every training feature frame, plus the labels, column-wise into one dataframe.
# NOTE(review): several of these frames can carry identical column names (the
# vectorizers name columns after their n-grams), so the result may contain
# duplicate column labels.
print("Concatenating in process")
train_feature_frames = [
    ngram_train,
    char_ngram_train,
    tfidf_train,
    char_tfidf_train,
    sentiment_train,
    linguistic_train,
    textstat_train,
    train_labels,
]  # took out vecs_train
training_data = pd.concat(train_feature_frames, axis=1)
print("Did it!")

# Test-set counterpart, currently disabled:
#testing_data = pd.concat([vecs_test, ngram_test, tfidf_test, sentiment_test, test_labels], axis=1) 
#export_csv2 = testing_data.to_csv('english_test_data.csv', index = None, header=True, encoding='utf-8')


Concatenating in process
Did it!


In [13]:
def getData():
    """Return the module-level `training_data` frame built by the concatenation cell."""
    combined = training_data
    return combined