In [1]:
import h2o
from h2o.estimators.word2vec import H2OWord2vecEstimator
import csv
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer # put up top
#nltk.download('vader_lexicon') -- if first time running uncomment and run this
import pandas as pd
# Extra tokens to drop in addition to NLTK's stop-word list ('rt' = retweet marker).
# NOTE(review): the original note said "when german data take out 'lbr'" --
# presumably 'lbr' should be listed here for the German corpus; confirm.
new_stopwords = ['rt']
# Language of the stop-word list; switch 'english' to 'german' for the German data.
STOP_WORDS = set(stopwords.words('english')).union(new_stopwords)

In [2]:
# All Feature Extraction Functions

# functions for H2o Word2Vec
def tokenizeFunc(sentences, stop_word = STOP_WORDS):
    """Tokenize a single-column H2OFrame of text into a filtered column of words.

    Parameters
    ----------
    sentences : H2OFrame
        Frame holding the raw text; it is round-tripped through pandas so every
        value can be forced to a string before tokenizing.
    stop_word : iterable of str, optional
        Words to remove from the token stream. Defaults to the module-level
        ``STOP_WORDS``. An empty collection disables stop-word filtering.

    Returns
    -------
    H2OFrame
        One token per row, lower-cased, with tokens shorter than two
        characters, tokens containing a digit, and stop words removed.
        NA rows are kept by every filter.
    """
    df = sentences.as_data_frame()
    df = df.astype(str)
    sentence = h2o.H2OFrame(python_obj=df, column_types=["string"])
    tokenized = sentence.tokenize("\\W+")
    tokenized_lower = tokenized.tolower()
    # Keep NA rows in every filter below.
    # NOTE(review): presumably they act as sentence delimiters for word2vec -- confirm against the H2O docs.
    tokenized_filtered = tokenized_lower[(tokenized_lower.nchar() >= 2) | (tokenized_lower.isna()),:]
    tokenized_words = tokenized_filtered[tokenized_filtered.grep("[0-9]",invert=True,output_logical=True),:]
    # Honour the caller-supplied stop words (the argument used to be ignored in
    # favour of the global) and hand H2O a plain list rather than a set.
    stop_list = list(stop_word) if stop_word is not None else []
    if stop_list:
        tokenized_words = tokenized_words[(tokenized_words.isna()) | (~ tokenized_words.isin(stop_list)),:]
    return tokenized_words

def h2o_w2vec(data, str):
    """Train a word2vec model on ``data`` and return one averaged vector per document.

    Parameters
    ----------
    data : H2OFrame
        Single text column; it is tokenized with ``tokenizeFunc``.
    str : str
        Label used only in the progress messages (e.g. 'train' / 'test').
        The name shadows the builtin but is kept for existing callers.

    Returns
    -------
    H2OFrame
        Result of ``transform(..., aggregate_method="AVERAGE")`` on the tokens.
    """
    label = str
    print("Break " + label + " into sequence of words")
    tokens = tokenizeFunc(data)

    print("Build word2vec model for " + label)
    model = H2OWord2vecEstimator(sent_sample_rate=0.0, epochs=10)
    model.train(training_frame=tokens)

    return model.transform(tokens, aggregate_method="AVERAGE")

# helper that streams one column of a CSV file, row by row
def doc_generator(filepath, textcol=0, skipheader=True, encoding=None): # might want to make false?
    """Lazily yield the value of one column for every row of a CSV file.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.
    textcol : int, optional
        Zero-based index of the column to yield (default 0).
    skipheader : bool, optional
        When true (default) the first row is discarded as a header.
    encoding : str or None, optional
        Passed to ``open``; ``None`` keeps the platform default.

    Yields
    ------
    str
        The ``textcol`` field of each row.

    Raises
    ------
    IndexError
        If a row has fewer than ``textcol + 1`` fields.
    """
    # newline='' is what the csv module asks for: without it, line breaks
    # embedded in quoted fields are rewritten by universal-newline translation.
    with open(filepath, newline='', encoding=encoding) as f:
        reader = csv.reader(f)
        if skipheader:
            next(reader, None)
        for row in reader:
            yield row[textcol]

# NGrams Freq function
def ngrams(min_n, max_n, str, filepath, col):
    """Build an n-gram count matrix for one text column of a CSV file.

    Note: this definition rebinds the name ``ngrams`` imported from
    ``nltk.util`` at the top of the notebook.

    Parameters
    ----------
    min_n, max_n : int
        Lower and upper bound of the n-gram range passed to ``CountVectorizer``.
    str : str
        Label used only in the progress message (shadows the builtin; kept
        for existing callers).
    filepath : str
        CSV file to read via ``doc_generator``.
    col : int
        Zero-based index of the text column (0 for this project's files).

    Returns
    -------
    pandas.DataFrame
        Dense count matrix, one row per document and at most 10000 columns.
    """
    # scikit-learn documents stop_words as 'english', a list, or None, so pass
    # a list rather than a set.
    vectorizer = CountVectorizer(ngram_range=(min_n, max_n), stop_words=sorted(STOP_WORDS), max_features=10000)
    print("Completing ngram generation for " + str)
    X = vectorizer.fit_transform(doc_generator(filepath, textcol=col)) # for our purposes col=0
    # toarray() densifies the whole matrix, which can be very large.
    ngrams_pd = pd.DataFrame(X.toarray())
    return ngrams_pd

# TFIDF function
def tfidf(min_n, max_n, str, filepath, col):
    """Build a TF-IDF weighted n-gram matrix for one text column of a CSV file.

    Parameters
    ----------
    min_n, max_n : int
        Lower and upper bound of the n-gram range passed to ``TfidfVectorizer``.
    str : str
        Label used only in the progress message (shadows the builtin; kept
        for existing callers).
    filepath : str
        CSV file to read via ``doc_generator``.
    col : int
        Zero-based index of the text column.

    Returns
    -------
    pandas.DataFrame
        Dense TF-IDF matrix, one row per document and at most 10000 columns.
    """
    ngram_bounds = (min_n, max_n)
    tfidf_vectorizer = TfidfVectorizer(ngram_range=ngram_bounds, max_features=10000)
    print("Completing tfidf+ngram generation for " + str)
    documents = doc_generator(filepath, textcol=col)
    weights = tfidf_vectorizer.fit_transform(documents)
    # To inspect the result, print weights.toarray() or the vectorizer's feature names.
    return pd.DataFrame(weights.toarray())

# Sentiment Analysis function
def sentimentAnalyzer(str, data):
    """Score every tweet with VADER and return the compound sentiment values.

    Parameters
    ----------
    str : str
        Label used only in the progress message (shadows the builtin; kept
        for existing callers).
    data : H2OFrame
        Frame whose ``as_data_frame()`` result has a ``'cleaned_tweet'`` column.

    Returns
    -------
    pandas.DataFrame
        Single column ``'sentiment'`` holding the ``'compound'`` score, with
        exactly one row per input row.
    """
    sid = SentimentIntensityAnalyzer()
    data_pd = data.as_data_frame()
    print("Completing the sentiment analysis for " + str)
    # Score *all* rows: the previous range(0, len(data_pd) - 1) skipped the
    # last tweet, leaving the result one row short of the other feature frames.
    # Building the column in one go also avoids growing the frame cell by cell.
    scores = [sid.polarity_scores(tweet).get('compound')
              for tweet in data_pd['cleaned_tweet'].values]
    return pd.DataFrame({'sentiment': scores}, columns=['sentiment'])

In [3]:
# Connect to a running H2O instance or start a local one; the cells below
# that create or read H2OFrames need this to have run first.
h2o.init()
# nltk.download('stopwords')  # uncomment on first use of the NLTK stop-word corpus
# (note: the first cell already reads that corpus, so download it before running that cell)

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "11.0.2" 2019-01-15 LTS; Java(TM) SE Runtime Environment 18.9 (build 11.0.2+9-LTS); Java HotSpot(TM) 64-Bit Server VM 18.9 (build 11.0.2+9-LTS, mixed mode)
  Starting server from /home/mackenzie/anaconda3/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpb7xqwpfe
  JVM stdout: /tmp/tmpb7xqwpfe/h2o_mackenzie_started_from_python.out
  JVM stderr: /tmp/tmpb7xqwpfe/h2o_mackenzie_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,Europe/Vienna
H2O data parsing timezone:,UTC
H2O cluster version:,3.24.0.5
H2O cluster version age:,"21 days, 15 hours and 20 minutes"
H2O cluster name:,H2O_from_python_mackenzie_smvxwt
H2O cluster total nodes:,1
H2O cluster free memory:,1.922 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [4]:
# Locate the cleaned CSV files and upload them to the H2O cluster.
# Adjust DATA_DIR if the project lives somewhere else on your machine.
# NOTE(review): later cells index these frames by 'cleaned_tweet' and 'labels';
# the recorded run failed with "Column cleaned_tweet not found", so confirm the
# CSVs actually carry that header row.
DATA_DIR = "/home/mackenzie/workspace/PycharmProjects/DAADRISE_AbusiveLangProject/featureExtraction"
filepath_train = DATA_DIR + "/EnglishCleanedTrainingData (1).csv"
filepath_test = DATA_DIR + "/EnglishCleanedTestingData (1).csv"
train_data = h2o.upload_file(filepath_train)
test_data = h2o.upload_file(filepath_test)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [5]:
# Word2Vec features and labels, each converted from H2OFrame to a pandas DataFrame.
# Training split first, then the test split.
vecs_train = h2o_w2vec(data=train_data['cleaned_tweet'], str='train').as_data_frame()
train_labels = train_data["labels"].as_data_frame()

vecs_test = h2o_w2vec(data=test_data['cleaned_tweet'], str='test').as_data_frame()
test_labels = test_data["labels"].as_data_frame()

Break train into sequence of words


H2OResponseError: Server error java.lang.IllegalArgumentException:
  Error: Column cleaned_tweet not found
  Request: POST /99/Rapids
    data: {'ast': "(tmp= py_1_sid_a5b1 (cols_py Key_Frame__upload_82faca47600d6d4efd422bb96bc4e88.hex 'cleaned_tweet'))", 'session_id': '_sid_a5b1'}


In [None]:
# N-gram (uni- to tri-gram) frequency features, read from column 0 of each CSV.
# NOTE(review): the original comment says the files have no header column, yet
# doc_generator skips the first row by default -- confirm which is correct.
ngram_train_freq = ngrams(min_n=1, max_n=3, str='train', filepath=filepath_train, col=0)
ngram_test_freq = ngrams(min_n=1, max_n=3, str='test', filepath=filepath_test, col=0)

In [None]:
# TF-IDF weighted uni- to tri-gram features, read from column 0 of each CSV.
# NOTE(review): the original comment says the files have no header column, yet
# doc_generator skips the first row by default -- confirm which is correct.
tfidf_train = tfidf(min_n=1, max_n=3, str='train', filepath=filepath_train, col=0)
tfidf_test = tfidf(min_n=1, max_n=3, str='test', filepath=filepath_test, col=0)

In [None]:
# VADER compound sentiment per tweet, returned as pandas DataFrames.
sentiment_train = sentimentAnalyzer(str='train', data=train_data['cleaned_tweet'])
sentiment_test = sentimentAnalyzer(str='test', data=test_data['cleaned_tweet'])

In [None]:
# TODO: figure out why mem error for when running training data
# NOTE(review): the previous pd.concat call had no axis argument, so it stacked
# the feature frames as rows (padding the non-shared columns with NaN) instead
# of joining them as columns. That may have contributed to the memory error -- confirm.

# Combine all features side by side: one row per tweet, one column per feature.
# The n-gram and tf-idf frames both use integer column names 0..k, so prefix
# them to keep the exported header unambiguous.
training_data = pd.concat(
    [
        vecs_train,
        ngram_train_freq.add_prefix('ngram_'),
        tfidf_train.add_prefix('tfidf_'),
        sentiment_train,
        train_labels,
    ],
    axis=1,
)
# to_csv returns None when given a path; the assignment is kept only so the name still exists.
export_csv = training_data.to_csv('english_train_data.csv', index=False, header=True, encoding='utf-8')

#testing_data = pd.concat([vecs_test, ngram_test_freq.add_prefix('ngram_'), tfidf_test.add_prefix('tfidf_'), sentiment_test, test_labels], axis=1)
#export_csv2 = testing_data.to_csv('english_test_data.csv', index=False, header=True, encoding='utf-8')
