In [1]:
import h2o
from h2o.estimators.word2vec import H2OWord2vecEstimator
import csv
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer # put up top
#nltk.download('vader_lexicon') -- if first time running uncomment and run this
import pandas as pd
STOP_WORDS = set(stopwords.words('english')) 
new_stopwords = ['rt']
STOP_WORDS = STOP_WORDS.union(new_stopwords)

In [2]:
# create an h2o instance
h2o.init()
#nltk.download('stopwords')  # might need if running nltk + stopwords for the first time

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,1 day 0 hours 1 min
H2O cluster timezone:,Europe/Vienna
H2O data parsing timezone:,UTC
H2O cluster version:,3.24.0.5
H2O cluster version age:,20 days
H2O cluster name:,H2O_from_python_mackenzie_43roa0
H2O cluster total nodes:,1
H2O cluster free memory:,1.858 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [3]:
# prepare data for h2o use
# check if paths are correct
filepath_train = "/home/mackenzie/workspace/PycharmProjects/DAADRISE_AbusiveLangProject/featureExtraction/EnglishCleanedTrainingData (1).csv"
filepath_test = "/home/mackenzie/workspace/PycharmProjects/DAADRISE_AbusiveLangProject/featureExtraction/EnglishCleanedTestingData (1).csv"
train_data = h2o.upload_file(filepath_train) 
test_data = h2o.upload_file(filepath_test)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [4]:
# functions for H2o Word2Vec
def tokenize(sentences, stop_word = STOP_WORDS):
    tokenized = sentences.tokenize("\\W+")
    tokenized_lower = tokenized.tolower()
    tokenized_filtered = tokenized_lower[(tokenized_lower.nchar() >= 2) | (tokenized_lower.isna()),:]
    tokenized_words = tokenized_filtered[tokenized_filtered.grep("[0-9]",invert=True,output_logical=True),:]
    tokenized_words = tokenized_words[(tokenized_words.isna()) | (~ tokenized_words.isin(STOP_WORDS)),:]
    return tokenized_words

def h2o_w2vec(data, str):
    print("Break " + str + " into sequence of words")
    words = tokenize(data)
    print("Build word2vec model for " + str)
    w2v_model = H2OWord2vecEstimator(sent_sample_rate=0.0, epochs=10)
    w2v_model.train(training_frame=words)
    vecs = w2v_model.transform(words, aggregate_method="AVERAGE")
    return vecs

In [13]:
# Word2Vec generation resulting in pandas data frames
vecs_train = (h2o_w2vec(train_data['tweet'], 'train')).as_data_frame()
train_labels = train_data["labels"].as_data_frame()
vecs_test = (h2o_w2vec(test_data['tweet'], 'test')).as_data_frame()
test_labels = test_data["labels"].as_data_frame()

Break train into sequence of words


H2OResponseError: Server error java.lang.IllegalArgumentException:
  Error: tokenize() requires all input columns to be of a String type. Received Enum. Please convert column to a string column first.
  Request: POST /99/Rapids
    data: {'ast': "(tmp= py_195_sid_9388 (>= (strlen (tolower (tokenize (cols_py Key_Frame__upload_b7d1c773fc1eebdeb85706106a364ff8.hex 'tweet') '\\\\W+'))) 2))", 'session_id': '_sid_9388'}


In [18]:
def doc_generator(filepath, textcol=0, skipheader=True): # might want to make false?
    with open(filepath) as f:
        reader = csv.reader(f)
        if skipheader:
            next(reader, None)
        for row in reader:
            yield row[textcol]
            
def ngrams(min_n, max_n, str, filepath, col):
    vectorizer = CountVectorizer(ngram_range=(min_n, max_n), stop_words=set(STOP_WORDS))
    print("Completing ngram generation for " + str)
    X = vectorizer.fit_transform(doc_generator(filepath, textcol=col)) # for our purposes col=0
    #print("Testing ngram generation for " + str)
    #print(vectorizer.get_feature_names())
    #print("Testing ngram vectors" + str)
    #print(X.toarray())
    X_arr = X.toarray()
    X_pd = pd.DataFrame(X_arr)
    return X_pd

In [19]:
# NGRAMS generation + Frequency calculation -- NOTE no header column

ngram_train_freq = ngrams(1, 1, 'train', filepath_train, 0) # unigram for now
ngram_test_freq = ngrams(1, 3, 'test', filepath_test, 0)

Completing ngram generation for train
   0     1     2     3     4     5     6     7     8     9     ...  7534  \
0     0     0     0     0     0     0     0     0     0     0  ...     0   
1     0     0     0     0     0     0     0     0     0     0  ...     0   
2     0     0     0     0     0     0     0     0     0     0  ...     0   
3     0     0     0     0     0     0     0     0     0     0  ...     0   
4     0     0     0     0     0     0     0     0     0     0  ...     0   

   7535  7536  7537  7538  7539  7540  7541  7542  7543  
0     0     0     0     0     0     0     0     0     0  
1     0     0     0     0     0     0     0     0     0  
2     0     0     0     0     0     0     0     0     0  
3     0     0     0     0     0     0     0     0     0  
4     0     0     0     0     0     0     0     0     0  

[5 rows x 7544 columns]
Completing ngram generation for test
   0      1      2      3      4      5      6      7      8      9      ...  \
0      0      0

In [16]:
# TFIDF function
def tfidf(min_n, max_n, str, filepath, col):
    vectorizer = TfidfVectorizer(ngram_range=(min_n, max_n))
    print("Completing tfidf+ngram generation for " + str)
    X = vectorizer.fit_transform(doc_generator(filepath, textcol=col))
    # Testing the TFIDF value + ngrams feature names:
    #print(X.toarray()) 
    #print(vectorizer.get_feature_names())
    X_arr = X.toarray()
    X_pd = pd.DataFrame(X_arr)
    return X_pd

In [17]:
# TFIDF Generation -- NOTE no header column

tfidf_train = tfidf(1, 1, 'train', filepath_train, 0) # unigram for now
tfidf_test = tfidf(1, 3, 'test', filepath_test, 0)

Completing tfidf+ngram generation for train
Completing tfidf+ngram generation for test


In [6]:
def sentimentAnalyzer(str, data):
    sid = SentimentIntensityAnalyzer()
    data_pd = data.as_data_frame()
    sentiment_val = pd.DataFrame(columns = ['sentiment'])
    print("Completing the sentiment analysis for " + str)
    for i in range(0, len(data_pd)-1):
        tweet = data_pd['tweet'].values[i]
        ss = sid.polarity_scores(tweet)
        sentiment_val.at[i, 'sentiment'] = ss.get('compound')
    return sentiment_val

In [7]:
# Sentiment Analysis into pandas dataframes

sentiment_train = sentimentAnalyzer('train', train_data) 
sentiment_test = sentimentAnalyzer('test', test_data)

Completing the sentiment analysis for train
Completing the sentiment analysis for test


In [None]:
# combine all features into one pandas dataframe 


# convert from pandas --> h2o frame
#train_data_h2o = h2o.H2OFrame(pandasTrain_dataframe)
#test_data_h2o = h2o.H2OFrame(pandasTest_dataframe)