In [1]:
import csv
import os

import h2o
import nltk
from h2o.estimators.word2vec import H2OWord2vecEstimator
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Stop words shared by the tokenizer and the n-gram vectorizer:
# NLTK's English list plus the project-specific extras in new_stopwords.
new_stopwords = ['rt']
STOP_WORDS = set(stopwords.words('english')).union(new_stopwords)

In [2]:
# Initialise the H2O client; everything below that touches H2OFrames needs this.
h2o.init()
# NOTE: the first cell already calls stopwords.words('english'), so on a machine
# without the corpus this download has to run *before* that cell, not here.
#nltk.download('stopwords')  # might need if running nltk + stopwords for the first time

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,5 hours 54 mins
H2O cluster timezone:,Europe/Vienna
H2O data parsing timezone:,UTC
H2O cluster version:,3.24.0.5
H2O cluster version age:,19 days
H2O cluster name:,H2O_from_python_mackenzie_j6zlxj
H2O cluster total nodes:,1
H2O cluster free memory:,1.638 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [3]:
# Prepare data for H2O use.
# The data directory defaults to the original author's checkout; set the
# ABUSIVE_LANG_DATA_DIR environment variable to run the notebook elsewhere.
DATA_DIR = os.environ.get(
    "ABUSIVE_LANG_DATA_DIR",
    "/home/mackenzie/workspace/PycharmProjects/DAADRISE_AbusiveLangProject/featureExtraction",
)
filepath_train = os.path.join(DATA_DIR, "EnglishCleanedTrainingData (1).csv")
filepath_test = os.path.join(DATA_DIR, "EnglishCleanedTestingData (1).csv")

# Actually check that the paths are correct, so a wrong location fails here
# with a readable message instead of inside the H2O upload.
for _path in (filepath_train, filepath_test):
    if not os.path.isfile(_path):
        raise FileNotFoundError("Data file not found: " + _path)

# Upload both CSVs to the H2O cluster; the results are H2OFrames.
train_data = h2o.upload_file(filepath_train)
test_data = h2o.upload_file(filepath_test)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [4]:
# functions for H2o Word2Vec
def tokenize(sentences, stop_word = STOP_WORDS):
    """Split text into a cleaned, lower-cased sequence of word tokens.

    Args:
        sentences: H2O string column to tokenize (in this notebook, the
            'tweet' column of the uploaded frame).
        stop_word: collection of words to drop. Defaults to the STOP_WORDS
            object that exists when this function is defined, so re-run this
            cell after changing STOP_WORDS.

    Returns:
        H2OFrame of tokens with short tokens, tokens containing digits and
        stop words removed. Rows that are NA are kept by every filter
        (presumably they delimit the original documents -- confirm against
        the H2O tokenize documentation).
    """
    # Split on runs of non-word characters and normalise case.
    tokens = sentences.tokenize("\\W+").tolower()

    # Keep tokens of at least two characters; the isna() term preserves NA rows.
    tokens = tokens[(tokens.nchar() >= 2) | (tokens.isna()),:]

    # Drop every token that contains a digit.
    tokens = tokens[tokens.grep("[0-9]",invert=True,output_logical=True),:]

    # Honour the caller-supplied stop-word collection; the previous version
    # ignored the `stop_word` argument and always read the global STOP_WORDS.
    tokens = tokens[(tokens.isna()) | (~ tokens.isin(stop_word)),:]
    return tokens

def h2o_w2vec(data, str):
    """Train a word2vec model on ``data`` and return its averaged vectors.

    Args:
        data: text column handed straight to ``tokenize`` (in this notebook,
            the 'tweet' column of an uploaded H2OFrame).
        str: label used only in the progress messages, e.g. 'train'.
            The name shadows the builtin ``str`` inside this function.

    Returns:
        The result of transforming the tokenized input with the freshly
        trained model using ``aggregate_method="AVERAGE"``.
    """
    print("Break " + str + " into sequence of words")
    token_frame = tokenize(data)

    print("Build word2vec model for " + str)
    # A new model is created and trained on every call.
    model = H2OWord2vecEstimator(sent_sample_rate=0.0, epochs=10)
    model.train(training_frame=token_frame)

    return model.transform(token_frame, aggregate_method="AVERAGE")

In [5]:
# Word2Vec generation
# NOTE(review): h2o_w2vec creates and trains a new model on each call, so the
# train and test vectors below come from two independently trained models.
# Confirm this is intended before using both with the same classifier.
vecs_train = h2o_w2vec(train_data['tweet'], 'train')
train_labels = train_data["labels"]
vecs_test = h2o_w2vec(test_data['tweet'], 'test')
test_labels = test_data["labels"]


Break train into sequence of words
Build word2vec model for train
word2vec Model Build progress: |██████████████████████████████████████████| 100%
Break test into sequence of words
Build word2vec model for test
word2vec Model Build progress: |██████████████████████████████████████████| 100%


In [19]:
def doc_generator(filepath, textcol=0, skipheader=True, encoding=None):
    """Lazily yield one column of a CSV file, one value per record.

    Args:
        filepath: path of the CSV file to read.
        textcol: zero-based index of the column to yield.
        skipheader: when True, the first record is treated as a header and
            is not yielded.
        encoding: text encoding passed to ``open``; ``None`` keeps the
            platform default.

    Yields:
        str: the ``textcol`` field of each non-blank record.

    Raises:
        IndexError: if a non-blank record has no column ``textcol``.
    """
    # newline='' is what the csv module requires: without it the file object
    # translates line endings first and quoted fields that contain "\r\n"
    # come back altered.
    with open(filepath, newline='', encoding=encoding) as f:
        reader = csv.reader(f)
        if skipheader:
            next(reader, None)
        for row in reader:
            # csv.reader returns [] for a blank line; skip it instead of
            # failing with IndexError on row[textcol].
            if not row:
                continue
            yield row[textcol]
            
def ngrams(min_n, max_n, str, filepath, col, dense=True):
    """Count word n-grams for one text column of a CSV file.

    NOTE: this definition replaces the ``ngrams`` name imported from
    ``nltk.util`` at the top of the notebook.

    Args:
        min_n: smallest n-gram length.
        max_n: largest n-gram length.
        str: label used in the progress message, e.g. 'train'.
        filepath: CSV file to read.
        col: zero-based index of the text column (0 in this notebook).
        dense: when True (default) return a dense array as before. Pass
            False to get the sparse matrix from ``fit_transform`` unchanged,
            which avoids the memory error seen with large vocabularies.

    Returns:
        Document-by-n-gram count matrix: dense array if ``dense`` is True,
        otherwise the sparse matrix returned by the vectorizer.
    """
    # scikit-learn documents stop_words as 'english', a list or None, and
    # recent releases reject other types, so pass a list rather than a set.
    vectorizer = CountVectorizer(ngram_range=(min_n, max_n), stop_words=sorted(STOP_WORDS))
    print("Completing ngram generation for " + str)
    # The vocabulary is fitted on this file alone.
    X = vectorizer.fit_transform(doc_generator(filepath, textcol=col))
    return X.toarray() if dense else X

In [23]:
# N-gram generation + frequency calculation -- getting a memory error when converting to pandas
# NOTE(review): train uses 5-grams while the commented-out test call uses 2-grams,
# and ngrams() fits a new vocabulary on every call, so the two matrices would
# not share columns. Confirm before enabling the test line.

ngram_train_freq = ngrams(5, 5, 'train', filepath_train, 0)
#ngram_test_freq = ngrams(2, 2, 'test', filepath_test, 0)

Completing ngram generation for train
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [16]:
# TFIDF function
def tfidf(min_n, max_n, str, filepath, col, dense=True):
    """Compute TF-IDF weighted word n-gram features for one CSV text column.

    Args:
        min_n: smallest n-gram length.
        max_n: largest n-gram length.
        str: dataset label, e.g. 'train'. Currently unused; kept so the
            signature matches ``ngrams``.
        filepath: CSV file to read.
        col: zero-based index of the text column (0 in this notebook).
        dense: when True (default) return a dense array as before. Pass
            False to get the sparse matrix from ``fit_transform`` unchanged,
            which avoids the memory error seen with large vocabularies.

    Returns:
        Document-by-n-gram TF-IDF matrix: dense array if ``dense`` is True,
        otherwise the sparse matrix returned by the vectorizer.
    """
    # Unlike ngrams(), no stop-word list is passed to the vectorizer here.
    vectorizer = TfidfVectorizer(ngram_range=(min_n, max_n))
    # The vocabulary and IDF weights are fitted on this file alone.
    X = vectorizer.fit_transform(doc_generator(filepath, textcol=col))
    return X.toarray() if dense else X

In [17]:
# TF-IDF generation -- getting a memory error when converting to pandas
# NOTE(review): train uses 5-grams while the commented-out test call uses 2-grams,
# and tfidf() fits a new vectorizer on every call, so the two matrices would
# not share columns. Confirm before enabling the test line.

tfidf_train = tfidf(5, 5, 'train', filepath_train, 0)
#tfidf_test = tfidf(2, 2, 'test', filepath_test, 0)