In [1]:
import h2o
from h2o.estimators.word2vec import H2OWord2vecEstimator
import csv
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#nltk.download('vader_lexicon') -- if first time running uncomment and run this
import pandas as pd
# German stop words from NLTK plus extra corpus-specific tokens to discard.
# 'lbr' is presumably a line-break placeholder left by the cleaning step -- TODO confirm.
new_stopwords = ['lbr']
STOP_WORDS = set(stopwords.words('german')) | set(new_stopwords)

In [2]:
# All Feature Extraction Functions

# functions for H2o Word2Vec
def tokenizeFunc(sentences, stop_word = STOP_WORDS):
    """Tokenize an H2O text column and drop short, numeric and stop-word tokens.

    Args:
        sentences: H2OFrame holding the text to split. It is round-tripped
            through pandas and cast to ``str`` so that it can be re-created
            as an H2O "string" column.
        stop_word: Collection of lower-case tokens to remove. Defaults to the
            module-level ``STOP_WORDS``.

    Returns:
        H2OFrame of lower-cased tokens. NA rows are kept by every filter
        below (H2O's ``tokenize`` is understood to use them as separators
        between the original rows -- see the H2O documentation).
    """
    df = sentences.as_data_frame().astype(str)
    sentence = h2o.H2OFrame(python_obj=df, column_types=["string"])
    # Split on runs of non-word characters, then normalise case.
    tokenized = sentence.tokenize("\\W+")
    tokenized_lower = tokenized.tolower()
    # Keep tokens of at least two characters (and NA rows).
    tokenized_filtered = tokenized_lower[(tokenized_lower.nchar() >= 2) | (tokenized_lower.isna()),:]
    # Drop every token that contains a digit.
    tokenized_words = tokenized_filtered[tokenized_filtered.grep("[0-9]",invert=True,output_logical=True),:]
    # Drop stop words. Uses the ``stop_word`` argument, which was previously
    # ignored in favour of the global STOP_WORDS.
    tokenized_words = tokenized_words[(tokenized_words.isna()) | (~ tokenized_words.isin(stop_word)),:]
    return tokenized_words

def h2o_w2vec(data, str):
    """Train a fresh H2O Word2Vec model on ``data`` and return its aggregated vectors.

    Args:
        data: H2OFrame text column; passed to ``tokenizeFunc`` for tokenization.
        str: Label used only in the progress messages (e.g. 'train'). The name
            shadows the ``str`` builtin inside this function; it is kept
            because it is part of the existing signature.

    Returns:
        The frame produced by ``transform`` on the tokens with
        ``aggregate_method="AVERAGE"``.
    """
    print("Break " + str + " into sequence of words")
    tokens = tokenizeFunc(data)

    print("Build word2vec model for " + str)
    model = H2OWord2vecEstimator(sent_sample_rate=0.0, epochs=10)
    model.train(training_frame=tokens)

    # The same token frame is used for both training and transforming.
    return model.transform(tokens, aggregate_method="AVERAGE")

# functions for filtering through rows
def doc_generator(filepath, textcol=0, skipheader=True, encoding=None):
    """Lazily yield one field from every record of a CSV file.

    Args:
        filepath: Path of the CSV file to read.
        textcol: Zero-based index of the field to yield from each record.
        skipheader: When True, the first record is discarded as a header.
        encoding: Text encoding passed to ``open``. ``None`` keeps the
            platform default, which is what the function used before this
            parameter existed.

    Yields:
        str: The ``textcol`` field of each remaining record, in file order.

    Raises:
        IndexError: If a record has fewer than ``textcol + 1`` fields
            (including blank lines, which ``csv.reader`` returns as ``[]``).
    """
    # newline='' lets the csv module do its own line-ending handling, so line
    # breaks embedded in quoted fields are read back unchanged.
    with open(filepath, newline='', encoding=encoding) as f:
        reader = csv.reader(f)
        if skipheader:
            next(reader, None)
        for row in reader:
            yield row[textcol]

# NGrams Freq function
def ngrams(min_n, max_n, str, filepath, col):
    """Build a dense n-gram count matrix from one column of a CSV file.

    Note: this function shadows ``ngrams`` imported from ``nltk.util`` at the
    top of the notebook.

    Args:
        min_n: Smallest n-gram size.
        max_n: Largest n-gram size.
        str: Label used only in the progress message (e.g. 'train'). Shadows
            the ``str`` builtin inside this function; kept for compatibility.
        filepath: CSV file streamed through ``doc_generator``.
        col: Zero-based index of the text column in that file.

    Returns:
        pandas.DataFrame of counts with default integer column labels, one
        row per document yielded by ``doc_generator``. At most 10000 n-gram
        columns are kept.
    """
    vectorizer = CountVectorizer(
        ngram_range=(min_n, max_n),
        # scikit-learn accepts 'english', a list or None here; a set is
        # rejected by recent versions. Sorting keeps the list deterministic.
        stop_words=sorted(STOP_WORDS),
        max_features=10000,
    )
    print("Completing ngram generation for " + str)
    X = vectorizer.fit_transform(doc_generator(filepath, textcol=col))
    return pd.DataFrame(X.toarray())

# TFIDF function
def tfidf(min_n, max_n, str, filepath, col):
    """Build a dense TF-IDF matrix over n-grams from one column of a CSV file.

    Args:
        min_n: Smallest n-gram size.
        max_n: Largest n-gram size.
        str: Label used only in the progress message (e.g. 'train'). Shadows
            the ``str`` builtin inside this function.
        filepath: CSV file streamed through ``doc_generator``.
        col: Zero-based index of the text column in that file.

    Returns:
        pandas.DataFrame of TF-IDF weights with default integer column
        labels. At most 10000 feature columns are kept. No stop-word list is
        passed to the vectorizer here.
    """
    tfidf_model = TfidfVectorizer(max_features=10000, ngram_range=(min_n, max_n))
    print("Completing tfidf+ngram generation for " + str)
    # The documents are streamed lazily; the file is only read during fitting.
    documents = doc_generator(filepath, textcol=col)
    weights = tfidf_model.fit_transform(documents)
    return pd.DataFrame(weights.toarray())

# Sentiment Analysis function
def sentimentAnalyzer(str, data):
    """Score every tweet with NLTK's VADER and return the compound scores.

    Args:
        str: Label used only in the progress message (e.g. 'train'). Shadows
            the ``str`` builtin inside this function; kept for compatibility.
        data: Object whose ``as_data_frame()`` returns a pandas DataFrame with
            a 'cleaned_tweet' column (an H2OFrame in this notebook).

    Returns:
        pandas.DataFrame with a single 'sentiment' column holding the VADER
        'compound' score of each tweet, one row per input row (default
        0..n-1 index).

    NOTE(review): VADER ships an English lexicon; confirm that its scores are
    meaningful for the German tweets used here.
    """
    sid = SentimentIntensityAnalyzer()
    data_pd = data.as_data_frame()
    print("Completing the sentiment analysis for " + str)

    scores = []
    # Iterate over *all* rows. The previous range(0, len - 1) silently dropped
    # the last tweet, leaving this frame one row shorter than the others.
    for tweet in data_pd['cleaned_tweet']:
        # Missing tweets are scored as empty text so the row is still emitted
        # and stays aligned with the other feature frames.
        text = "" if pd.isna(tweet) else "{}".format(tweet)
        scores.append(sid.polarity_scores(text).get('compound'))

    # Build the frame once instead of growing it cell by cell.
    return pd.DataFrame({'sentiment': scores})

In [None]:
# Initialise the H2O connection used by the upload and Word2Vec steps below.
h2o.init()
# First run only: uncomment to download the NLTK stop-word corpus.
# NOTE: stopwords.words('german') is already called in the first cell, so on a
# fresh machine this download has to happen before that cell is executed.
#nltk.download('stopwords')  # might need if running nltk + stopwords for the first time

In [None]:
# Locations of the cleaned German train/test CSV files.
# NOTE(review): these are absolute, user-specific paths -- adjust them to your
# machine before running.
filepath_train = "/home/mackenzie/workspace/PycharmProjects/DAADRISE_AbusiveLangProject/featureExtraction/GermanCleanedTrainingData (1).csv"
filepath_test = "/home/mackenzie/workspace/PycharmProjects/DAADRISE_AbusiveLangProject/featureExtraction/GermanCleanedTestingData (1).csv"

# Upload both files to H2O (train first, then test).
train_data, test_data = [
    h2o.upload_file(csv_path) for csv_path in (filepath_train, filepath_test)
]

In [None]:
# Word2Vec features as pandas DataFrames, plus the label column of each split.
# NOTE(review): h2o_w2vec trains a new model on every call, so the train and
# test vectors come from two independently trained embeddings.
w2v_train = h2o_w2vec(train_data['cleaned_tweet'], 'train')
vecs_train = w2v_train.as_data_frame()
train_labels = train_data["labels"].as_data_frame()

w2v_test = h2o_w2vec(test_data['cleaned_tweet'], 'test')
vecs_test = w2v_test.as_data_frame()
test_labels = test_data["labels"].as_data_frame()

In [None]:
# N-gram count features (uni- to tri-grams) for each split. Column 0 of the CSV
# is used as the text; doc_generator skips the first row as a header by default.
# NOTE(review): every call fits its own CountVectorizer, so column i of the
# train frame and column i of the test frame need not be the same n-gram.
ngram_train_freq, ngram_test_freq = [
    ngrams(1, 3, split, path, 0)
    for split, path in (('train', filepath_train), ('test', filepath_test))
]

In [None]:
# TF-IDF features over uni- to tri-grams for each split. Column 0 of the CSV is
# used as the text; doc_generator skips the first row as a header by default.
# NOTE(review): every call fits its own TfidfVectorizer, so the train and test
# columns are not guaranteed to refer to the same n-grams.
tfidf_train, tfidf_test = [
    tfidf(1, 3, split, path, 0)
    for split, path in (('train', filepath_train), ('test', filepath_test))
]

In [None]:
# VADER compound sentiment score per tweet, as pandas DataFrames.
train_tweets = train_data['cleaned_tweet']
sentiment_train = sentimentAnalyzer('train', train_tweets)

test_tweets = test_data['cleaned_tweet']
sentiment_test = sentimentAnalyzer('test', test_tweets)

In [None]:
# Combine all feature blocks side by side (one row per tweet) and export them.
# axis=1 is required: the default axis=0 would stack the frames on top of each
# other, producing a mostly-NaN table with several rows per tweet. Rows are
# matched on the frames' index.
# The n-gram and tf-idf frames both use integer column labels (0, 1, ...), so
# they are prefixed to keep every column name in the CSV unique.
training_data = pd.concat(
    [
        vecs_train,
        ngram_train_freq.add_prefix('ngram_'),
        tfidf_train.add_prefix('tfidf_'),
        sentiment_train,
        train_labels,
    ],
    axis=1,
)
training_data.to_csv('german_train_data.csv', index=False, header=True, encoding='utf-8')

testing_data = pd.concat(
    [
        vecs_test,
        ngram_test_freq.add_prefix('ngram_'),
        tfidf_test.add_prefix('tfidf_'),
        sentiment_test,
        test_labels,
    ],
    axis=1,
)
testing_data.to_csv('german_test_data.csv', index=False, header=True, encoding='utf-8')