In [None]:
#import autosklearn.classification
#import sklearn.model_selection
#import sklearn.datasets
#import sklearn.metrics
import os
import re

import h2o
import nltk
import pandas as pd
from h2o.estimators.word2vec import H2OWord2vecEstimator
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
# Extra tokens filtered out in addition to NLTK's English stop-word list.
new_stopwords = ['rt', 'co', 'http', 'u', 'got', 'get']
# Open question from the author: can this be dropped now that the data has been preprocessed?
STOP_WORDS = set(stopwords.words('english')) | set(new_stopwords)

In [None]:
# used for H2o Word 2 Vec
def tokenize(sentences, stop_word = STOP_WORDS):
    """Split an H2O string column into a filtered, lowercased column of words.

    Args:
        sentences: single-column H2OFrame of text (here, the "tweet" column).
        stop_word: iterable of words to drop. Defaults to the module-level
            STOP_WORDS. An empty iterable disables stop-word filtering.

    Returns:
        H2OFrame of tokens with short tokens, tokens containing digits and
        stop words removed. NA rows are kept by the length and stop-word
        filters -- presumably they delimit the original sentences for
        word2vec; confirm against the H2O tokenize documentation.
    """
    tokenized = sentences.tokenize("\\W+")
    tokenized_lower = tokenized.tolower()
    # Keep tokens of at least two characters (and the NA rows).
    tokenized_filtered = tokenized_lower[(tokenized_lower.nchar() >= 2) | (tokenized_lower.isna()),:]
    # Drop every token that contains a digit.
    tokenized_words = tokenized_filtered[tokenized_filtered.grep("[0-9]",invert=True,output_logical=True),:]
    # Honour the stop_word argument (the global used to be hard-coded here) and
    # hand H2O a plain list so any iterable -- set, tuple, list -- is accepted.
    stop_list = list(stop_word)
    if stop_list:
        tokenized_words = tokenized_words[(tokenized_words.isna()) | (~ tokenized_words.isin(stop_list)),:]
    return tokenized_words

In [None]:
# create an h2o instance
h2o.init()
#nltk.download('stopwords')  # might need if running nltk + stopwords for the first time

# Directory holding the cleaned CSVs. Set the TWEET_DATA_DIR environment
# variable to run on another machine; the default is the original location.
DATA_DIR = os.environ.get("TWEET_DATA_DIR", "/home/mackenzie/Downloads")
train_data = h2o.upload_file(os.path.join(DATA_DIR, "EnglishCleanedTrainingData.csv"))
test_data = h2o.upload_file(os.path.join(DATA_DIR, "EnglishCleanedTestingData.csv"))

In [None]:
# Word 2 Vec process, could turn into a function or nah?
# The embedding is learned from the training tweets only and then applied to
# both splits. Training a second model on the test tweets would give vectors in
# a different, unrelated space, so a classifier fitted on the train vectors
# could not be applied to them.
print("Break train tweets into sequence of words")
train_words = tokenize(train_data["tweet"])

print("Break test tweets into sequence of words")
test_words = tokenize(test_data["tweet"])

print("Build word2vec model for train")
w2v_model_one = H2OWord2vecEstimator(sent_sample_rate=0.0, epochs=10)
w2v_model_one.train(training_frame=train_words)

# Kept as an alias so any later cell that refers to w2v_model_two still works;
# it is the same train-fitted model, not a separately trained one.
w2v_model_two = w2v_model_one

print("Calculate a vector for each train tweet")
tweet_vecs_train = w2v_model_one.transform(train_words, aggregate_method="AVERAGE")

print("Calculate a vector for each test tweet")
tweet_vecs_test = w2v_model_one.transform(test_words, aggregate_method="AVERAGE")

In [None]:
# Convert the H2O frames (tweet vectors and label columns) to pandas so they
# can be handed to autosklearn later.
train_tweets_pd, train_labels_pd, test_tweets_pd, test_labels_pd = (
    h2o.as_list(h2o_frame, use_pandas=True)
    for h2o_frame in (
        tweet_vecs_train,
        train_data["labels"],
        tweet_vecs_test,
        test_data["labels"],
    )
)

In [None]:
# ngrams function
def time4ngrams(s, n):
    """Return the word n-grams of a string as a list of tuples.

    Args:
        s: text to process (a str).
        n: number of consecutive words per n-gram.

    Returns:
        List of n-grams produced by nltk.util.ngrams over the cleaned tokens.
        The larger n is, the fewer n-grams come out of the same text.
    """
    # Lowercase first so that n-grams are case-insensitive.
    s = s.lower()
    # Replace punctuation with spaces; letters, digits and whitespace are kept.
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    # split() with no argument splits on any whitespace run. The regex above
    # keeps tabs and newlines, which split(" ") would leave inside tokens.
    tokens = s.split()
    output = list(ngrams(tokens, n))
    return output

In [None]:
# running ngrams for training and testing data
n = 3 # if we want multiple ngrams then will need to add a loop


def _ngram_tweets(tweets_pd, n):
    """Return a one-column pandas DataFrame ("tweet") of n-gram lists.

    Args:
        tweets_pd: pandas DataFrame with a "tweet" column of raw text.
        n: n-gram size passed through to time4ngrams.

    Missing tweets are treated as empty strings so they produce an empty
    n-gram list instead of failing on .lower(), and row order is preserved.
    """
    cleaned = tweets_pd["tweet"].fillna("").astype(str)
    return pd.DataFrame({"tweet": [time4ngrams(tweet, n) for tweet in cleaned]})


# Pull the tweet column out of H2O into pandas before iterating: wrapping an
# H2OFrame in pd.DataFrame and looping over it does not walk the rows.
tweets_train = h2o.as_list(train_data["tweet"], use_pandas=True)
ngrammed_tweets_train = _ngram_tweets(tweets_train, n)
print(type(ngrammed_tweets_train)) # make sure it's in pandas

tweets_test = h2o.as_list(test_data["tweet"], use_pandas=True)
ngrammed_tweets_test = _ngram_tweets(tweets_test, n)
print(type(ngrammed_tweets_test)) # make sure it's in pandas

In [None]:
# run tfidf on the ngrams
def _ngrams_to_terms(ngram_list):
    """Turn one tweet's list of n-gram tuples into space-joined term strings."""
    return [" ".join(gram) for gram in ngram_list]


# The tweets are already split into n-grams, so the vectorizer gets a callable
# analyzer that maps each pre-computed n-gram list to its terms instead of
# re-tokenizing raw text.
vectorizer = TfidfVectorizer(analyzer=_ngrams_to_terms)

# Fit the vocabulary and IDF weights on the training tweets only.
got_tfidf = vectorizer.fit_transform(ngrammed_tweets_train["tweet"])
# NOTE: toarray() densifies the matrix; with a large n-gram vocabulary this can
# use a lot of memory.
tfidf = pd.DataFrame(got_tfidf.toarray())
# get_feature_names() was removed in newer scikit-learn releases; prefer the
# replacement when it exists and fall back for older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    tfidf.columns = vectorizer.get_feature_names_out()
else:
    tfidf.columns = vectorizer.get_feature_names()

# Project the test tweets onto the vocabulary learned from train so both
# feature matrices share the same columns.
got_tfidf_test = vectorizer.transform(ngrammed_tweets_test["tweet"])
tfidf_test = pd.DataFrame(got_tfidf_test.toarray(), columns=tfidf.columns)

In [None]:
# TODO: combine all feature columns into the final train features and test features

# TODO: once this runs with no errors, do the same for the German autosklearn version, and then for h2o