In [1]:
import h2o
h2o.init()
from h2o.estimators.word2vec import H2OWord2vecEstimator
import nltk
# nltk.download('stopwords') -- might need if running nltk + stopwords for the first time
from nltk.corpus import stopwords
from h2o.automl import H2OAutoML
import pandas as pd
from sklearn import metrics

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,6 days 23 hours 21 mins
H2O cluster timezone:,Europe/Vienna
H2O data parsing timezone:,UTC
H2O cluster version:,3.24.0.5
H2O cluster version age:,"14 days, 14 hours and 41 minutes"
H2O cluster name:,H2O_from_python_mackenzie_dm3nzl
H2O cluster total nodes:,1
H2O cluster free memory:,1.219 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [25]:
train_data = h2o.upload_file("/home/mackenzie/Downloads/EnglishCleanedTrainingData.csv") # check if path correct 
test_data = h2o.upload_file("/home/mackenzie/Downloads/EnglishCleanedTestingData.csv") # check if path correct
STOP_WORDS = set(stopwords.words('english'))
new_stopwords = ['rt', 'co', 'http', 'u', 'got', 'get']
STOP_WORDS = STOP_WORDS.union(new_stopwords)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [3]:
def tokenize(sentences, stop_word = STOP_WORDS):
    tokenized = sentences.tokenize("\\W+")
    tokenized_lower = tokenized.tolower()
    tokenized_filtered = tokenized_lower[(tokenized_lower.nchar() >= 2) | (tokenized_lower.isna()),:]
    tokenized_words = tokenized_filtered[tokenized_filtered.grep("[0-9]",invert=True,output_logical=True),:]
    tokenized_words = tokenized_words[(tokenized_words.isna()) | (~ tokenized_words.isin(STOP_WORDS)),:]
    return tokenized_words

In [4]:
print("Break train tweets into sequence of words")
train_words = tokenize(train_data["tweet"])

Break train tweets into sequence of words


In [5]:
print("Break test tweets into sequence of words")
test_words = tokenize(test_data["tweet"])

Break test tweets into sequence of words


In [6]:
print("Build word2vec model for train")
w2v_model_one = H2OWord2vecEstimator(sent_sample_rate = 0.0, epochs = 10)
w2v_model_one.train(training_frame=train_words)

Build word2vec model for train
word2vec Model Build progress: |██████████████████████████████████████████| 100%


In [7]:
print("Build word2vec model for test")
w2v_model_two = H2OWord2vecEstimator(sent_sample_rate = 0.0, epochs = 10)
w2v_model_two.train(training_frame=test_words)

Build word2vec model for test
word2vec Model Build progress: |██████████████████████████████████████████| 100%


In [8]:
# could do synonym check here
#print("Sanity check - find synonyms for the word 'teacher'")
#w2v_model.find_synonyms("teacher", count = 5)

In [9]:
print("Calculate a vector for each train tweet")
tweet_vecs_train = w2v_model_one.transform(train_words, aggregate_method = "AVERAGE")
print(tweet_vecs_train)

Calculate a vector for each train tweet


C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26,C27,C28,C29,C30,C31,C32,C33,C34,C35,C36,C37,C38,C39,C40,C41,C42,C43,C44,C45,C46,C47,C48,C49,C50,C51,C52,C53,C54,C55,C56,C57,C58,C59,C60,C61,C62,C63,C64,C65,C66,C67,C68,C69,C70,C71,C72,C73,C74,C75,C76,C77,C78,C79,C80,C81,C82,C83,C84,C85,C86,C87,C88,C89,C90,C91,C92,C93,C94,C95,C96,C97,C98,C99,C100
0.0223424,-0.08276,0.0387922,0.117605,0.10378,-0.144005,-0.0221281,-0.051491,-0.0144988,0.00602809,0.222964,0.114629,-0.128073,-0.103262,-0.184455,-0.0345309,-0.0431599,0.0863207,-0.202299,0.224225,0.0215012,0.215449,-0.122652,0.0115363,0.208421,0.151796,-0.157925,-0.0497596,0.165338,-0.0447987,-0.0355493,0.197404,-0.302941,0.175013,0.00974624,-0.0929491,-0.165572,-0.047391,-0.228681,-0.0234547,-0.222493,-0.0358672,0.156457,-0.121174,-0.239643,-0.092991,-0.127105,0.0259335,0.100321,-0.0313455,-0.113834,-0.194439,0.183633,0.0233283,0.251628,0.0351301,0.0904468,-0.0372926,-0.0700557,-0.0526358,0.0645934,-0.144038,0.0168826,0.194091,-0.0205084,0.0335193,-0.123215,-0.184751,0.170322,0.0740542,-0.0467131,-0.176256,-0.0881352,-0.0574672,-0.0337711,-0.124141,-0.0452782,0.0344,0.0148767,0.253006,-0.0315287,-0.254408,-0.0812914,0.0343174,0.0772073,-0.0617911,-0.182489,-0.220473,0.0101707,0.0424757,0.14839,-0.0269312,-0.0207631,0.105923,-0.0374845,0.035177,0.0222166,0.194937,0.0802423,0.0933859
0.0381571,0.169677,-0.0946149,0.0558397,0.0266164,-0.0336636,0.0751325,-0.157822,-0.0662286,-0.0330098,0.147064,0.139625,-0.0766034,0.0753417,-0.0813428,0.0705755,-0.179138,0.0661921,-0.158312,0.10535,0.108928,0.264944,0.0136087,0.121737,0.152939,0.147995,0.0068351,-0.00768577,0.179074,0.0671987,-0.00392963,0.0722276,-0.210974,-0.124551,0.0498734,-0.10903,-0.162239,0.0283855,-0.327184,0.0283496,-0.199846,-0.170235,0.0960226,-0.054669,-0.192298,-0.169993,-0.278767,0.0990923,0.0206655,0.0559498,0.0124686,-0.217594,0.0711856,-0.123334,0.146967,-0.123879,-0.0663025,0.0562728,0.0676319,-0.0215916,0.0829631,0.11396,-0.0103352,-0.0999446,0.0175469,0.179792,-0.188006,0.0814401,0.289559,0.136967,-0.0663057,-0.141992,-0.319676,-0.0111743,-0.1079,-0.210902,0.0981346,-0.0665335,0.124551,-0.0121625,-0.00209863,-0.207908,-0.185202,0.0137445,0.107493,-0.0245979,-0.19279,-0.120658,0.00970476,0.1241,0.136172,-0.149274,-0.0558557,0.0765942,0.113389,0.15758,0.118811,0.0921255,0.0523585,0.123106
0.156697,0.18083,-0.0624462,0.0231771,0.0430882,-0.075262,0.00655932,0.0488314,-0.0172671,-0.0445129,0.0983548,0.073057,-0.198447,0.0500282,0.0139142,0.168126,-0.120425,0.0204765,-0.142426,0.0602858,0.122715,0.120716,-0.0905239,0.0472035,0.0387592,0.0989807,-0.116968,-0.170702,0.1045,-0.0339078,-0.0549988,0.0542825,-0.131366,-0.0726431,0.0546257,0.0186057,0.0132361,0.0231228,-0.247582,-0.00924552,-0.227218,-0.0764011,0.0529758,-0.0698664,-0.233807,-0.0769302,-0.201596,0.124064,0.0219946,0.00283156,-0.0465681,-0.351317,0.237269,-0.0779144,0.182555,-0.0879206,-0.0499739,0.0883217,-0.0686838,-0.032174,-0.035887,-0.0704311,-0.00101519,-0.00792553,-0.0839945,0.0255661,-0.131935,-0.0329768,0.135967,0.166322,-0.0279789,-0.0849111,-0.129975,-0.0596101,-0.0739678,-0.247904,0.139237,0.0206789,0.0757857,0.0879412,0.0201232,-0.169221,-0.0679391,0.027705,0.00329124,-0.0867621,-0.178825,-0.264275,-0.004378,0.131204,0.115545,-0.099328,-0.0649829,0.0216034,0.0935499,0.0574648,-0.00661883,0.131036,0.0792279,0.115624
-0.108762,0.224019,0.0491488,-0.0458032,-0.0307439,-0.0665347,-0.0226533,-0.00578452,-0.0755648,0.00195552,0.165161,0.0633906,-0.0841122,0.156898,-0.256173,0.0894957,-0.142387,0.100941,-0.161361,0.0173705,0.093694,0.104144,-0.184319,0.210381,0.0375314,0.0796243,0.0315992,-0.0655946,0.162442,-0.0260141,-0.130015,-0.129768,-0.108081,0.0668278,-0.0368491,-0.0999875,-0.0755118,0.177272,-0.195265,0.0371625,-0.153785,-0.00489053,0.0484189,-0.122989,-0.0417165,-0.0763641,-0.168227,0.20354,0.134843,-0.117026,-0.102748,-0.0650835,0.115502,0.00540144,0.0646049,-0.107289,0.0367208,0.0256136,-0.0240284,-0.103669,0.0299553,0.0214981,0.116108,-0.0437393,0.0347446,0.00999967,0.0370853,-0.0481873,0.0240867,0.197667,-0.232405,0.153374,-0.101111,-0.0810579,-0.0249153,-0.141176,-0.0934557,0.0029746,0.0769511,0.0799526,0.157844,-0.0588758,-0.133657,0.210626,-0.129814,-0.00674226,-0.00200009,-0.0384792,0.0425603,0.104148,0.260929,0.00918147,-0.0756199,0.0261978,0.203517,0.0241982,-0.09687,-0.0195334,0.210521,0.014416
-0.171298,0.172508,0.0522536,0.0168697,0.147797,-0.0792849,-0.0242652,0.060213,0.0323577,0.0214742,0.178588,0.187538,-0.0313689,0.0547712,-0.141935,-0.0642782,-0.190697,0.106084,0.0743484,-0.05701,0.0196388,-0.0617599,-0.16522,0.0184185,-0.0182502,0.135726,-0.0606426,-0.2995,0.158358,-0.0248271,-0.0435638,-0.0164796,-0.0595336,0.138065,0.0680826,0.110457,-0.163902,0.103143,-0.163269,0.0788219,-0.0541276,-0.00234488,0.0860105,0.0145989,-0.201136,-0.139185,-0.251269,0.15199,-0.0286447,0.100983,-0.0158224,-0.0419701,0.0567476,0.0953104,0.165114,-0.139335,-0.123832,0.0763108,-0.0664292,-0.181959,0.154035,-0.139467,-0.0089236,0.110335,0.0139672,-0.0703829,0.086826,-0.196079,0.0827616,-0.00673475,-0.175123,0.0364335,0.0131072,-0.169864,-0.144642,-0.104229,-0.0554649,0.0675057,-0.0348844,0.0520964,-0.0285787,-0.20366,0.0105453,0.0910867,-0.00516657,-0.120538,-0.178393,-0.361721,0.0632365,0.19261,0.142261,-0.0513087,0.0805751,-0.0869257,-0.0275631,0.29951,-0.0251317,0.101992,0.0996594,0.193112
-9.42051e-05,-0.0265923,0.0206741,0.122351,0.0445544,-0.0138616,-0.0167429,0.073755,-0.0510145,-0.100854,0.265726,0.0639582,-0.079262,-0.072736,-0.238711,0.0339785,-0.0489888,0.0476875,-0.000361465,0.0304371,0.0470914,0.12685,-0.136892,0.0154733,-0.0353449,0.229756,-0.160795,-0.239952,0.236432,-0.179699,0.0312122,0.27607,-0.243838,0.218166,-0.044384,0.000354499,0.0141793,-0.136644,-0.0371106,-0.0646406,-0.109234,-0.0326232,0.206716,-0.0279387,-0.229395,-0.0320085,-0.0652559,0.0520165,-0.0782973,-0.00622266,-0.176884,-0.11432,0.270602,0.0243334,0.102654,0.0311921,-0.00489881,-0.0342091,-0.0669661,-0.00224263,0.00890342,-0.172201,0.00341516,0.246365,-0.0368596,-0.207188,0.0308514,-0.109887,0.0766623,0.138734,-0.121039,-0.084086,0.0504322,-0.22281,0.0516483,-0.110684,0.0977797,0.00285231,0.00866116,0.167765,-0.173974,-0.145992,-0.0350052,0.173258,0.000853154,-0.0872269,-0.174797,-0.28874,-0.0103119,-0.0474156,0.05689,-5.72304e-05,-0.109307,0.0406183,-0.022745,0.0511145,-0.148427,0.0894933,0.0553011,0.216045
0.0446506,0.103582,-0.045617,0.0564694,0.190124,-0.206986,-0.0971405,0.0860469,0.0672393,-0.0618046,-0.00259778,0.0868067,-0.155634,0.016893,-0.0445403,-0.144539,0.113119,0.099252,-0.126755,0.137568,-0.0282381,0.122386,0.0172058,0.138858,0.0689298,0.138137,-0.17183,-0.129981,0.056596,-0.144908,-0.118369,-0.19318,-0.168356,-0.16107,0.104508,-0.145765,0.151916,0.0130775,-0.263571,0.241287,-0.306884,0.138342,0.065289,-0.288563,-0.121306,-0.0205827,-0.120503,0.136624,0.25627,-0.0112445,0.0134769,-0.112152,0.291916,0.0564913,0.143526,0.0425589,-0.0693494,-0.0539465,-0.181812,0.0834871,-0.228379,-0.128873,0.1426,0.191611,-0.00824583,-0.0991585,0.186127,-0.304739,0.143483,-0.147782,-0.155402,0.199163,-0.212021,0.0423738,-0.165697,-0.0529032,-0.129729,0.0849725,0.283901,0.118871,-0.190396,-0.125796,-0.0206022,-0.076086,0.0402931,0.00329293,-0.170738,-0.117908,-0.0989784,0.273022,0.112295,-0.155813,-0.284762,0.0386634,0.163719,0.154147,0.0527989,0.140726,0.127203,0.0769367
0.188278,0.0434627,0.047484,-0.0850035,-0.0976758,-0.123307,-0.0183787,0.163295,-0.0845849,-0.0643949,0.0956952,0.0364802,-0.0349721,0.0288983,-0.15988,0.0932515,-0.129394,0.0739778,-0.00885031,0.154165,0.0235392,-0.022209,-0.142635,0.0560876,0.237867,0.152499,-0.179922,-0.16678,0.180905,-0.137063,-0.0998848,0.0180835,-0.347402,0.104217,0.0906803,0.00330935,-0.181469,0.133803,-0.110781,-0.0537764,-0.187926,0.108993,0.0675786,-0.192171,-0.167639,-0.100407,-0.158477,0.0785753,-0.0557424,0.153512,-0.0634054,-0.27308,0.113656,-0.158251,0.0653502,-0.194833,0.00854311,-0.121374,0.0562611,0.0660068,0.0625423,-0.142944,-0.00890338,-0.0707818,0.143149,0.0230598,-0.170716,-0.188209,0.139039,0.110096,-0.058058,0.0410989,-0.267129,0.100261,-0.0737388,-0.0696842,0.1433,-0.0987062,-0.0240282,0.0778442,-0.0237733,-0.0488599,-0.17472,0.205325,0.0465426,-0.00921307,-0.214642,-0.181588,0.0216773,0.228822,0.17549,0.0726149,-0.0851556,0.0619527,-0.0424095,0.0456467,-0.0440015,0.102085,0.0527313,0.209723
-0.00415409,0.230939,-0.0953216,-0.120408,0.143948,-0.112186,-0.0460698,-0.0521157,-0.0352361,0.0160038,0.273965,0.129079,-0.195427,0.226574,-0.294177,-0.0386062,-0.160881,0.133741,-0.0926442,-0.040671,0.249882,0.207681,-0.0465756,-0.0685025,0.16484,0.143259,0.162338,0.215877,0.267279,0.180685,-0.005526,-0.0262352,-0.290422,-0.181897,0.0732409,0.00349713,-0.247377,0.160412,-0.268944,0.112412,-0.132509,0.0306471,0.0404587,0.025246,-0.1582,-0.282189,-0.13899,0.221595,-0.200665,-0.0233079,-0.0220834,-0.161012,0.0860824,-0.0745856,0.146317,-0.215287,0.0435458,0.0264689,0.063589,0.0970948,0.0246886,0.0623019,0.00949776,-0.168643,0.165045,0.163702,-0.0567518,-0.0938379,0.0910423,0.226128,-0.151403,-0.0487102,-0.276322,-0.0625076,-0.0809528,-0.0982236,-0.167525,-0.10465,0.0133093,0.0975246,-0.0410549,-0.0520372,-0.0447321,0.0385116,0.105534,-0.131119,-0.129714,0.0207184,0.0167182,0.234237,0.137796,-0.23921,-0.113393,0.201833,0.228814,0.284622,0.0935076,0.0610431,0.159202,-0.00155751
0.0712138,0.121804,0.0526673,0.00265834,0.0615845,-0.0271408,0.027376,-0.0699017,-0.0249849,-0.0120433,0.15249,0.01893,-0.0287104,0.0570224,-0.202543,-0.00525314,-0.112002,0.0804026,-0.0683476,0.054021,-0.100613,0.125214,-0.0365284,0.086826,-0.00805105,0.172307,-0.104552,-0.163813,0.111874,-0.10876,0.0931141,-0.0774039,-0.191603,0.0244768,0.069691,-0.0213716,0.0725572,0.0772842,-0.0168696,0.0801887,-0.131702,-0.0507287,0.0953796,-0.0605326,-0.168296,-0.143995,-0.114551,0.116575,0.0286817,-0.088713,-0.0645511,-0.115604,0.134006,0.0672288,0.177058,-0.0722164,-0.0254155,-0.0320576,-0.0719592,-0.0112288,0.0117968,-0.0666868,-0.00269597,0.0379023,-0.0713167,-0.0251271,-0.000725147,-0.0470236,0.197153,0.224795,-0.147377,0.0152512,-0.101577,-0.123619,0.0392174,-0.0916335,0.0403953,-0.0118394,0.0536049,0.0196374,-0.0173143,-0.127685,0.0108943,0.139652,-0.0226088,-0.00719462,-0.0515537,-0.128429,0.090842,0.0662319,0.11165,-0.081591,-0.0879537,0.127007,0.0684104,0.0904563,-0.137921,0.0301356,0.106754,0.147597





In [10]:
print("Calculate a vector for each test tweet")
tweet_vecs_test = w2v_model_two.transform(test_words, aggregate_method = "AVERAGE")

Calculate a vector for each test tweet


In [11]:
# see lines 10 to 12 in the demo for word2Vec for using ml on the vecs and such
#print(tweet_vecs)

In [12]:
# preparing the train and test data sets
# now convert tweet vecs and labels to a pandas dataframe and back to h2o dataframe
# data = job_titles[valid_job_titles,:].cbind(job_title_vecs[valid_job_titles,:])
train = tweet_vecs_train.cbind(train_data["labels"])
test = tweet_vecs_test.cbind(test_data["labels"])


In [13]:
# more on data prep
x = train.columns         # x: A list/vector of predictor column names or indexes. 
                          # This argument only needs to be specified if the user wants to exclude columns from the 
                          # set of predictors. If all columns (other than the response) should be used in prediction, 
                          # then this does not need to be set.

y = "labels"              # This argument is the name (or index) of the response column
x.remove(y)

# need to set train and test
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

In [14]:
# now the AUTO-ML piece comes in
aml = H2OAutoML(max_models=10) #max_models=10 or 20?, seed?
aml.train(x=x, y=y, training_frame=train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [15]:
# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

model_id,mean_per_class_error,logloss,rmse,mse
StackedEnsemble_BestOfFamily_AutoML_20190703_143453,0.379212,0.352589,0.3191,0.101825
StackedEnsemble_AllModels_AutoML_20190703_143453,0.380945,0.351513,0.31898,0.101748
GLM_grid_1_AutoML_20190703_143453_model_1,0.384846,0.347966,0.319396,0.102014
GBM_1_AutoML_20190703_143453,0.38608,0.343009,0.318445,0.101407
GBM_2_AutoML_20190703_143453,0.3924,0.343937,0.318668,0.101549
XGBoost_1_AutoML_20190703_143453,0.394923,0.338059,0.316211,0.0999894
GBM_3_AutoML_20190703_143453,0.395216,0.347392,0.319843,0.102299
XGBoost_2_AutoML_20190703_143453,0.397681,0.344188,0.321341,0.10326
GBM_4_AutoML_20190703_143453,0.401665,0.352246,0.319949,0.102368
XGBoost_3_AutoML_20190703_143453,0.410509,0.35616,0.328232,0.107736




In [16]:
# The leader model is stored here
aml.leader

Model Details
H2OStackedEnsembleEstimator :  Stacked Ensemble
Model Key:  StackedEnsemble_BestOfFamily_AutoML_20190703_143453
No model summary for this model


ModelMetricsMultinomialGLM: stackedensemble
** Reported on train data. **

MSE: 0.04370837949664802
RMSE: 0.20906549092724036

ModelMetricsMultinomialGLM: stackedensemble
** Reported on cross-validation data. **

MSE: 0.10182483318575941
RMSE: 0.31910003632992495




In [17]:
preds = aml.predict(test)
print(preds)

stackedensemble prediction progress: |████████████████████████████████████| 100%


predict,p0,p1,p2
2,0.107316,0.170745,0.721938
2,0.0450257,0.0502032,0.904771
2,0.16064,0.226383,0.612977
2,0.0877273,0.122501,0.789772
2,0.0917272,0.131997,0.776276
2,0.22616,0.116045,0.657795
2,0.0590925,0.0577924,0.883115
2,0.0954635,0.126629,0.777908
2,0.107068,0.218627,0.674305
2,0.0656908,0.0806016,0.853708





In [18]:
var = preds["predict"].cbind(test[y])
print(var)

predict,labels
2,1
2,1
2,2
2,1
2,1
2,1
2,1
2,1
2,1
2,0





In [21]:
# convert to pandas dataframe
y_test = h2o.as_list(test[y], use_pandas=True)
y_pred = h2o.as_list(preds["predict"])
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.accuracy_score(y_test, y_pred))

[[   9    9  268]
 [  28   51 3759]
 [   4   64  765]]
0.16643130925963284
