In [1]:
import h2o
h2o.init()
from h2o.estimators.word2vec import H2OWord2vecEstimator
import nltk
# nltk.download('stopwords') -- might need if running nltk + stopwords for the first time
from nltk.corpus import stopwords
from h2o.automl import H2OAutoML
import pandas as pd
from sklearn import metrics

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,6 days 22 hours 38 mins
H2O cluster timezone:,Europe/Vienna
H2O data parsing timezone:,UTC
H2O cluster version:,3.24.0.5
H2O cluster version age:,"14 days, 13 hours and 58 minutes"
H2O cluster name:,H2O_from_python_mackenzie_dm3nzl
H2O cluster total nodes:,1
H2O cluster free memory:,1.289 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [28]:
train_data = h2o.upload_file("/home/mackenzie/Downloads/GermanCleanedTrainingData.csv") # check if path correct 
test_data = h2o.upload_file("/home/mackenzie/Downloads/GermanCleanedTestingData.csv") # check if path correct
STOP_WORDS = set(stopwords.words('german'))
new_stopwords = ['|lbr|', 'ja']
STOP_WORDS = STOP_WORDS.union(new_stopwords)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [3]:
def tokenize(sentences, stop_word = STOP_WORDS):
    tokenized = sentences.tokenize("\\W+")
    tokenized_lower = tokenized.tolower()
    tokenized_filtered = tokenized_lower[(tokenized_lower.nchar() >= 2) | (tokenized_lower.isna()),:]
    tokenized_words = tokenized_filtered[tokenized_filtered.grep("[0-9]",invert=True,output_logical=True),:]
    tokenized_words = tokenized_words[(tokenized_words.isna()) | (~ tokenized_words.isin(STOP_WORDS)),:]
    return tokenized_words

In [5]:
print("Break train tweets into sequence of words")
train_words = tokenize(train_data["tweet"])

Break train tweets into sequence of words


In [6]:
print("Break test tweets into sequence of words")
test_words = tokenize(test_data["tweet"])

Break test tweets into sequence of words


In [7]:
print("Build word2vec model for train")
w2v_model_one = H2OWord2vecEstimator(sent_sample_rate = 0.0, epochs = 10)
w2v_model_one.train(training_frame=train_words)

Build word2vec model for train
word2vec Model Build progress: |██████████████████████████████████████████| 100%


In [8]:
print("Build word2vec model for test")
w2v_model_two = H2OWord2vecEstimator(sent_sample_rate = 0.0, epochs = 10)
w2v_model_two.train(training_frame=test_words)

Build word2vec model for test
word2vec Model Build progress: |██████████████████████████████████████████| 100%


In [9]:
# could do synonym check here
#print("Sanity check - find synonyms for the word 'teacher'")
#w2v_model.find_synonyms("teacher", count = 5)

In [15]:
print("Calculate a vector for each train tweet")
tweet_vecs_train = w2v_model_one.transform(train_words, aggregate_method = "AVERAGE")
print(tweet_vecs_train)

Calculate a vector for each train tweet


C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26,C27,C28,C29,C30,C31,C32,C33,C34,C35,C36,C37,C38,C39,C40,C41,C42,C43,C44,C45,C46,C47,C48,C49,C50,C51,C52,C53,C54,C55,C56,C57,C58,C59,C60,C61,C62,C63,C64,C65,C66,C67,C68,C69,C70,C71,C72,C73,C74,C75,C76,C77,C78,C79,C80,C81,C82,C83,C84,C85,C86,C87,C88,C89,C90,C91,C92,C93,C94,C95,C96,C97,C98,C99,C100
0.0291293,-0.0563549,-0.00309205,-0.00874928,-0.152444,0.076329,0.0197836,-0.156682,-0.0311497,0.0331275,-0.0514102,-0.00759734,-0.0921724,0.23417,0.0751271,-0.0647309,0.0425146,-0.0339248,-0.109768,0.00189091,-0.163299,0.00792546,-0.133227,-0.0124696,-0.04783,-0.0585472,-0.112219,0.0745889,0.137946,0.0150429,-0.0224849,-0.0227682,0.0939694,-0.029053,-0.0433851,0.132503,0.0283596,-0.0460802,-0.107062,0.0312514,-0.0225912,-0.0677194,0.131732,-0.095006,0.0392678,0.0414449,-0.0547527,0.181339,-0.0807071,0.0836181,0.0363452,-0.131691,-0.143266,-0.0952793,0.110587,0.00497621,0.12193,-0.0160294,-0.108785,-0.0827263,-0.257084,0.0225244,0.0258357,-0.0744465,-0.00954591,-0.0961743,0.0617737,0.0528972,-0.0484603,-0.147192,0.0716757,-0.0445438,-0.0614128,-0.00830384,0.0334794,0.0170624,0.0135088,-0.0303346,-0.0133053,-0.0388469,-0.0988347,0.0793318,-0.0128315,0.0770838,0.101013,0.0268078,0.00296872,-0.0679003,0.000586955,0.118864,-0.0693687,0.134611,-0.044851,0.0467843,-0.0702371,-0.0054008,0.00354876,0.069282,-0.122692,-0.0854221
-0.020289,0.115269,-0.0016796,-0.0155942,0.0243202,-0.0660598,0.0138952,-0.04091,-0.0727459,-0.0206298,0.00772251,-0.0484347,-0.00341719,0.193299,0.246281,0.0759564,0.0430066,-0.105908,-0.160268,-0.038794,-0.229615,-0.0364317,-0.0827229,0.112615,0.0202405,-0.134642,-0.127648,0.0513902,0.0248847,-0.0323642,0.0434213,-0.122694,0.0503713,0.110142,0.0843856,-0.14796,0.144207,0.160098,-0.195316,-0.0866595,0.074925,-0.00350691,0.260468,-0.112127,0.118457,-0.0274727,0.0241238,0.074524,-0.0520599,0.0683611,-0.160389,-0.0189103,-0.121003,0.165582,0.253134,0.0796824,0.186832,-0.0324347,-0.0109388,-0.0373621,-0.249126,-0.031786,0.0882836,0.114533,0.0915308,-0.183541,0.171694,0.0722871,-0.115099,-0.193177,0.107867,0.114245,-0.197168,-0.121551,-0.0254883,0.0653016,-0.183877,-0.0114236,-0.149496,-0.095159,-0.179325,0.0164512,-0.117974,-0.11533,0.22554,-0.177961,0.00298017,-0.179795,0.0260792,0.0559971,-0.233959,0.174775,-0.223886,0.0194388,-0.0910845,0.151882,0.022251,-0.0155746,0.0478414,0.0880277
-0.0253428,0.056287,0.0363188,0.0172438,-0.164082,0.166106,0.0835552,-0.0867098,0.0391808,0.0792269,-0.0338235,0.073912,-0.0455957,0.172466,0.198691,0.101024,0.0017131,0.0213559,-0.213391,-0.0625758,-0.160427,-0.0760626,-0.12163,-0.0435556,-0.0440114,-0.0471189,-0.118607,0.0920026,0.0898629,0.0988965,-0.0275485,-0.0644385,0.0520332,0.0581712,0.0420209,0.0264621,0.0388987,0.0670861,-0.0751736,0.0665786,-0.07797,-0.0263054,0.210391,-0.0714467,0.00182106,0.114163,0.0322469,0.1208,-0.0654437,0.109724,0.0599284,-0.124671,-0.120932,-0.00315898,0.0103579,-0.12295,0.0913121,-0.0261126,-0.0824543,-0.0242883,-0.145794,0.0611703,0.00813424,-0.0688637,0.1508,-0.00360739,0.0921811,0.138241,-0.117111,-0.235415,0.0841958,0.0880682,-0.133225,-0.0321164,0.0641001,-0.053368,-0.0726977,-0.145889,0.0520019,-0.0718572,-0.176017,0.0371041,0.158674,0.0554042,0.00848453,-0.041267,0.0150014,-0.0163349,-0.00370767,0.175144,0.0130477,0.112052,-0.0536757,-0.098824,-0.0131357,0.0453706,-0.0152432,-0.0348165,-0.133764,0.0362345
0.0560245,-0.058401,0.164942,0.0703506,-0.0476878,0.164793,-0.0499999,-0.123435,0.0924741,0.0141972,-0.130689,0.130842,-0.0498206,0.201381,0.120125,-0.0492466,0.0687634,-0.107686,0.0187308,0.0305042,-0.0427067,0.104504,-0.235232,-0.0924881,0.044094,-0.0854511,-0.0261955,0.0378974,0.0188159,0.0677833,0.00812819,0.0502066,0.130502,0.0790179,0.017534,0.0480997,0.0252324,0.0664753,-0.167453,0.0103843,-0.0194064,0.0506962,0.0842848,-0.103715,-0.0548812,0.100362,-0.0418,0.168366,-0.0749091,0.0592347,0.0521234,-0.0215844,-0.239579,-0.0246836,0.0610119,0.0842476,0.0179112,-0.077004,-0.0803946,0.0655031,-0.0363835,0.0431757,-0.0361783,-0.0289436,-0.0138448,-0.00692997,0.115591,0.110991,-0.267832,-0.223705,-0.109144,0.0978345,-0.103648,0.112352,0.0791176,0.0416704,0.0454306,-0.0848762,0.0223682,-0.0579225,-0.156942,0.0465229,0.142546,0.143273,0.0130833,-0.0891467,-0.0428683,0.0168825,0.0777534,0.221181,0.026618,-0.0342779,-0.032666,-0.0555018,-0.0287678,-0.00442444,0.110649,-0.0292328,0.00287684,-0.0820382
0.0816614,-0.0780442,-0.0598833,0.00596877,-0.0356241,0.0600756,-0.0120097,-0.128025,0.0461881,0.0219284,0.00969116,0.0402924,-0.0797552,0.127467,0.118226,-0.0114146,0.132465,-0.0813027,-0.128159,0.102426,-0.0481983,-0.0111946,-0.137662,-0.101156,0.0637788,-0.0903994,-0.0557284,0.0633556,0.0491887,0.13394,-0.0165187,-0.0461188,0.138623,6.53201e-05,-0.0299311,0.0869837,0.0360627,-0.100593,-0.115355,-0.0180725,-0.0628791,-0.00579671,0.157578,-0.00367089,0.0160467,0.111118,-0.0840824,0.143732,-0.077691,0.123589,0.0437436,-0.041495,-0.144288,-0.0260647,0.0998682,-0.016816,0.090609,-0.0878382,-0.0130674,-0.048404,-0.0714008,-0.115815,-0.0623054,-0.0636015,0.0256092,-0.0905298,0.0607689,0.114946,-0.106374,-0.149568,0.00323496,0.042426,-0.0804212,-0.0239731,-0.00435321,0.0356099,0.0116126,-0.0377804,0.0634597,-0.136348,-0.12801,0.0137335,0.087654,-0.0268322,0.00383532,-0.00683491,-0.00296366,0.0537765,-0.0124257,0.139096,-0.0749791,0.106055,0.00197271,0.00428118,-0.0107872,0.000889113,0.0497044,-0.148322,0.0434185,-0.065695
0.0588979,0.0570993,-0.140272,-0.102188,-0.108742,0.0883017,0.0580401,-0.118343,-0.0337609,-0.0736047,-0.0761339,0.0134348,0.00463673,0.221151,0.115617,0.101364,-0.0879726,-0.138183,-0.266936,-0.109993,-0.103532,0.0737391,0.0546819,-0.0766494,-0.166165,-0.145624,-0.0967653,-0.0210012,0.171621,0.122218,0.0486323,0.0027657,-0.0124686,0.0106827,-0.0433788,0.0674223,-0.00999191,0.0485932,0.0810674,0.0579936,0.0322392,0.0823334,0.241588,-0.0414941,0.129983,0.0407565,-0.0532714,-0.0264346,0.0450739,-0.0179341,-0.0197592,0.0629368,-0.117673,0.159071,0.00508235,-0.239444,0.0200305,0.0165043,-0.206214,0.0647554,-0.317827,-0.106039,0.0829522,-0.0550845,-0.0678472,-0.0019142,0.0392503,-6.21565e-05,0.0289005,-0.146311,0.194452,-0.159841,0.0113721,0.130924,-0.025558,-0.0763924,-0.0614581,0.0546804,-0.0107104,0.0176061,-0.212826,-0.102521,0.0186372,0.16578,0.0122007,-0.00715518,0.0921436,0.0200987,0.0816562,0.0503858,-0.237084,0.0716334,-0.0320839,-0.150674,-0.0599328,-0.0558207,-0.0663496,0.0590676,-0.0375327,0.0295214
0.186663,-0.0507311,0.0539503,-0.0839895,-0.0833662,0.12173,-0.0230183,-0.172209,-0.0418314,0.136664,-0.0648005,0.186676,-0.0213429,0.115238,-0.073501,-0.227613,0.0757635,-0.070878,-0.173977,-0.0660629,-0.103309,0.162988,-0.0685334,-0.0575773,-0.106442,-0.122241,-0.0469066,0.00223323,0.231168,0.108209,-0.0603355,0.0598888,-0.0582106,-0.131823,0.0898541,0.000964465,0.0255565,0.0881339,-0.185098,-0.0146972,-0.118864,0.00506866,0.102861,0.0567685,0.0210363,0.179159,0.149468,0.151886,-0.0193224,-0.00350115,0.108153,0.0213718,-0.191998,-0.0569854,0.014663,-0.000596844,0.102807,0.132164,-0.101176,0.134587,-0.127595,0.0288455,-0.0332076,-0.104451,0.0814004,0.0659895,-0.00371389,0.0364875,-0.0714052,-0.214659,-0.0868865,-0.0150597,0.0169528,0.247168,-0.00704149,0.045046,0.0129906,-0.077777,0.117486,-0.0225229,-0.130212,0.0644283,0.0869275,0.219762,-0.0433337,-0.00982364,0.0980631,0.0131377,0.00151722,0.219041,-0.103348,0.0600147,0.0983846,0.0194526,-0.0743883,0.0034479,0.0269722,0.00519634,-0.0716108,-0.0588448
0.111861,-0.0175778,-0.0455895,0.0584721,-0.159905,0.268178,-0.114498,-0.0408069,-0.00405423,0.00289692,0.0535278,0.0540287,0.113725,0.338998,0.0912385,-0.144772,-0.100135,-0.072596,-0.105775,0.00607172,-0.0466071,-0.0221772,-0.1533,-0.00870052,-0.0584107,-0.134375,-0.179873,-0.0187242,0.279432,0.0432753,-0.0623145,-0.0138881,0.15119,0.0248224,-0.0609419,0.0644006,0.0605424,-0.0591285,-0.186449,0.0828806,-0.0793485,0.0602771,-0.0155721,-0.00694167,-0.229805,0.1632,-0.102686,0.185344,-0.0880311,0.0912871,0.00353745,-0.141722,-0.144983,0.0496228,0.00212094,0.171326,0.109481,-0.01777,-0.12266,-0.0454299,-0.235575,-0.0962572,0.084789,-0.134774,0.00163609,0.00828725,0.0141876,0.134536,-0.174497,-0.208365,0.0405273,-0.0526065,-0.102353,0.0727464,-0.0225736,0.116719,0.210598,-0.0794329,0.00954232,-0.0685034,-0.0918756,0.0653776,0.0242512,0.108387,-0.119456,0.0387405,0.00493584,0.0488159,-0.00779908,0.145253,-0.0450449,0.020908,0.0451345,-0.150215,-0.0437348,-0.111525,0.13206,0.0866566,-0.0295329,-0.109833
0.0595674,-0.0600796,0.0402223,0.100839,-0.101873,0.0647246,-0.0400216,-0.0892553,-0.0161753,-0.0128828,-0.0198632,0.116296,0.0301396,0.157443,0.061145,0.028861,-0.00964695,-0.0865227,-0.0144108,0.0938461,-0.0109913,0.0808375,-0.0833464,-0.0386479,0.0223035,-0.0925334,-0.0470514,0.0623201,0.148092,0.0202818,0.106132,-0.037291,0.130959,0.0913292,-0.0142165,0.0148817,0.01491,-0.0447527,-0.0789681,0.0914419,-0.0455416,-0.0343969,-0.0461367,-0.0188458,-0.0127287,0.0644116,0.0127725,0.136421,-0.00084687,-0.107099,-0.0231657,-0.0213305,-0.0708998,-0.107018,0.0457132,0.152501,0.117874,0.0621956,-0.122337,-0.0103001,-0.0645658,0.0392037,0.0507403,-0.120631,0.00179547,0.0104878,0.0642404,0.0819323,-0.145596,-0.0552142,0.0773244,0.0298298,-0.112675,0.0294968,0.0805813,-0.0112096,0.082004,-0.130557,0.00156025,-0.0971742,0.00964811,0.0156337,0.0431134,0.0400819,-0.0771784,-0.0538939,-0.0536644,-0.0306815,0.0609479,0.12467,-0.00162863,0.0693577,-0.040389,-0.0117826,0.0292428,-0.0290265,0.0178953,0.00956747,0.0838614,-0.0676301
0.147115,-0.0157101,0.0844223,0.0204363,0.0203818,0.071912,0.0839601,-0.225012,-0.0403886,0.163411,-0.0691409,0.0815342,-0.12524,0.152463,0.125445,-0.0749193,0.107149,-0.0668527,0.0464024,-0.0470891,0.0260911,0.0108472,-0.25383,-0.0719861,0.074585,-0.206148,-0.0773064,-0.0266801,-0.0585777,0.0935458,0.0208948,0.0575537,0.184953,0.10755,-0.0707052,0.0220272,0.107746,0.0827908,-0.117999,0.123667,0.00370945,-0.156073,0.0702237,0.0761836,0.0213602,0.0905076,0.0545999,0.240133,-0.0103694,-0.0532551,0.034129,-0.200017,-0.0905395,-0.128142,0.0683256,0.18673,0.103996,-0.13036,-0.222365,-0.0416162,-0.0620939,-0.0717059,0.110034,-0.0202052,-0.0986861,0.0110627,0.115435,0.137496,-0.244425,-0.221579,0.0172515,0.120601,-0.0653628,0.0760805,0.113961,-0.0245529,-0.0351144,-0.0542886,0.028838,-0.0749724,-0.164878,0.00194392,0.135689,-0.141336,-0.0240333,0.0298595,-0.114962,0.0182088,-0.0962604,0.0855755,-0.110828,0.0219161,-0.104505,0.015987,0.0499764,0.0144893,0.174896,-0.0702601,-0.0616357,-0.104388





In [11]:
print("Calculate a vector for each test tweet")
tweet_vecs_test = w2v_model_two.transform(test_words, aggregate_method = "AVERAGE")

Calculate a vector for each test tweet


In [12]:
# see lines 10 to 12 in the demo for word2Vec for using ml on the vecs and such
#print(tweet_vecs)

In [16]:
# preparing the train and test data sets
# now convert tweet vecs and labels to a pandas dataframe and back to h2o dataframe
# data = job_titles[valid_job_titles,:].cbind(job_title_vecs[valid_job_titles,:])
train = tweet_vecs_train.cbind(train_data["labels"])
test = tweet_vecs_test.cbind(test_data["labels"])


In [17]:
# more on data prep
x = train.columns         # x: A list/vector of predictor column names or indexes. 
                          # This argument only needs to be specified if the user wants to exclude columns from the 
                          # set of predictors. If all columns (other than the response) should be used in prediction, 
                          # then this does not need to be set.

y = "labels"              # This argument is the name (or index) of the response column
x.remove(y)

# need to set train and test
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

In [18]:
# now the AUTO-ML piece comes in
aml = H2OAutoML(max_models=10) #max_models=10 or 20?, seed?
aml.train(x=x, y=y, training_frame=train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [19]:
# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

model_id,mean_per_class_error,logloss,rmse,mse
StackedEnsemble_AllModels_AutoML_20190703_140214,0.576218,0.741973,0.501935,0.251938
StackedEnsemble_BestOfFamily_AutoML_20190703_140214,0.577379,0.742217,0.502258,0.252264
GBM_2_AutoML_20190703_140214,0.593774,0.750322,0.501039,0.25104
GBM_1_AutoML_20190703_140214,0.593868,0.752957,0.503496,0.253508
GBM_4_AutoML_20190703_140214,0.598151,0.752089,0.502336,0.252342
GLM_grid_1_AutoML_20190703_140214_model_1,0.598855,0.75415,0.506817,0.256864
GBM_3_AutoML_20190703_140214,0.59952,0.749502,0.500959,0.25096
XGBoost_1_AutoML_20190703_140214,0.600901,0.737306,0.499072,0.249072
DRF_1_AutoML_20190703_140214,0.610473,2.11826,0.528776,0.279604
XGBoost_2_AutoML_20190703_140214,0.614561,0.78781,0.533743,0.284882




In [20]:
# The leader model is stored here
aml.leader

Model Details
H2OStackedEnsembleEstimator :  Stacked Ensemble
Model Key:  StackedEnsemble_AllModels_AutoML_20190703_140214
No model summary for this model


ModelMetricsMultinomialGLM: stackedensemble
** Reported on train data. **

MSE: 0.10087469097584982
RMSE: 0.3176077627764312

ModelMetricsMultinomialGLM: stackedensemble
** Reported on cross-validation data. **

MSE: 0.25193835746642307
RMSE: 0.5019346147322608




In [21]:
preds = aml.predict(test)
print(preds)

stackedensemble prediction progress: |████████████████████████████████████| 100%


In [24]:
var = preds["predict"].cbind(test[y])
print(var)

predict,labels
0,1
0,1
0,0
0,0
0,2
0,2
0,2
0,0
0,2
0,1





In [27]:
# convert to pandas dataframe
y_test = h2o.as_list(test[y], use_pandas=True)
y_pred = h2o.as_list(preds["predict"])
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.accuracy_score(y_test, y_pred))

[[1108    0    0]
 [ 187    0    0]
 [ 349    0    0]]
