# Word embeddings based on small error messages
## 1. Load data

In [1]:
# Read the two "reduced codes" CSV exports from HDFS; header=True makes Spark
# take the column names from the first line of each file.
# Other CSV exports under the same HDFS directory that were previously left
# commented out in this cell: df_errors_11102017_01042019.csv,
# df__exitcodes2.csv, df_errors_small2.csv, df_errors_01052017_09102017.csv,
# df_errors_01012017_01052017.csv.
reduced_codes_paths = [
    'hdfs:///cms/users/llayer/df_reduced_codes.csv',
    'hdfs:///cms/users/llayer/df_reduced_codes2.csv',
]
df1, df2 = [spark.read.csv(csv_path, header=True) for csv_path in reduced_codes_paths]

In [2]:
# DataFrame.union pairs columns by position, not by name, so refuse to stack
# the two inputs unless their headers agree exactly; otherwise values would
# silently end up under the wrong column.
if df1.columns != df2.columns:
    raise ValueError(
        'df1 and df2 have different columns: %s vs %s' % (df1.columns, df2.columns)
    )
df = df1.union(df2)

In [3]:
# Report how many partitions the combined DataFrame is split into.
# The parenthesized form prints the same value under Python 2 and Python 3.
print(df.rdd.getNumPartitions())

200


## 2. Hash vectorizer + IDF

In [8]:
# Build hashed term-frequency vectors from the error messages and rescale
# them with inverse document frequency (IDF).

from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.feature import Word2Vec

# Split each error message into a list of tokens stored in the "words" column.
tokenizer = Tokenizer(inputCol="error_msg", outputCol="words")
tokenized = tokenizer.transform(df)

# NOTE(review): numFeatures=20 means only 20 hash buckets, so many distinct
# tokens necessarily share a bucket -- confirm this tiny size is intentional.
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(tokenized)

# Fit the IDF weights on the hashed counts, then apply them.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# show() prints the table itself and returns None, so wrapping it in print
# only appended a stray "None" line to the output.
rescaledData.select('features').show()

+--------------------+
|            features|
+--------------------+
|(20,[0,1,2,3,4,5,...|
|(20,[0,1,2,3,4,6,...|
|(20,[0,1,2,3,4,5,...|
|(20,[0,1,2,3,4,5,...|
|(20,[0,1,2,3,4,5,...|
|(20,[0,1,2,3,4,6,...|
|(20,[0,1,3,5,7,8,...|
|(20,[0,1,2,3,4,5,...|
|(20,[0,1,2,5,7,9,...|
|(20,[0,1,3,4,5,6,...|
|(20,[0,1,3,4,5,6,...|
|(20,[0,3,4,5,6,7,...|
|(20,[0,1,2,3,4,5,...|
|(20,[0,1,2,3,4,5,...|
|(20,[0,1,2,3,4,5,...|
|(20,[0,1,2,3,4,5,...|
|(20,[0,1,2,3,4,5,...|
|(20,[0,2,3,4,5,7,...|
|(20,[0,1,2,3,4,5,...|
|(20,[0,1,2,3,4,5,...|
+--------------------+
only showing top 20 rows

None


In [9]:
from pyspark.ml.feature import CountVectorizer

# Learn a vocabulary from the tokenized messages; the fitted model is what
# later turns each "words" list into a term-count vector.
cv = CountVectorizer(
    outputCol="features",
    inputCol="words",
)
model = cv.fit(tokenized)

In [10]:
# Apply the fitted model to the tokenized messages and preview the first
# 20 rows with long cell values truncated (the defaults of show()).
result = model.transform(tokenized)
result.show(n=20, truncate=True)

+--------------------+-------------------+-----+--------------------+----------+------+------+--------------------+--------------------+
|           task_name|               site|error|           error_msg|side_state|action|memory|               words|            features|
+--------------------+-------------------+-----+--------------------+----------+------+------+--------------------+--------------------+
|/fabozzi_Run2016D...|    T1_US_FNAL_Disk|   85|Adding last 25 li...|  bad_site|  acdc|  null|[adding, last, 25...|(3,[0,2],[141.0,7...|
|/fabozzi_Run2017B...|         T1_RU_JINR|   92|Adding last 25 li...| good_site|  acdc|  null|[adding, last, 25...|(3,[0,2],[66.0,7.0])|
|/fabozzi_Run2017F...|T2_UK_London_Brunel|99109|Error in StageOut...| good_site|  acdc|  null|[error, in, stage...|(3,[0,2],[639.0,2...|
|/fabozzi_Run2017F...|          T2_US_MIT| 8004|An exception of c...| good_site|  acdc|180000|[an, exception, o...|     (3,[0],[139.0])|
|/fabozzi_Run2017H...|     T2_US_Nebraska

In [11]:
from pyspark.ml.feature import IDF

# Rescale the vectors in the "features" column of `result` by inverse
# document frequency and store them in a new "idf" column.
idf = IDF(
    outputCol="idf",
    inputCol="features",
)
idfModel = idf.fit(result)
rescaledData = idfModel.transform(result)

# Preview the weighted vectors next to the tokens they were derived from.
rescaledData.select(['idf', 'words']).show()

+--------------------+--------------------+
|                 idf|               words|
+--------------------+--------------------+
|(3,[0,2],[36.4053...|[adding, last, 25...|
|(3,[0,2],[17.0408...|[adding, last, 25...|
|(3,[0,2],[164.986...|[error, in, stage...|
|(3,[0],[35.888978...|[an, exception, o...|
|(3,[0,2],[20.1391...|[adding, last, 25...|
|(3,[0,2],[19.8809...|[adding, last, 25...|
|           (3,[],[])|[error, in, cmssw...|
|(3,[0,2],[21.9464...|[adding, last, 25...|
|           (3,[],[])|[error, in, cmssw...|
|           (3,[],[])|[error, in, cmssw...|
|           (3,[],[])|[error, in, cmssw...|
|           (3,[],[])|[error, in, cmssw...|
|(3,[0,2],[42.3438...|[adding, last, 25...|
|(3,[0,2],[22.9792...|[adding, last, 25...|
|(3,[0,2],[5.93846...|[adding, last, 25...|
|(3,[0,2],[14.2006...|[adding, last, 25...|
|(3,[0,1],[47.5077...|[could, not, find...|
|(3,[0,1],[47.5077...|[could, not, find...|
|(3,[0,2],[19.8809...|[adding, last, 25...|
|(3,[0,2],[35.8889...|[adding, l

## 3. Word2Vec

In [7]:

from pyspark.ml.feature import RegexTokenizer, Tokenizer, Word2Vec

# Tokenize the raw error messages into the "words" column.
tokenizer = Tokenizer(
    outputCol="words",
    inputCol="error_msg",
)
tokenized = tokenizer.transform(df)

# Train 10-dimensional word embeddings on the tokens.
# NOTE(review): no explicit seed is passed, so the library default is used --
# confirm whether run-to-run reproducibility matters here.
word2vec = Word2Vec(
    vectorSize=10,
    outputCol="w2v",
    inputCol="words",
)
model = word2vec.fit(tokenized)

In [8]:
# Add the "w2v" embedding column, cap the result at 200 partitions and keep it
# cached so later actions on `result` can reuse it.
result = model.transform(tokenized).coalesce(200).persist()

# show() prints the table itself and returns None, so it is called directly
# instead of being wrapped in print (which only appended a stray "None").
result.select(['task_name', 'site', 'w2v']).show()

+--------------------+-------------------+--------------------+
|           task_name|               site|                 w2v|
+--------------------+-------------------+--------------------+
|/fabozzi_Run2016D...|    T1_US_FNAL_Disk|[-0.6767393925434...|
|/fabozzi_Run2017B...|         T1_RU_JINR|[-0.6808537950240...|
|/fabozzi_Run2017F...|T2_UK_London_Brunel|[-0.1858461705453...|
|/fabozzi_Run2017F...|          T2_US_MIT|[-0.4759413566384...|
|/fabozzi_Run2017H...|     T2_US_Nebraska|[-0.7338168016692...|
|/mcremone_ACDC0_t...|    T2_UK_London_IC|[-0.6695511883629...|
|/mcremone_task_HI...|     T2_CH_CERN_HLT|[0.03431056173784...|
|/mcremone_task_HI...|         T2_CH_CERN|[-0.6576490365118...|
|/pdmvserv_task_B2...|         T1_IT_CNAF|[-0.0120665904666...|
|/pdmvserv_task_B2...|         T2_US_UCSD|[-0.1310454917450...|
|/pdmvserv_task_B2...|    T2_UK_London_IC|[-0.1272198777231...|
|/pdmvserv_task_B2...|     T2_US_Nebraska|[-0.1811169521124...|
|/pdmvserv_task_B2...|T2_UK_London_Brune

In [9]:
# Collect only the columns needed downstream into a driver-side pandas frame.
export_columns = ['task_name', 'error', 'site', 'w2v']
df_pandas = result.select(export_columns).toPandas()

In [10]:
# Write the pandas frame to CSV; index=False leaves the row index out of the file.
output_csv = '/eos/user/l/llayer/AIErrorHandling/df_word2vec_exitcodes.csv'
df_pandas.to_csv(output_csv, index=False)