You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
{{ message }}
This repository has been archived by the owner on Dec 5, 2019. It is now read-only.
It was started on 2017-05-11 02:09 and fails with a parquet job in Spark:
tp, tpt, ss=__main__(sc, sqlContext)
Py4JJavaErrorTraceback (most recent call last)
<ipython-input-2-2d59149e83c7> in <module>()
----> 1 tp, tpt, ss = __main__(sc, sqlContext)
<ipython-input-1-b2b8812385c7> in __main__(sc, sqlContext, day, save)
162
163 if save:
--> 164 save_df(testpilottest_df, "testpilottest", day, partitions=16*5)
165
166 def try_convert(conv_func, value):
<ipython-input-1-b2b8812385c7> in save_df(df, name, date_partition, partitions)
79 path_fmt = "s3a://telemetry-parquet/harter/cliqz_{name}/v1{partition_str}"
80 path = path_fmt.format(name=name, partition_str=partition_str)
---> 81 df.repartition(partitions).write.mode("overwrite").parquet(path)
82
83 def __main__(sc, sqlContext, day=None, save=True):
/usr/lib/spark/python/pyspark/sql/readwriter.py in parquet(self, path, mode, partitionBy, compression)
639 self.partitionBy(partitionBy)
640 self._set_opts(compression=compression)
--> 641 self._jwrite.parquet(path)
642
643 @since(1.6)
/usr/lib/spark/python/lib/py4j-0.10.3-src.zip/py4j/java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
/usr/lib/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/usr/lib/spark/python/lib/py4j-0.10.3-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
317 raise Py4JJavaError(
318 "An error occurred while calling {0}{1}{2}.\n".
--> 319 format(target_id, ".", name), value)
320 else:
321 raise Py4JError(
Py4JJavaError: An error occurred while calling o161.parquet.
: org.apache.spark.SparkException: Job aborted.
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand$$anonfun$run$1.apply$mcV$sp(InsertIntoHadoopFsRelationCommand.scala:149)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand$$anonfun$run$1.apply(InsertIntoHadoopFsRelationCommand.scala:115)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand$$anonfun$run$1.apply(InsertIntoHadoopFsRelationCommand.scala:115)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:115)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:115)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:115)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:136)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:133)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:114)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:86)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:86)
at org.apache.spark.sql.execution.datasources.DataSource.write(DataSource.scala:525)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:211)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:194)
at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:488)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.io.FileNotFoundException: No such file or directory: s3a://telemetry-parquet/harter/cliqz_testpilottest/v1/submission=20170510/_temporary/0/task_201705110015_0010_m_000049
at org.apache.hadoop.fs.s3a.S3AFileSystem.getFileStatus(S3AFileSystem.java:1004)
at org.apache.hadoop.fs.s3a.S3AFileSystem.listStatus(S3AFileSystem.java:745)
at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.mergePaths(FileOutputCommitter.java:426)
at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJobInternal(FileOutputCommitter.java:362)
at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJob(FileOutputCommitter.java:334)
at org.apache.parquet.hadoop.ParquetOutputCommitter.commitJob(ParquetOutputCommitter.java:46)
at org.apache.spark.sql.execution.datasources.BaseWriterContainer.commitJob(WriterContainer.scala:222)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand$$anonfun$run$1.apply$mcV$sp(InsertIntoHadoopFsRelationCommand.scala:144)
... 30 more
```python
tp.take(2)
NameErrorTraceback (most recent call last)
<ipython-input-3-24660d37e8c0> in <module>()
----> 1 tp.take(2)
NameError: name 'tp' is not defined
tpt.take(2)
NameErrorTraceback (most recent call last)
<ipython-input-4-dcb847e2e210> in <module>()
----> 1 tpt.take(2)
NameError: name 'tpt' is not defined
ss.take(2)
NameErrorTraceback (most recent call last)
<ipython-input-5-45f94e4c60d4> in <module>()
----> 1 ss.take(2)
NameError: name 'ss' is not defined
The text was updated successfully, but these errors were encountered:
@harterrt Closing since this was a transient error during the deploy of the new Spark job scheduling system and the job has had successful runs since then.
It was started on 2017-05-11 02:09 and fails with a parquet job in Spark:
The text was updated successfully, but these errors were encountered: