written by: Nutchaya Phumekham, Aug 2022

### Get CMS dataset size or Block size 
for CRAB data analytics

This notebook saves the active CRAB datasets and datablocks as files. One day per one file. Time range is from 1st of January 2022 to 30th of June 2022 (6 months). The data can be accessed via the following HDFS path:
- hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_month_day.parquet

In [2]:
# !git clone https://github.com/dmwm/CMSSpark.git
import utils as utils
from datetime import datetime, date, timedelta
from pyspark.sql.functions import (
    col,
    lit,
    when,
    sum as _sum,
    count as _count,
    min as _min,
    max as _max,
    first,
    date_format,
    from_unixtime,
    to_date,
    countDistinct
)
import numpy as np
import pandas as pd
from pyspark.sql.types import (
    StructType,
    LongType,
    StringType,
    StructField,
    DoubleType,
    IntegerType,
)
import matplotlib.pyplot as plt


spark.conf.set("spark.sql.session.timeZone", "UTC")
from CMSSpark.src.python.CMSSpark import schemas as cms_schemas
from pyspark.sql.window import Window

In [3]:
def _get_schema():
    return StructType(
        [
            StructField(
                "data",
                StructType(
                    [
                        StructField("RecordTime", LongType(), nullable=False),
                        StructField("DESIRED_CMSDataset", StringType(), nullable=True),
                        StructField("GlobalJobId", StringType(), nullable=False),
                        StructField("CMS_SubmissionTool", StringType(), nullable=True),
                        StructField("CRAB_DataBlock", StringType(), nullable=True),
                        StructField("CMSPrimaryDataTier", StringType(), nullable=True),
                        StructField("CRAB_Workflow", StringType(), nullable=True)
                    ]
                ),
            ),
        ]
    )

In [8]:
#note: there is a use of utils.get_candidate_files
def get_active_dataset_datablock_size_df(start_date, end_date):
    raw_df = (
        spark.read.option("basePath", _DEFAULT_HDFS_FOLDER)
        .json(
            utils.get_candidate_files(start_date, end_date, spark, _DEFAULT_HDFS_FOLDER),
            schema=schema,
        ).select("data.*")
        .filter(
            f"""CMS_SubmissionTool == 'CRAB'
          AND CMSPrimaryDataTier != 'Unknown'
          AND CRAB_DataBlock IS NOT NULL
          AND RecordTime >= {start_date.timestamp() * 1000}
          AND RecordTime < {end_date.timestamp() * 1000}
          """
        )
        .drop_duplicates(["GlobalJobId"])
    )
    datasets = raw_df.select(['CMSPrimaryDataTier', 'DESIRED_CMSDataset']).drop_duplicates(['DESIRED_CMSDataset'])
    dateset_size = datasets.join(d_size_df, datasets.DESIRED_CMSDataset==d_size_df.d_dataset)\
                        .select(['CMSPrimaryDataTier', 'DESIRED_CMSDataset', 'Dataset_Size'])
    blocks = raw_df.withColumn("First_Access", _min('RecordTime').over(Window.partitionBy('CRAB_DataBlock')))\
            .withColumn("Last_Access", _max('RecordTime').over(Window.partitionBy('CRAB_DataBlock')))\
            .select(['DESIRED_CMSDataset', 'CRAB_DataBlock', 'CRAB_Workflow', 'First_Access', 'Last_Access'])\
            .drop_duplicates(['CRAB_DataBlock'])
    block_size = blocks.join(b_size_df, blocks.CRAB_DataBlock==b_size_df.b_block_name)\
                        .select(['DESIRED_CMSDataset', 'CRAB_DataBlock', 'Block_Size', 
                                 'CRAB_Workflow', 'First_Access', 'Last_Access'])
    output_df = dateset_size.join(block_size, dateset_size.DESIRED_CMSDataset == block_size.DESIRED_CMSDataset)\
                .select([dateset_size.CMSPrimaryDataTier, dateset_size.DESIRED_CMSDataset, \
                         'Dataset_Size', 'CRAB_DataBlock', 'Block_Size', 'CRAB_Workflow', \
                         'First_Access', 'Last_Access'])\
                .orderBy(col('Dataset_Size').desc())
    return output_df

In [21]:
_DEFAULT_HDFS_FOLDER = "/project/monitoring/archive/condor/raw/metric"
schema = _get_schema()

In [22]:
d_size_df = utils.d_size_df(spark)
b_size_df = utils.b_size_df(spark)

In [23]:
month = 6

In [24]:
for day in range(1, 30+1):
    if(day==30):
        start_date = datetime(2022, month, day)
        end_date = datetime(2022, month+1, 1)
    else:
        start_date = datetime(2022, month, day)
        end_date = datetime(2022, month, day+1)
    output_df = get_active_dataset_datablock_size_df(start_date, end_date)
    
    output_df.write.parquet("hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_%s_%s.parquet" \
                            % (str(month), str(day)))
    
    print("hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_%s_%s.parquet" \
                            % (str(month), str(day)))

hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_6_1.parquet
hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_6_2.parquet
hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_6_3.parquet
hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_6_4.parquet
hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_6_5.parquet
hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_6_6.parquet
hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_6_7.parquet
hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_6_8.parquet
hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_6_9.parquet
hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_6_10.parquet
hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_6_11.parquet
hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_6_12.parquet
hdfs://analytix/cms/users/nphumekh/cr

22/08/22 13:02:18 ERROR YarnScheduler: Lost executor 305 on ithdp3107.cern.ch: Container from a bad node: container_e143_1660901733081_3307_01_000354 on host: ithdp3107.cern.ch. Exit status: 137. Diagnostics: [2022-08-22 13:02:18.675]Container killed on request. Exit code is 137
[2022-08-22 13:02:18.730]Container exited with a non-zero exit code 137. 
[2022-08-22 13:02:18.732]Killed by external signal
.
22/08/22 13:02:18 WARN TaskSetManager: Lost task 271.0 in stage 3038.0 (TID 185166) (ithdp3107.cern.ch executor 305): ExecutorLostFailure (executor 305 exited caused by one of the running tasks) Reason: Container from a bad node: container_e143_1660901733081_3307_01_000354 on host: ithdp3107.cern.ch. Exit status: 137. Diagnostics: [2022-08-22 13:02:18.675]Container killed on request. Exit code is 137
[2022-08-22 13:02:18.730]Container exited with a non-zero exit code 137. 
[2022-08-22 13:02:18.732]Killed by external signal
.
22/08/22 13:02:18 WARN YarnSchedulerBackend$YarnSchedulerEndpo

hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_6_17.parquet


22/08/22 13:10:13 ERROR YarnScheduler: Lost executor 335 on ithdp3104.cern.ch: Container from a bad node: container_e143_1660901733081_3307_01_000387 on host: ithdp3104.cern.ch. Exit status: 137. Diagnostics: [2022-08-22 13:10:13.365]Container killed on request. Exit code is 137
[2022-08-22 13:10:13.421]Container exited with a non-zero exit code 137. 
[2022-08-22 13:10:13.422]Killed by external signal
.
22/08/22 13:10:13 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Requesting driver to remove executor 335 for reason Container from a bad node: container_e143_1660901733081_3307_01_000387 on host: ithdp3104.cern.ch. Exit status: 137. Diagnostics: [2022-08-22 13:10:13.365]Container killed on request. Exit code is 137
[2022-08-22 13:10:13.421]Container exited with a non-zero exit code 137. 
[2022-08-22 13:10:13.422]Killed by external signal
.
22/08/22 13:10:13 WARN TaskSetManager: Lost task 54.0 in stage 3081.0 (TID 187701) (ithdp3104.cern.ch executor 335): ExecutorLostFailure (executor

hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_6_18.parquet


22/08/22 13:20:30 WARN TaskSetManager: Lost task 48.0 in stage 3122.0 (TID 189916) (ithdp5005.cern.ch executor 317): FetchFailed(BlockManagerId(317, ithdp5005.cern.ch, 5106, None), shuffleId=789, mapIndex=159, mapId=159, reduceId=48, message=
org.apache.spark.shuffle.FetchFailedException: Block shuffle_789_159_48 is corrupted due to DISK_ISSUE
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1165)
	at org.apache.spark.storage.BufferReleasingInputStream.tryOrFetchFailedException(ShuffleBlockFetcherIterator.scala:1319)
	at org.apache.spark.storage.BufferReleasingInputStream.read(ShuffleBlockFetcherIterator.scala:1299)
	at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
	at java.io.BufferedInputStream.read(BufferedInputStream.java:265)
	at java.io.DataInputStream.readInt(DataInputStream.java:387)
	at org.apache.spark.sql.execution.UnsafeRowSerializerInstance$$anon$2$$anon$3.readSize(UnsafeRowSerializer.scal

hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_6_19.parquet


22/08/22 13:26:27 WARN TaskSetManager: Lost task 5.0 in stage 3162.0 (TID 192063) (ithdp5005.cern.ch executor 365): FetchFailed(BlockManagerId(317, ithdp5005.cern.ch, 7337, None), shuffleId=799, mapIndex=105, mapId=105, reduceId=5, message=
org.apache.spark.shuffle.FetchFailedException: Block shuffle_799_105_5 is corrupted but the cause is unknown
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1165)
	at org.apache.spark.storage.BufferReleasingInputStream.tryOrFetchFailedException(ShuffleBlockFetcherIterator.scala:1319)
	at org.apache.spark.storage.BufferReleasingInputStream.read(ShuffleBlockFetcherIterator.scala:1299)
	at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
	at java.io.BufferedInputStream.read(BufferedInputStream.java:265)
	at java.io.DataInputStream.readInt(DataInputStream.java:387)
	at org.apache.spark.sql.execution.UnsafeRowSerializerInstance$$anon$2$$anon$3.readSize(UnsafeRowSerializer.

hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_6_20.parquet


22/08/22 13:32:25 WARN TaskSetManager: Lost task 13.0 in stage 3202.0 (TID 194480) (ithdp5005.cern.ch executor 365): FetchFailed(BlockManagerId(367, ithdp5005.cern.ch, 7337, None), shuffleId=809, mapIndex=393, mapId=393, reduceId=13, message=
org.apache.spark.shuffle.FetchFailedException: Block shuffle_809_393_13 is corrupted but the cause is unknown
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1165)
	at org.apache.spark.storage.BufferReleasingInputStream.tryOrFetchFailedException(ShuffleBlockFetcherIterator.scala:1319)
	at org.apache.spark.storage.BufferReleasingInputStream.read(ShuffleBlockFetcherIterator.scala:1299)
	at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
	at java.io.BufferedInputStream.read1(BufferedInputStream.java:286)
	at java.io.BufferedInputStream.read(BufferedInputStream.java:345)
	at java.io.DataInputStream.read(DataInputStream.java:149)
	at org.sparkproject.guava.io.ByteStream

hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_6_21.parquet


22/08/22 13:39:13 WARN TaskSetManager: Lost task 43.0 in stage 3242.0 (TID 197121) (ithdp3107.cern.ch executor 396): FetchFailed(BlockManagerId(317, ithdp5005.cern.ch, 7337, None), shuffleId=819, mapIndex=170, mapId=170, reduceId=43, message=
org.apache.spark.shuffle.FetchFailedException: Block shuffle_819_170_43 is corrupted but the cause is unknown
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1165)
	at org.apache.spark.storage.BufferReleasingInputStream.tryOrFetchFailedException(ShuffleBlockFetcherIterator.scala:1319)
	at org.apache.spark.storage.BufferReleasingInputStream.read(ShuffleBlockFetcherIterator.scala:1299)
	at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
	at java.io.BufferedInputStream.read(BufferedInputStream.java:265)
	at java.io.DataInputStream.readInt(DataInputStream.java:387)
	at org.apache.spark.sql.execution.UnsafeRowSerializerInstance$$anon$2$$anon$3.readSize(UnsafeRowSerializ

hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_6_22.parquet


22/08/22 13:45:21 WARN Client: Exception encountered while connecting to the server 
org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ipc.StandbyException): Operation category READ is not supported in state standby. Visit https://s.apache.org/sbnn-error
	at org.apache.hadoop.security.SaslRpcClient.saslConnect(SaslRpcClient.java:376)
	at org.apache.hadoop.ipc.Client$Connection.setupSaslConnection(Client.java:623)
	at org.apache.hadoop.ipc.Client$Connection.access$2300(Client.java:414)
	at org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:832)
	at org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:828)
	at java.base/java.security.AccessController.doPrivileged(Native Method)
	at java.base/javax.security.auth.Subject.doAs(Subject.java:423)
	at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1878)
	at org.apache.hadoop.ipc.Client$Connection.setupIOstreams(Client.java:828)
	at org.apache.hadoop.ipc.Client$Connection.access$3800(Client.java

hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_6_23.parquet


22/08/22 13:54:36 ERROR YarnScheduler: Lost executor 433 on ithdp1110.cern.ch: Container from a bad node: container_e144_1660901733081_3307_01_000014 on host: ithdp1110.cern.ch. Exit status: 154. Diagnostics: [2022-08-22 13:54:35.613]Container exited with a non-zero exit code 154
[2022-08-22 13:54:35.613]Container exited with a non-zero exit code 154
.
22/08/22 13:54:36 WARN TaskSetManager: Lost task 207.0 in stage 3318.0 (TID 200442) (ithdp1110.cern.ch executor 433): ExecutorLostFailure (executor 433 exited caused by one of the running tasks) Reason: Container from a bad node: container_e144_1660901733081_3307_01_000014 on host: ithdp1110.cern.ch. Exit status: 154. Diagnostics: [2022-08-22 13:54:35.613]Container exited with a non-zero exit code 154
[2022-08-22 13:54:35.613]Container exited with a non-zero exit code 154
.
22/08/22 13:54:36 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Requesting driver to remove executor 433 for reason Container from a bad node: container_e144_16609

hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_6_24.parquet
hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_6_25.parquet
hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_6_26.parquet


22/08/22 14:11:48 WARN TransportChannelHandler: Exception in connection from /188.185.29.202:45583
java.io.IOException: Connection reset by peer
	at java.base/sun.nio.ch.FileDispatcherImpl.read0(Native Method)
	at java.base/sun.nio.ch.SocketDispatcher.read(SocketDispatcher.java:39)
	at java.base/sun.nio.ch.IOUtil.readIntoNativeBuffer(IOUtil.java:276)
	at java.base/sun.nio.ch.IOUtil.read(IOUtil.java:233)
	at java.base/sun.nio.ch.IOUtil.read(IOUtil.java:223)
	at java.base/sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:356)
	at io.netty.buffer.PooledByteBuf.setBytes(PooledByteBuf.java:253)
	at io.netty.buffer.AbstractByteBuf.writeBytes(AbstractByteBuf.java:1132)
	at io.netty.channel.socket.nio.NioSocketChannel.doReadBytes(NioSocketChannel.java:350)
	at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:151)
	at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:719)
	at io.netty.channel.nio.NioEventLoop.processSelectedK

hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_6_27.parquet
hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_6_28.parquet


22/08/22 14:21:22 ERROR YarnScheduler: Lost executor 457 on ithdp1113.cern.ch: Container from a bad node: container_e145_1660901733081_3307_02_000004 on host: ithdp1113.cern.ch. Exit status: 154. Diagnostics: [2022-08-22 14:21:21.015]Container exited with a non-zero exit code 154
[2022-08-22 14:21:21.015]Container exited with a non-zero exit code 154
.
22/08/22 14:21:22 WARN TaskSetManager: Lost task 299.0 in stage 3500.0 (TID 210511) (ithdp1113.cern.ch executor 457): ExecutorLostFailure (executor 457 exited caused by one of the running tasks) Reason: Container from a bad node: container_e145_1660901733081_3307_02_000004 on host: ithdp1113.cern.ch. Exit status: 154. Diagnostics: [2022-08-22 14:21:21.015]Container exited with a non-zero exit code 154
[2022-08-22 14:21:21.015]Container exited with a non-zero exit code 154
.
22/08/22 14:21:22 WARN TaskSetManager: Lost task 109.0 in stage 3500.0 (TID 210421) (ithdp1113.cern.ch executor 457): ExecutorLostFailure (executor 457 exited caused 

hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_6_29.parquet


22/08/22 14:32:12 WARN TaskSetManager: Lost task 1.0 in stage 3544.0 (TID 214139) (ithdp1111.cern.ch executor 468): FetchFailed(BlockManagerId(471, ithdp2108.cern.ch, 7337, None), shuffleId=899, mapIndex=69, mapId=69, reduceId=1, message=
org.apache.spark.shuffle.FetchFailedException: Block shuffle_899_69_1 is corrupted but the cause is unknown
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1165)
	at org.apache.spark.storage.BufferReleasingInputStream.tryOrFetchFailedException(ShuffleBlockFetcherIterator.scala:1319)
	at org.apache.spark.storage.BufferReleasingInputStream.read(ShuffleBlockFetcherIterator.scala:1299)
	at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
	at java.io.BufferedInputStream.read(BufferedInputStream.java:265)
	at java.io.DataInputStream.readInt(DataInputStream.java:387)
	at org.apache.spark.sql.execution.UnsafeRowSerializerInstance$$anon$2$$anon$3.readSize(UnsafeRowSerializer.sca

Py4JJavaError: An error occurred while calling o10630.parquet.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.jobAbortedError(QueryExecutionErrors.scala:496)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:251)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:186)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:110)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:110)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:106)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:481)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:82)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:481)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:457)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:106)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:93)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:91)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:128)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:848)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:382)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:355)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:781)
	at jdk.internal.reflect.GeneratedMethodAccessor95.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: ShuffleMapStage 3544 (parquet at <unknown>:0) has failed the maximum allowable number of times: 4. Most recent failure reason:
org.apache.spark.shuffle.FetchFailedException: Block shuffle_899_299_3 is corrupted but the cause is unknown
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1165)
	at org.apache.spark.storage.BufferReleasingInputStream.tryOrFetchFailedException(ShuffleBlockFetcherIterator.scala:1319)
	at org.apache.spark.storage.BufferReleasingInputStream.read(ShuffleBlockFetcherIterator.scala:1299)
	at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
	at java.io.BufferedInputStream.read1(BufferedInputStream.java:286)
	at java.io.BufferedInputStream.read(BufferedInputStream.java:345)
	at java.io.DataInputStream.read(DataInputStream.java:149)
	at org.sparkproject.guava.io.ByteStreams.read(ByteStreams.java:899)
	at org.sparkproject.guava.io.ByteStreams.readFully(ByteStreams.java:733)
	at org.apache.spark.sql.execution.UnsafeRowSerializerInstance$$anon$2$$anon$3.next(UnsafeRowSerializer.scala:127)
	at org.apache.spark.sql.execution.UnsafeRowSerializerInstance$$anon$2$$anon$3.next(UnsafeRowSerializer.scala:110)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:496)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at org.apache.spark.InterruptibleIterator.next(InterruptibleIterator.scala:40)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage7.sort_addToSorter_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage7.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:759)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage8.smj_findNextJoinRows_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage8.agg_doAggregateWithKeys_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage8.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$2.hasNext(WholeStageCodegenExec.scala:778)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.IOException: Stream is corrupted
	at net.jpountz.lz4.LZ4BlockInputStream.refill(LZ4BlockInputStream.java:259)
	at net.jpountz.lz4.LZ4BlockInputStream.read(LZ4BlockInputStream.java:157)
	at org.apache.spark.storage.BufferReleasingInputStream.$anonfun$read$3(ShuffleBlockFetcherIterator.scala:1299)
	at scala.runtime.java8.JFunction0$mcI$sp.apply(JFunction0$mcI$sp.java:23)
	at org.apache.spark.storage.BufferReleasingInputStream.tryOrFetchFailedException(ShuffleBlockFetcherIterator.scala:1310)
	... 35 more

	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2454)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2403)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2402)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2402)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:1843)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2639)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2584)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2573)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)


22/08/22 14:32:28 WARN TaskSetManager: Lost task 319.0 in stage 3540.0 (TID 214116) (ithdp1111.cern.ch executor 468): TaskKilled (Stage cancelled)
22/08/22 14:32:28 WARN TaskSetManager: Lost task 237.0 in stage 3540.0 (TID 214108) (ithdp1111.cern.ch executor 468): TaskKilled (Stage cancelled)
22/08/22 14:32:28 WARN TaskSetManager: Lost task 323.0 in stage 3540.0 (TID 214117) (ithdp1111.cern.ch executor 468): TaskKilled (Stage cancelled)
22/08/22 14:32:28 WARN TaskSetManager: Lost task 101.0 in stage 3540.0 (TID 214012) (ithdp1111.cern.ch executor 472): TaskKilled (Stage cancelled)
22/08/22 14:32:28 WARN TaskSetManager: Lost task 157.0 in stage 3540.0 (TID 214036) (ithdp1111.cern.ch executor 472): TaskKilled (Stage cancelled)
22/08/22 14:32:28 WARN TaskSetManager: Lost task 119.0 in stage 3540.0 (TID 214031) (ithdp1111.cern.ch executor 472): TaskKilled (Stage cancelled)
22/08/22 14:32:28 WARN TaskSetManager: Lost task 140.0 in stage 3540.0 (TID 214032) (ithdp1111.cern.ch executor 472): 

In [1]:
!hdfs dfs -ls 'hdfs://analytix/cms/users/nphumekh'

22/08/23 02:10:14 WARN ipc.Client: Exception encountered while connecting to the server 
org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ipc.StandbyException): Operation category READ is not supported in state standby. Visit https://s.apache.org/sbnn-error
	at org.apache.hadoop.security.SaslRpcClient.saslConnect(SaslRpcClient.java:376)
	at org.apache.hadoop.ipc.Client$Connection.setupSaslConnection(Client.java:622)
	at org.apache.hadoop.ipc.Client$Connection.access$2300(Client.java:413)
	at org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:822)
	at org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:818)
	at java.base/java.security.AccessController.doPrivileged(Native Method)
	at java.base/javax.security.auth.Subject.doAs(Subject.java:423)
	at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1845)
	at org.apache.hadoop.ipc.Client$Connection.setupIOstreams(Client.java:818)
	at org.apache.hadoop.ipc.Client$Connection.access$3800(Client.

Try reading the saved file

In [4]:
try_df = spark.read.parquet("hdfs://analytix/cms/users/nphumekh/crab_dataset_datablock_size_2022_1_1.parquet")

In [5]:
try_df.printSchema()

root
 |-- CMSPrimaryDataTier: string (nullable = true)
 |-- DESIRED_CMSDataset: string (nullable = true)
 |-- Dataset_Size: double (nullable = true)
 |-- CRAB_DataBlock: string (nullable = true)
 |-- Block_Size: double (nullable = true)
 |-- CRAB_Workflow: string (nullable = true)
 |-- First_Access: long (nullable = true)
 |-- Last_Access: long (nullable = true)



In [6]:
try_df.show(20)

+------------------+--------------------+-------------------+--------------------+-----------------+--------------------+-------------+-------------+
|CMSPrimaryDataTier|  DESIRED_CMSDataset|       Dataset_Size|      CRAB_DataBlock|       Block_Size|       CRAB_Workflow| First_Access|  Last_Access|
+------------------+--------------------+-------------------+--------------------+-----------------+--------------------+-------------+-------------+
|              FEVT|/MinBias_TuneCP5_...|4.05859813189503E14|/MinBias_TuneCP5_...|4.998120004573E12|211230_002539:kbu...|1641012481000|1641056401000|
|               RAW|/SingleMuon/Run20...|3.80660439294442E14|/SingleMuon/Run20...|2.014125582609E12|211209_185647:mkr...|1641037681000|1641037681000|
|           MINIAOD|/ParkingBPH2/Run2...| 1.2872668679638E14|/ParkingBPH2/Run2...|  2.2399017025E11|211210_170625:kta...|1640991602000|1641077281000|
|           MINIAOD|/ParkingBPH5/Run2...|1.28647603163384E14|/ParkingBPH5/Run2...|  4.3339629534E10|