In [1]:
import os
import sys
import json
import codecs

import time
from datetime import datetime

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext, SQLContext
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark import SparkFiles
from pyspark.sql.functions import col, lit, length, row_number, when

In [3]:
from lib import spark_utils

In [4]:
spark = spark_utils.get_spark('test222')

25/03/28 17:09:01 WARN Utils: Your hostname, Mac-MD2XX1D4WV.local resolves to a loopback address: 127.0.0.1; using 192.168.11.215 instead (on interface en0)
25/03/28 17:09:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/28 17:09:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
spark

In [13]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000000)
pd.set_option('display.width', 4000)

In [14]:
import os
import glob
from pyspark.sql.window import Window
from pyspark.sql.functions import col, lit, length, row_number, when
from pyspark.sql.functions import avg, count, sum
from pyspark.sql.functions import upper, lower
from pyspark.sql.functions import substring, split
from pyspark.sql import functions
from pyspark.sql.types import IntegerType, StringType, LongType
from pyspark.sql.types import StructType, StructField

In [21]:
def cast_to_int_with_default(value):
    try:
        if value is None:
            return 0  # or any other default value you prefer
        else:
            return int(value)
    except ValueError:
        return 0

from pyspark.sql.functions import udf

udf_cast_to_int_with_default = udf(cast_to_int_with_default, IntegerType())

In [28]:
def get_joined_data_with_qt(spark, input_path, _year, _qurt):
    coverpage_ = spark.read.option(
        'delimiter', '\t').option(
        'header', True).csv(
        input_path % (_year, _qurt, 'COVERPAGE.tsv'))

    coverpage_ = coverpage_.withColumns({'YEAR': lit(_year), 'QUARTER': lit(_qurt)})

    infotable_ = spark.read.option(
        'delimiter', '\t').option(
        'header', True).csv(
        input_path % (_year, _qurt, 'INFOTABLE.tsv'))

    # cusup 合法性检查
    infotable_ = infotable_.withColumn('CUSIP', lower(col('CUSIP')))
    infotable_ = infotable_.filter((infotable_['SSHPRNAMTTYPE'] == 'SH') &
                                   (infotable_['PUTCALL'].isNull()) &
                                   (infotable_['CUSIP'] != '000000000') &
                                   (infotable_['CUSIP'] != '0000000na'))
    infotable_ = infotable_.filter(length(infotable_['CUSIP']) == 9)

    # 仅选取普通股
    infotable_ = infotable_.filter(upper(col("TITLEOFCLASS")).contains("COM"))
    infotable_ = infotable_.withColumns({
        'YEAR': lit(_year), 'QUARTER': lit(_qurt)})

    summarypage_ = spark.read.option('delimiter', '\t').option(
        'header', True).csv(input_path % (_year, _qurt, 'SUMMARYPAGE.tsv'))

    summarypage_ = summarypage_.withColumns({
        'YEAR': lit(_year),
        'QUARTER': lit(_qurt)})

    submission_ = spark.read.option(
        'delimiter', '\t').option(
        'header', True).csv(input_path % (_year, _qurt, 'SUBMISSION.tsv'))
    submission_ = submission_.withColumns({'YEAR': lit(_year), 'QUARTER': lit(_qurt)})

    joined_ = infotable_.join(
        coverpage_, (infotable_['ACCESSION_NUMBER'] == coverpage_['ACCESSION_NUMBER']) &
        (infotable_['YEAR'] == coverpage_['YEAR']) & (infotable_['QUARTER'] == coverpage_['QUARTER']),
        'left'
    ).join(
        summarypage_, (infotable_['ACCESSION_NUMBER'] == summarypage_['ACCESSION_NUMBER']) &
        (infotable_['YEAR'] == summarypage_['YEAR']) & (infotable_['QUARTER'] == summarypage_['QUARTER']),
        'left'
    ).join(
        submission_, (infotable_['ACCESSION_NUMBER'] == submission_['ACCESSION_NUMBER']) &
        (infotable_['YEAR'] == submission_['YEAR']) & (infotable_['QUARTER'] == submission_['QUARTER']),
        'left'
    ).drop(
        coverpage_['YEAR'], coverpage_['QUARTER'], coverpage_['ACCESSION_NUMBER'],
        summarypage_['YEAR'], summarypage_['QUARTER'], summarypage_['ACCESSION_NUMBER'],
        submission_['YEAR'], submission_['QUARTER'], submission_['ACCESSION_NUMBER'])

    joined_ = joined_.withColumns({
        'NAMEOFISSUER': upper(col('NAMEOFISSUER')),
        'FILINGMANAGER_NAME': upper(col('FILINGMANAGER_NAME')),
        'VALUE': udf_cast_to_int_with_default(col('VALUE')),
    })

    # 需要有一个去重判断,投资机构可能多次上传,13HR,13HR-A,...
    window_spec = Window.partitionBy(
        'CUSIP', 'FILINGMANAGER_NAME'
    ).orderBy(col('FILING_DATE').desc(), col('VALUE').desc())

    joined_ = joined_.withColumn('row_number', row_number().over(window_spec))
    joined_ = joined_.filter(col('row_number') == 1).drop('row_number')

    return joined_




In [29]:
def transfer_standard_unit(filter_data, year):
    filter_data = filter_data.withColumns({
        'VALUE': udf_cast_to_int_with_default(col('VALUE')),
        'SSHPRNAMT': udf_cast_to_int_with_default(col('SSHPRNAMT')),
    })
    # 2022年及以前的VALUE单位为千$，2023年及之后为$(实际从1月3号开始)
    filter_data = filter_data.withColumn(
        'VALUE', when(lit(year) >= 2023, col('VALUE')).otherwise(col('VALUE')*1000))
    return filter_data

In [30]:
def filter_data_by_share_value(filter_data, year):
    # 对空值或负值填0
    filter_data = filter_data.withColumns({
        "VALUE": when(col("VALUE").isNull() | (col("VALUE") < 0), 0).otherwise(col("VALUE")),
        "SSHPRNAMT": when(col("SSHPRNAMT").isNull() | (col("SSHPRNAMT") < 0), 0).otherwise(col("SSHPRNAMT")),
    })
    # 以股票为Key统计总被交易价值、效果量
    ticker_value = filter_data.groupby(['CUSIP']).agg(
        sum(col('VALUE')).alias('VALUE'),
        sum(col('SSHPRNAMT')).alias('SSHPRNAMT')
    ).withColumn(
        'VPSSH', col('VALUE') / col('SSHPRNAMT')
    ).filter(
        (col('VPSSH') > 0.00000000000001) & (col('VPSSH') < 100000000000.)
    )

    ticker_value = ticker_value.withColumnRenamed(
        'CUSIP', 'CUSIP_ticker'
    ).withColumnRenamed(
        'VALUE', 'VALUE_ticker'
    ).withColumnRenamed(
        'SSHPRNAMT', 'SSHPRNAMT_ticker'
    )

    # 如果基金持有成本明显偏离总平均成本，认为是脏数据，过滤掉
    filter_data = filter_data.join(
        ticker_value, filter_data['CUSIP'] == ticker_value['CUSIP_ticker'], 'left'
    ).drop('CUSIP_ticker', 'VALUE_ticker', 'SSHPRNAMT_ticker')

    filter_data = filter_data.filter(
        (col('VALUE') / col('SSHPRNAMT') > 0.1 * col('VPSSH'))
        & (col('VALUE') / col('SSHPRNAMT') < 10 * col('VPSSH')))

    filter_data = filter_data.drop('VPSSH')

    return filter_data



In [45]:
input_path = '/Users/liuda/Library/CloudStorage/Dropbox/shareit/code/trading/data/hedge/%sq%s_form13f/%s'

In [147]:
from pyspark.sql.functions import trim

In [176]:
cur_year='2024'
cur_qurt='3'

joined_cur_3 = get_joined_data_with_qt(spark, input_path, cur_year, cur_qurt)
joined_cur_3 = transfer_standard_unit(joined_cur_3, cur_year)
joined_cur_3 = filter_data_by_share_value(joined_cur_3, cur_year)

joined_cur_3 = joined_cur_3.withColumn('FILINGMANAGER_NAME', trim(lower(col('FILINGMANAGER_NAME'))))

joined_cur_3 = joined_cur_3.dropna(subset=['CUSIP', 'FILINGMANAGER_NAME'])

In [177]:
pre_year ='2024'
pre_qurt ='2'

joined_cur_2 = get_joined_data_with_qt(spark, input_path, pre_year, pre_qurt)
joined_cur_2 = transfer_standard_unit(joined_cur_2, pre_year)
joined_cur_2 = filter_data_by_share_value(joined_cur_2, pre_year)

joined_cur_2 = joined_cur_2.withColumn('FILINGMANAGER_NAME', trim(lower(col('FILINGMANAGER_NAME'))))

joined_cur_2 = joined_cur_2.dropna(subset=['CUSIP', 'FILINGMANAGER_NAME'])

In [178]:
joined_cur_3.persist()
joined_cur_2.persist()

25/03/19 16:48:21 WARN CacheManager: Asked to cache already cached data.
25/03/19 16:48:21 WARN CacheManager: Asked to cache already cached data.


DataFrame[ACCESSION_NUMBER: string, INFOTABLE_SK: string, NAMEOFISSUER: string, TITLEOFCLASS: string, CUSIP: string, FIGI: string, VALUE: int, SSHPRNAMT: int, SSHPRNAMTTYPE: string, PUTCALL: string, INVESTMENTDISCRETION: string, OTHERMANAGER: string, VOTING_AUTH_SOLE: string, VOTING_AUTH_SHARED: string, VOTING_AUTH_NONE: string, YEAR: string, QUARTER: string, REPORTCALENDARORQUARTER: string, ISAMENDMENT: string, AMENDMENTNO: string, AMENDMENTTYPE: string, CONFDENIEDEXPIRED: string, DATEDENIEDEXPIRED: string, DATEREPORTED: string, REASONFORNONCONFIDENTIALITY: string, FILINGMANAGER_NAME: string, FILINGMANAGER_STREET1: string, FILINGMANAGER_STREET2: string, FILINGMANAGER_CITY: string, FILINGMANAGER_STATEORCOUNTRY: string, FILINGMANAGER_ZIPCODE: string, REPORTTYPE: string, FORM13FFILENUMBER: string, CRDNUMBER: string, SECFILENUMBER: string, PROVIDEINFOFORINSTRUCTION5: string, ADDITIONALINFORMATION: string, OTHERINCLUDEDMANAGERSCOUNT: string, TABLEENTRYTOTAL: string, TABLEVALUETOTAL: string

In [179]:
joined_cur_3.filter(col('CUSIP')=='874039100').groupby(['YEAR', 'QUARTER','CUSIP']).agg(sum('VALUE')).show()
joined_cur_2.filter(col('CUSIP')=='874039100').groupby(['YEAR', 'QUARTER','CUSIP']).agg(sum('VALUE')).show()

+----+-------+---------+----------+
|YEAR|QUARTER|    CUSIP|sum(VALUE)|
+----+-------+---------+----------+
|2024|      3|874039100|2978303968|
+----+-------+---------+----------+

+----+-------+---------+----------+
|YEAR|QUARTER|    CUSIP|sum(VALUE)|
+----+-------+---------+----------+
|2024|      2|874039100|1752165162|
+----+-------+---------+----------+



In [187]:
joined_pre = joined_cur_2.select(['NAMEOFISSUER', 'FILINGMANAGER_NAME', 'CUSIP',
                                'YEAR', 'QUARTER', 'VALUE',
                                'SSHPRNAMTTYPE', 'SSHPRNAMT',
                                'FILING_DATE'])
joined_pre.filter(col('CUSIP')=='874039100').groupby(['YEAR', 'QUARTER','CUSIP']).agg(sum('VALUE')).show()

joined_pre = joined_pre.withColumnRenamed(
        "NAMEOFISSUER", "PRENAMEOFISSUER").withColumnRenamed(
        "FILINGMANAGER_NAME", "PREFILINGMANAGER_NAME").withColumnRenamed(
        "CUSIP", "PRECUSIP").withColumnRenamed(
        "VALUE", "PREVALUE").withColumnRenamed(
        'YEAR', 'PREYEAR').withColumnRenamed(
        'QUARTER', 'PREQUARTER').withColumnRenamed(
        'SSHPRNAMTTYPE', 'PRESSHPRNAMTTYPE').withColumnRenamed(
        'SSHPRNAMT', 'PRESSHPRNAMT').withColumnRenamed(
        'FILING_DATE', 'PRE_FILING_DATE')

# 至此，能保证多次跑数据joined_cur和joined_pre不存在随机性，数据完全一致
joined_pre.filter(col('PRECUSIP')=='874039100').groupby(['PREYEAR', 'PREQUARTER','PRECUSIP']).agg(sum('PREVALUE')).show()
print("joined_pre.filter(col('PRECUSIP')=='874039100').groupby(['PREYEAR', 'PREQUARTER','PRECUSIP']).agg(sum('PREVALUE')).show()")

joined_cur_3_tmp = joined_cur_3.join(
    joined_pre, (joined_cur_3['FILINGMANAGER_NAME'] == joined_pre['PREFILINGMANAGER_NAME']) &
    (joined_cur_3['CUSIP'] == joined_pre['PRECUSIP']),
    'outer'  # 这里因为要求PREVALUE，所以应该为outer
)

# joined_cur_3_tmp.limit(3).toPandas()
joined_cur_3_tmp = joined_cur_3_tmp.withColumns({
    "ADDITIONALINFORMATION": substring("ADDITIONALINFORMATION", 0, 2000),
    "FILINGMANAGER_NAME": when(col("FILINGMANAGER_NAME").isNotNull(), col("FILINGMANAGER_NAME")).otherwise(col("PREFILINGMANAGER_NAME")),
    "CUSIP": when(col("CUSIP").isNotNull(), col("CUSIP")).otherwise(col("PRECUSIP")),
    "YEAR": when(col("YEAR").isNotNull(), col("YEAR")).otherwise(lit(cur_year)),
    "QUARTER": when(col("QUARTER").isNotNull(), col("QUARTER")).otherwise(lit(cur_qurt)),
    "PREYEAR": when(col("PREYEAR").isNotNull(), col("PREYEAR")).otherwise(lit(pre_year)),
    "PREQUARTER": when(col("PREQUARTER").isNotNull(), col("PREQUARTER")).otherwise(lit(pre_qurt)),
    "VALUE": when(col("VALUE").isNull() | (col("VALUE") < 0), 0).otherwise(col("VALUE")),
    "PREVALUE": when(col("PREVALUE").isNull() | (col("PREVALUE") < 0), 0).otherwise(col("PREVALUE")),
    "SSHPRNAMT": when(col("SSHPRNAMT").isNull() | (col("SSHPRNAMT") < 0), 0).otherwise(col("SSHPRNAMT")),
    "PRESSHPRNAMT": when(col("PRESSHPRNAMT").isNull() | (col("PRESSHPRNAMT") < 0), 0).otherwise(col("PRESSHPRNAMT")),
}).drop("PRECUSIP", "PRENAMEOFISSUER")

joined_cur_3_tmp.filter(col('CUSIP')=='874039100').groupby(['YEAR', 'QUARTER','CUSIP']).agg(sum('VALUE'), sum('PREVALUE')).show()

+----+-------+---------+----------+
|YEAR|QUARTER|    CUSIP|sum(VALUE)|
+----+-------+---------+----------+
|2024|      2|874039100|1752165162|
+----+-------+---------+----------+

+-------+----------+---------+-------------+
|PREYEAR|PREQUARTER| PRECUSIP|sum(PREVALUE)|
+-------+----------+---------+-------------+
|   2024|         2|874039100|   1752165162|
+-------+----------+---------+-------------+

joined_pre.filter(col('PRECUSIP')=='874039100').groupby(['PREYEAR', 'PREQUARTER','PRECUSIP']).agg(sum('PREVALUE')).show()


25/03/19 16:58:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/19 16:58:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/19 16:58:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/19 16:58:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/19 16:58:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/19 16:58:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/19 16:58:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/19 16:58:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

+----+-------+---------+----------+-------------+
|YEAR|QUARTER|    CUSIP|sum(VALUE)|sum(PREVALUE)|
+----+-------+---------+----------+-------------+
|2024|      3|874039100|2978303968|   1752165162|
+----+-------+---------+----------+-------------+



                                                                                

In [181]:
joined_cur_3_tmp.printSchema()

root
 |-- ACCESSION_NUMBER: string (nullable = true)
 |-- INFOTABLE_SK: string (nullable = true)
 |-- NAMEOFISSUER: string (nullable = true)
 |-- TITLEOFCLASS: string (nullable = true)
 |-- CUSIP: string (nullable = true)
 |-- FIGI: string (nullable = true)
 |-- VALUE: integer (nullable = true)
 |-- SSHPRNAMT: integer (nullable = true)
 |-- SSHPRNAMTTYPE: string (nullable = true)
 |-- PUTCALL: string (nullable = true)
 |-- INVESTMENTDISCRETION: string (nullable = true)
 |-- OTHERMANAGER: string (nullable = true)
 |-- VOTING_AUTH_SOLE: string (nullable = true)
 |-- VOTING_AUTH_SHARED: string (nullable = true)
 |-- VOTING_AUTH_NONE: string (nullable = true)
 |-- YEAR: string (nullable = true)
 |-- QUARTER: string (nullable = true)
 |-- REPORTCALENDARORQUARTER: string (nullable = true)
 |-- ISAMENDMENT: string (nullable = true)
 |-- AMENDMENTNO: string (nullable = true)
 |-- AMENDMENTTYPE: string (nullable = true)
 |-- CONFDENIEDEXPIRED: string (nullable = true)
 |-- DATEDENIEDEXPIRED: st

In [6]:
df = spark.read.option(
        'delimiter', '\t').option(
        'header', True).csv('/Users/liuda/Local/data/trading/data/hedge/01mar2024-31may2024_form13f/INFOTABLE.tsv')


In [None]:
df = df.withColumns({'CIK': F.split(col('ACCESSION_NUMBER'), '-')[0]})


In [10]:
df.printSchema()

root
 |-- ACCESSION_NUMBER: string (nullable = true)
 |-- INFOTABLE_SK: string (nullable = true)
 |-- NAMEOFISSUER: string (nullable = true)
 |-- TITLEOFCLASS: string (nullable = true)
 |-- CUSIP: string (nullable = true)
 |-- FIGI: string (nullable = true)
 |-- VALUE: string (nullable = true)
 |-- SSHPRNAMT: string (nullable = true)
 |-- SSHPRNAMTTYPE: string (nullable = true)
 |-- PUTCALL: string (nullable = true)
 |-- INVESTMENTDISCRETION: string (nullable = true)
 |-- OTHERMANAGER: string (nullable = true)
 |-- VOTING_AUTH_SOLE: string (nullable = true)
 |-- VOTING_AUTH_SHARED: string (nullable = true)
 |-- VOTING_AUTH_NONE: string (nullable = true)
 |-- CIK: string (nullable = true)



In [11]:
coverage = spark.read.option(
        'delimiter', '\t').option(
        'header', True).csv('/Users/liuda/Local/data/trading/data/hedge/01mar2024-31may2024_form13f/COVERPAGE.tsv')

In [15]:
join_df = df.join(
coverage, (df['ACCESSION_NUMBER'] == coverage['ACCESSION_NUMBER']),
'left'
).drop(coverage['ACCESSION_NUMBER'])

In [18]:
df_cols = join_df.select(['CIK', 'ACCESSION_NUMBER', 'FILINGMANAGER_NAME'])

In [31]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000000)
pd.set_option('display.width', 4000)

In [37]:
df_cols1 = df_cols.groupby(['CIK']).agg(
	F.collect_set('FILINGMANAGER_NAME').alias('NAME_set'),
).withColumn('NAME_count', F.size(F.col('NAME_set')))
df_cols1.filter(col('NAME_count')>1).limit(10).toPandas().to_dict(orient='records')

                                                                                

[{'CIK': '0000038777',
  'NAME_set': ['ClearBridge Investments Ltd',
   'MARTIN CURRIE LTD',
   'FRANKLIN RESOURCES INC'],
  'NAME_count': 3},
 {'CIK': '0000080255',
  'NAME_set': ['PRICE T ROWE ASSOCIATES INC /MD/',
   'T. Rowe Price Investment Management, Inc.'],
  'NAME_count': 2},
 {'CIK': '0000318989', 'NAME_set': ['FIL Ltd', 'FMR LLC'], 'NAME_count': 2},
 {'CIK': '0000732812',
  'NAME_set': ['Capital World Investors',
   'Capital International, Inc./CA/',
   'Capital Group Private Client Services, Inc.',
   'CAPITAL INTERNATIONAL LTD /CA/',
   'CAPITAL GROUP INVESTMENT MANAGEMENT PTE. LTD.',
   'Capital Research Global Investors',
   'CAPITAL INTERNATIONAL SARL',
   'Capital International Investors'],
  'NAME_count': 8},
 {'CIK': '0000825293',
  'NAME_set': ['Wellington Shields & Co., LLC',
   'Wellington Shields Capital Management, LLC',
   'CAPITAL MANAGEMENT ASSOCIATES /NY/'],
  'NAME_count': 3},
 {'CIK': '0000892712',
  'NAME_set': ['Spectrum Investment Advisors, Inc.',
   'M

In [38]:
df_cols2 = df_cols.groupby(['FILINGMANAGER_NAME']).agg(
	F.collect_set('CIK').alias('CIK_set'),
).withColumn('CIK_count', F.size(F.col('CIK_set')))

In [39]:
df_cols2.filter(col('CIK_count')>1).limit(10).toPandas().to_dict(orient='records')

                                                                                

[{'FILINGMANAGER_NAME': 'A.P. Gilfoyle & Co., L.P.',
  'CIK_set': ['0002008868', '0002014045'],
  'CIK_count': 2},
 {'FILINGMANAGER_NAME': 'ABLES, IANNONE, MOORE & ASSOCIATES, INC.',
  'CIK_set': ['0001398344', '0001213900'],
  'CIK_count': 2},
 {'FILINGMANAGER_NAME': 'Empowered Funds, LLC',
  'CIK_set': ['0001592828', '0001572838'],
  'CIK_count': 2},
 {'FILINGMANAGER_NAME': 'Federation des caisses Desjardins du Quebec',
  'CIK_set': ['0002022299', '0002022297'],
  'CIK_count': 2},
 {'FILINGMANAGER_NAME': 'LRI Investments, LLC',
  'CIK_set': ['0002023325', '0001965796'],
  'CIK_count': 2},
 {'FILINGMANAGER_NAME': 'MetLife Investment Management, LLC',
  'CIK_set': ['0001628280', '0000905148'],
  'CIK_count': 2},
 {'FILINGMANAGER_NAME': 'SONA ASSET MANAGEMENT (US) LLC',
  'CIK_set': ['0001856405', '0001894188'],
  'CIK_count': 2},
 {'FILINGMANAGER_NAME': 'Thrive Capital Management, LLC',
  'CIK_set': ['0001845943', '0001012975'],
  'CIK_count': 2},
 {'FILINGMANAGER_NAME': 'Transcendent 

25/03/28 18:54:17 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 951416 ms exceeds timeout 120000 ms
25/03/28 18:54:17 WARN SparkContext: Killing executors is not supported by current scheduler.
25/03/28 18:54:18 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$