In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import functions as F
from pyspark.sql import types

import os
import pandas as pd
import matplotlib.pyplot as plt
from dotenv import load_dotenv

Configuration

In [2]:
load_dotenv()
credentials_location = os.getenv('GCP_CREDENTIALS_LOCATION')
gcp_bucket_name = os.getenv('GCP_BUCKET')

In [3]:
conf = SparkConf() \
    .setMaster('local[*]') \
    .setAppName('test') \
    .set("spark.jars", "/Users/Manu/lib/gcs-connector-hadoop3-2.2.5.jar") \
    .set("spark.hadoop.google.cloud.auth.service.account.enable", "true") \
    .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", credentials_location)

Context

In [4]:
sc = SparkContext(conf=conf)

hadoop_conf = sc._jsc.hadoopConfiguration()

hadoop_conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hadoop_conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_conf.set("fs.gs.auth.service.account.json.keyfile", credentials_location)
hadoop_conf.set("fs.gs.auth.service.account.enable", 'true')

22/04/13 13:49:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


Session

In [5]:
spark = SparkSession.builder \
    .config("spark.jars.packages", "com.crealytics:spark-excel_2.11:0.12.2", conf=sc.getConf()) \
    .getOrCreate()

Reading data from google cloug storage

In [6]:
cot = spark.read \
    .text(f'gs://{gcp_bucket_name}/raw/*')

In [7]:
cot.show(5)

[Stage 0:>                                                          (0 + 1) / 1]

+--------------------+
|               value|
+--------------------+
|"Market_and_Excha...|
|"CANADIAN DOLLAR ...|
|"CANADIAN DOLLAR ...|
|"CANADIAN DOLLAR ...|
|"CANADIAN DOLLAR ...|
+--------------------+
only showing top 5 rows



                                                                                

In [8]:
cot.count()

                                                                                

24140

In [8]:
cot.collect()[:2]

                                                                                

[Row(value='"Market_and_Exchange_Names","As_of_Date_In_Form_YYMMDD","Report_Date_as_YYYY-MM-DD","CFTC_Contract_Market_Code","CFTC_Market_Code","CFTC_Region_Code","CFTC_Commodity_Code","Open_Interest_All","Dealer_Positions_Long_All","Dealer_Positions_Short_All","Dealer_Positions_Spread_All","Asset_Mgr_Positions_Long_All","Asset_Mgr_Positions_Short_All","Asset_Mgr_Positions_Spread_All","Lev_Money_Positions_Long_All","Lev_Money_Positions_Short_All","Lev_Money_Positions_Spread_All","Other_Rept_Positions_Long_All","Other_Rept_Positions_Short_All","Other_Rept_Positions_Spread_All","Tot_Rept_Positions_Long_All","Tot_Rept_Positions_Short_All","NonRept_Positions_Long_All","NonRept_Positions_Short_All","Change_in_Open_Interest_All","Change_in_Dealer_Long_All","Change_in_Dealer_Short_All","Change_in_Dealer_Spread_All","Change_in_Asset_Mgr_Long_All","Change_in_Asset_Mgr_Short_All","Change_in_Asset_Mgr_Spread_All","Change_in_Lev_Money_Long_All","Change_in_Lev_Money_Short_All","Change_in_Lev_Money_S

1. All colums have been combined into one column called `value`
2. Need to split the column

Extract the column names from the row object

In [7]:
col_row = cot.collect()[0].asDict()['value']
print(col_row)

                                                                                

"Market_and_Exchange_Names","As_of_Date_In_Form_YYMMDD","Report_Date_as_YYYY-MM-DD","CFTC_Contract_Market_Code","CFTC_Market_Code","CFTC_Region_Code","CFTC_Commodity_Code","Open_Interest_All","Dealer_Positions_Long_All","Dealer_Positions_Short_All","Dealer_Positions_Spread_All","Asset_Mgr_Positions_Long_All","Asset_Mgr_Positions_Short_All","Asset_Mgr_Positions_Spread_All","Lev_Money_Positions_Long_All","Lev_Money_Positions_Short_All","Lev_Money_Positions_Spread_All","Other_Rept_Positions_Long_All","Other_Rept_Positions_Short_All","Other_Rept_Positions_Spread_All","Tot_Rept_Positions_Long_All","Tot_Rept_Positions_Short_All","NonRept_Positions_Long_All","NonRept_Positions_Short_All","Change_in_Open_Interest_All","Change_in_Dealer_Long_All","Change_in_Dealer_Short_All","Change_in_Dealer_Spread_All","Change_in_Asset_Mgr_Long_All","Change_in_Asset_Mgr_Short_All","Change_in_Asset_Mgr_Spread_All","Change_in_Lev_Money_Long_All","Change_in_Lev_Money_Short_All","Change_in_Lev_Money_Spread_All","

Split the `col_row` to get a list of columns and remove the extra quotation marks

In [8]:
cols_cleaned = []

cols = col_row.split(",")
for cl in cols:
    cols_cleaned.append(cl[1:-1])
    
cols_cleaned

['Market_and_Exchange_Names',
 'As_of_Date_In_Form_YYMMDD',
 'Report_Date_as_YYYY-MM-DD',
 'CFTC_Contract_Market_Code',
 'CFTC_Market_Code',
 'CFTC_Region_Code',
 'CFTC_Commodity_Code',
 'Open_Interest_All',
 'Dealer_Positions_Long_All',
 'Dealer_Positions_Short_All',
 'Dealer_Positions_Spread_All',
 'Asset_Mgr_Positions_Long_All',
 'Asset_Mgr_Positions_Short_All',
 'Asset_Mgr_Positions_Spread_All',
 'Lev_Money_Positions_Long_All',
 'Lev_Money_Positions_Short_All',
 'Lev_Money_Positions_Spread_All',
 'Other_Rept_Positions_Long_All',
 'Other_Rept_Positions_Short_All',
 'Other_Rept_Positions_Spread_All',
 'Tot_Rept_Positions_Long_All',
 'Tot_Rept_Positions_Short_All',
 'NonRept_Positions_Long_All',
 'NonRept_Positions_Short_All',
 'Change_in_Open_Interest_All',
 'Change_in_Dealer_Long_All',
 'Change_in_Dealer_Short_All',
 'Change_in_Dealer_Spread_All',
 'Change_in_Asset_Mgr_Long_All',
 'Change_in_Asset_Mgr_Short_All',
 'Change_in_Asset_Mgr_Spread_All',
 'Change_in_Lev_Money_Long_All',
 '

Generating new columns

In [9]:
cot_split = cot.select("*") # Copy the data frame

#split the initial column "value"
split_cols = F.split(cot_split['value'], ",")

for key, value in enumerate(cols_cleaned):
    cot_split = cot_split.withColumn(value, split_cols.getItem(key))

cot_split.select(cols_cleaned[:4]).show()

[Stage 1:>                                                          (0 + 1) / 1]

+-------------------------+-------------------------+-------------------------+-------------------------+
|Market_and_Exchange_Names|As_of_Date_In_Form_YYMMDD|Report_Date_as_YYYY-MM-DD|CFTC_Contract_Market_Code|
+-------------------------+-------------------------+-------------------------+-------------------------+
|     "Market_and_Excha...|     "As_of_Date_In_Fo...|     "Report_Date_as_Y...|     "CFTC_Contract_Ma...|
|     "CANADIAN DOLLAR ...|                   211228|               2021-12-28|                   090741|
|     "CANADIAN DOLLAR ...|                   211221|               2021-12-21|                   090741|
|     "CANADIAN DOLLAR ...|                   211214|               2021-12-14|                   090741|
|     "CANADIAN DOLLAR ...|                   211207|               2021-12-07|                   090741|
|     "CANADIAN DOLLAR ...|                   211130|               2021-11-30|                   090741|
|     "CANADIAN DOLLAR ...|                   

                                                                                

Add index column, use index column to filter out the first row, drop the first column and index column

In [10]:
# Add index column
cot_split = cot_split.withColumn('index', F.monotonically_increasing_id())

# filter out first column and drop value and index columns
cot_split = cot_split \
        .filter(cot_split['index'] >= 1) \
        .drop('value', 'index')


# Removing any leading or trailing spaces
for col_name in cot_split.columns:
    cot_split = cot_split.withColumn(col_name, F.trim(cot_split[col_name]))

cot_split.select(cols_cleaned[:4]).show()

[Stage 2:>                                                          (0 + 1) / 1]

+-------------------------+-------------------------+-------------------------+-------------------------+
|Market_and_Exchange_Names|As_of_Date_In_Form_YYMMDD|Report_Date_as_YYYY-MM-DD|CFTC_Contract_Market_Code|
+-------------------------+-------------------------+-------------------------+-------------------------+
|     "CANADIAN DOLLAR ...|                   211228|               2021-12-28|                   090741|
|     "CANADIAN DOLLAR ...|                   211221|               2021-12-21|                   090741|
|     "CANADIAN DOLLAR ...|                   211214|               2021-12-14|                   090741|
|     "CANADIAN DOLLAR ...|                   211207|               2021-12-07|                   090741|
|     "CANADIAN DOLLAR ...|                   211130|               2021-11-30|                   090741|
|     "CANADIAN DOLLAR ...|                   211123|               2021-11-23|                   090741|
|     "CANADIAN DOLLAR ...|                   

                                                                                

In [46]:
cot_panda = cot_split.toPandas()

                                                                                

In [47]:
cot_panda = cot_panda[cot_panda['Open_Interest_All'].str.isnumeric()]
cot_panda['Open_Interest_All'] = cot_panda['Open_Interest_All'].astype(int)
cot_panda

Unnamed: 0,Market_and_Exchange_Names,As_of_Date_In_Form_YYMMDD,Report_Date_as_YYYY-MM-DD,CFTC_Contract_Market_Code,CFTC_Market_Code,CFTC_Region_Code,CFTC_Commodity_Code,Open_Interest_All,Dealer_Positions_Long_All,Dealer_Positions_Short_All,...,Conc_Net_LE_4_TDR_Long_All,Conc_Net_LE_4_TDR_Short_All,Conc_Net_LE_8_TDR_Long_All,Conc_Net_LE_8_TDR_Short_All,Contract_Units,CFTC_Contract_Market_Code_Quotes,CFTC_Market_Code_Quotes,CFTC_Commodity_Code_Quotes,CFTC_SubGroup_Code,FutOnly_or_Combined
0,"""CANADIAN DOLLAR - CHICAGO MERCANTILE EXCHANGE""",211228,2021-12-28,090741,CME,00,090,138692,33018,15462,...,32.8,19.5,43.5,31.3,"""(CONTRACTS OF CAD 100","000)""","""090741""","""CME ""","""090 ""","""F10"""
1,"""CANADIAN DOLLAR - CHICAGO MERCANTILE EXCHANGE""",211221,2021-12-21,090741,CME,00,090,145562,34908,14956,...,31.2,19.7,42.1,31.3,"""(CONTRACTS OF CAD 100","000)""","""090741""","""CME ""","""090 ""","""F10"""
2,"""CANADIAN DOLLAR - CHICAGO MERCANTILE EXCHANGE""",211214,2021-12-14,090741,CME,00,090,186638,6499,22589,...,15.3,18.5,25.1,26.7,"""(CONTRACTS OF CAD 100","000)""","""090741""","""CME ""","""090 ""","""F10"""
3,"""CANADIAN DOLLAR - CHICAGO MERCANTILE EXCHANGE""",211207,2021-12-07,090741,CME,00,090,155078,14851,23021,...,21.4,22.8,32.3,33.1,"""(CONTRACTS OF CAD 100","000)""","""090741""","""CME ""","""090 ""","""F10"""
4,"""CANADIAN DOLLAR - CHICAGO MERCANTILE EXCHANGE""",211130,2021-11-30,090741,CME,00,090,148375,16633,23782,...,24.9,27.9,38.2,38.5,"""(CONTRACTS OF CAD 100","000)""","""090741""","""CME ""","""090 ""","""F10"""
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24134,"""BLOOMBERG COMMODITY INDEX - CHICAGO BOARD OF ...",220201,2022-02-01,221602,CBT,00,221,48941,1770,14995,...,55.9,84.4,76.2,94.1,"""($100 X INDEX)""","""221602""","""CBT ""","""221 ""","""F90""","""FutOnly"""
24135,"""BLOOMBERG COMMODITY INDEX - CHICAGO BOARD OF ...",220125,2022-01-25,221602,CBT,00,221,49419,2865,14980,...,55.4,77.7,75.5,88.2,"""($100 X INDEX)""","""221602""","""CBT ""","""221 ""","""F90""","""FutOnly"""
24136,"""BLOOMBERG COMMODITY INDEX - CHICAGO BOARD OF ...",220118,2022-01-18,221602,CBT,00,221,48271,3615,14995,...,57.8,85.9,76.6,93.2,"""($100 X INDEX)""","""221602""","""CBT ""","""221 ""","""F90""","""FutOnly"""
24137,"""BLOOMBERG COMMODITY INDEX - CHICAGO BOARD OF ...",220111,2022-01-11,221602,CBT,00,221,53734,3615,15015,...,62.4,84.9,79.3,93.9,"""($100 X INDEX)""","""221602""","""CBT ""","""221 ""","""F90""","""FutOnly"""


In [48]:
cot_panda.to_csv('cot_panda.csv', index=False)

In [118]:
schema = types.StructType([
    types.StructField('Market_and_Exchange_Names', types.StringType(), True),
    types.StructField('As_of_Date_In_Form_YYMMDD', types.StringType(), True), 
    types.StructField('Report_Date_as_YYYY-MM-DD', types.DateType(), True), 
    types.StructField('CFTC_Contract_Market_Code', types.StringType(), True),
    types.StructField('CFTC_Market_Code', types.StringType(), True),
    types.StructField('CFTC_Region_Code', types.StringType(), True), 
    types.StructField('CFTC_Commodity_Code', types.StringType(), True),
    types.StructField('Open_Interest_All', types.IntegerType(), True),
    types.StructField('Dealer_Positions_Long_All', types.IntegerType(), True),
    types.StructField('Dealer_Positions_Short_All', types.IntegerType(), True),
    types.StructField('Dealer_Positions_Spread_All', types.IntegerType(), True),
    types.StructField('Asset_Mgr_Positions_Long_All', types.IntegerType(), True),
    types.StructField('Asset_Mgr_Positions_Short_All', types.IntegerType(), True),
    types.StructField('Asset_Mgr_Positions_Spread_All', types.IntegerType(), True),
    types.StructField('Lev_Money_Positions_Long_All', types.IntegerType(), True),
    types.StructField('Lev_Money_Positions_Short_All', types.IntegerType(), True),
    types.StructField('Lev_Money_Positions_Spread_All', types.IntegerType(), True),
    types.StructField('Other_Rept_Positions_Long_All', types.IntegerType(), True),
    types.StructField('Other_Rept_Positions_Short_All', types.IntegerType(), True),
    types.StructField('Other_Rept_Positions_Spread_All', types.IntegerType(), True),
    types.StructField('Tot_Rept_Positions_Long_All', types.IntegerType(), True),
    types.StructField('Tot_Rept_Positions_Short_All', types.IntegerType(), True),
    types.StructField('NonRept_Positions_Long_All', types.IntegerType(), True),
    types.StructField('NonRept_Positions_Short_All', types.IntegerType(), True),
    types.StructField('Change_in_Open_Interest_All', types.IntegerType(), True),
    types.StructField('Change_in_Dealer_Long_All', types.IntegerType(), True),
    types.StructField('Change_in_Dealer_Short_All', types.IntegerType(), True),
    types.StructField('Change_in_Dealer_Spread_All', types.IntegerType(), True),
    types.StructField('Change_in_Asset_Mgr_Long_All', types.IntegerType(), True),
    types.StructField('Change_in_Asset_Mgr_Short_All', types.IntegerType(), True),
    types.StructField('Change_in_Asset_Mgr_Spread_All', types.IntegerType(), True),
    types.StructField('Change_in_Lev_Money_Long_All', types.IntegerType(), True),
    types.StructField('Change_in_Lev_Money_Short_All', types.IntegerType(), True),
    types.StructField('Change_in_Lev_Money_Spread_All', types.IntegerType(), True),
    types.StructField('Change_in_Other_Rept_Long_All', types.IntegerType(), True),
    types.StructField('Change_in_Other_Rept_Short_All', types.IntegerType(), True),
    types.StructField('Change_in_Other_Rept_Spread_All', types.IntegerType(), True),
    types.StructField('Change_in_Tot_Rept_Long_All', types.IntegerType(), True),
    types.StructField('Change_in_Tot_Rept_Short_All', types.IntegerType(), True),
    types.StructField('Change_in_NonRept_Long_All', types.IntegerType(), True),
    types.StructField('Change_in_NonRept_Short_All', types.IntegerType(), True),
    types.StructField('Pct_of_Open_Interest_All', types.IntegerType(), True),
    types.StructField('Pct_of_OI_Dealer_Long_All', types.FloatType(), True),
    types.StructField('Pct_of_OI_Dealer_Short_All', types.FloatType(), True),
    types.StructField('Pct_of_OI_Dealer_Spread_All', types.FloatType(), True),
    types.StructField('Pct_of_OI_Asset_Mgr_Long_All', types.FloatType(), True),
    types.StructField('Pct_of_OI_Asset_Mgr_Short_All', types.FloatType(), True),
    types.StructField('Pct_of_OI_Asset_Mgr_Spread_All', types.FloatType(), True),
    types.StructField('Pct_of_OI_Lev_Money_Long_All', types.FloatType(), True),
    types.StructField('Pct_of_OI_Lev_Money_Short_All', types.FloatType(), True),
    types.StructField('Pct_of_OI_Lev_Money_Spread_All', types.FloatType(), True),
    types.StructField('Pct_of_OI_Other_Rept_Long_All', types.FloatType(), True),
    types.StructField('Pct_of_OI_Other_Rept_Short_All', types.FloatType(), True),
    types.StructField('Pct_of_OI_Other_Rept_Spread_All', types.FloatType(), True),
    types.StructField('Pct_of_OI_Tot_Rept_Long_All', types.FloatType(), True),
    types.StructField('Pct_of_OI_Tot_Rept_Short_All', types.FloatType(), True),
    types.StructField('Pct_of_OI_NonRept_Long_All', types.FloatType(), True),
    types.StructField('Pct_of_OI_NonRept_Short_All', types.FloatType(), True),
    types.StructField('Traders_Tot_All', types.IntegerType(), True),
    types.StructField('Traders_Dealer_Long_All', types.IntegerType(), True),
    types.StructField('Traders_Dealer_Short_All', types.IntegerType(), True),
    types.StructField('Traders_Dealer_Spread_All', types.IntegerType(), True),
    types.StructField('Traders_Asset_Mgr_Long_All', types.IntegerType(), True),
    types.StructField('Traders_Asset_Mgr_Short_All', types.IntegerType(), True),
    types.StructField('Traders_Asset_Mgr_Spread_All', types.IntegerType(), True),
    types.StructField('Traders_Lev_Money_Long_All', types.IntegerType(), True),
    types.StructField('Traders_Lev_Money_Short_All', types.IntegerType(), True),
    types.StructField('Traders_Lev_Money_Spread_All', types.IntegerType(), True),
    types.StructField('Traders_Other_Rept_Long_All', types.IntegerType(), True),
    types.StructField('Traders_Other_Rept_Short_All', types.IntegerType(), True),
    types.StructField('Traders_Other_Rept_Spread_All', types.IntegerType(), True),
    types.StructField('Traders_Tot_Rept_Long_All', types.StringType(), True),
    types.StructField('Traders_Tot_Rept_Short_All', types.StringType(), True),
    types.StructField('Conc_Gross_LE_4_TDR_Long_All', types.FloatType(), True),
    types.StructField('Conc_Gross_LE_4_TDR_Short_All', types.FloatType(), True),
    types.StructField('Conc_Gross_LE_8_TDR_Long_All', types.FloatType(), True),
    types.StructField('Conc_Gross_LE_8_TDR_Short_All', types.FloatType(), True),
    types.StructField('Conc_Net_LE_4_TDR_Long_All', types.FloatType(), True),
    types.StructField('Conc_Net_LE_4_TDR_Short_All', types.FloatType(), True),
    types.StructField('Conc_Net_LE_8_TDR_Long_All', types.FloatType(), True),
    types.StructField('Conc_Net_LE_8_TDR_Short_All', types.FloatType(), True),
    types.StructField('Contract_Units', types.StringType(), True),
    types.StructField('CFTC_Contract_Market_Code_Quotes', types.StringType(), True),
    types.StructField('CFTC_Market_Code_Quotes', types.StringType(), True),
    types.StructField('CFTC_Commodity_Code_Quotes', types.StringType(), True),
    types.StructField('CFTC_SubGroup_Code', types.StringType(), True),
    types.StructField('FutOnly_or_Combined', types.StringType(), True)    
])

In [119]:
cot_panda_sp = spark.read \
        .option('header', 'true') \
        .schema(schema) \
        .csv('cot_pd.csv')

In [122]:
cot_panda_sp.select(cols_cleaned[81:]).show()

+--------------------+--------------------------------+-----------------------+--------------------------+------------------+-------------------+
|      Contract_Units|CFTC_Contract_Market_Code_Quotes|CFTC_Market_Code_Quotes|CFTC_Commodity_Code_Quotes|CFTC_SubGroup_Code|FutOnly_or_Combined|
+--------------------+--------------------------------+-----------------------+--------------------------+------------------+-------------------+
|"""(CONTRACTS OF ...|                          000)""|           """090741"""|                """CME """|        """090 """|          """F10"""|
|"""(CONTRACTS OF ...|                          000)""|           """090741"""|                """CME """|        """090 """|          """F10"""|
|"""(CONTRACTS OF ...|                          000)""|           """090741"""|                """CME """|        """090 """|          """F10"""|
|"""(CONTRACTS OF ...|                          000)""|           """090741"""|                """CME """|        """090 """

Remove the extra quotation marks and brackets on `Market_and_Exchange_Names`, `Contract_Units`,  `CFTC_Contract_Market_Code_Quotes`, `CFTC_Commodity_Code_Quotes`, `CFTC_SubGroup_Code`,  `FutOnly_or_Combined` column values

In [125]:
error_cols = ['Market_and_Exchange_Names', 'Contract_Units', 'CFTC_Contract_Market_Code_Quotes', 'CFTC_Commodity_Code_Quotes', 'CFTC_SubGroup_Code', 'FutOnly_or_Combined']

for column in error_cols:
    cot_panda_sp = cot_panda_sp \
        .withColumn(column, F.regexp_replace(cot_panda_sp[column], '"', "")) \

cot_panda_sp.select(error_cols[:3]).show()

+-------------------------+--------------------+--------------------------------+
|Market_and_Exchange_Names|      Contract_Units|CFTC_Contract_Market_Code_Quotes|
+-------------------------+--------------------+--------------------------------+
|     CANADIAN DOLLAR -...|(CONTRACTS OF CAD...|                            000)|
|     CANADIAN DOLLAR -...|(CONTRACTS OF CAD...|                            000)|
|     CANADIAN DOLLAR -...|(CONTRACTS OF CAD...|                            000)|
|     CANADIAN DOLLAR -...|(CONTRACTS OF CAD...|                            000)|
|     CANADIAN DOLLAR -...|(CONTRACTS OF CAD...|                            000)|
|     CANADIAN DOLLAR -...|(CONTRACTS OF CAD...|                            000)|
|     CANADIAN DOLLAR -...|(CONTRACTS OF CAD...|                            000)|
|     CANADIAN DOLLAR -...|(CONTRACTS OF CAD...|                            000)|
|     CANADIAN DOLLAR -...|(CONTRACTS OF CAD...|                            000)|
|     CANADIAN D

In [126]:
cot_panda_sp.printSchema()

root
 |-- Market_and_Exchange_Names: string (nullable = true)
 |-- As_of_Date_In_Form_YYMMDD: string (nullable = true)
 |-- Report_Date_as_YYYY-MM-DD: date (nullable = true)
 |-- CFTC_Contract_Market_Code: string (nullable = true)
 |-- CFTC_Market_Code: string (nullable = true)
 |-- CFTC_Region_Code: string (nullable = true)
 |-- CFTC_Commodity_Code: string (nullable = true)
 |-- Open_Interest_All: integer (nullable = true)
 |-- Dealer_Positions_Long_All: integer (nullable = true)
 |-- Dealer_Positions_Short_All: integer (nullable = true)
 |-- Dealer_Positions_Spread_All: integer (nullable = true)
 |-- Asset_Mgr_Positions_Long_All: integer (nullable = true)
 |-- Asset_Mgr_Positions_Short_All: integer (nullable = true)
 |-- Asset_Mgr_Positions_Spread_All: integer (nullable = true)
 |-- Lev_Money_Positions_Long_All: integer (nullable = true)
 |-- Lev_Money_Positions_Short_All: integer (nullable = true)
 |-- Lev_Money_Positions_Spread_All: integer (nullable = true)
 |-- Other_Rept_Positio

Writing to file the cleaned version with correct data types

In [127]:
cot_panda_sp.write.parquet(f'gs://{gcp_bucket_name}/cleaned/pq', mode='overwrite')

                                                                                

In [128]:
traders = spark.read.parquet(f'gs://{gcp_bucket_name}/cleaned/pq/*')

                                                                                

In [129]:
traders.printSchema()

root
 |-- Market_and_Exchange_Names: string (nullable = true)
 |-- As_of_Date_In_Form_YYMMDD: string (nullable = true)
 |-- Report_Date_as_YYYY-MM-DD: date (nullable = true)
 |-- CFTC_Contract_Market_Code: string (nullable = true)
 |-- CFTC_Market_Code: string (nullable = true)
 |-- CFTC_Region_Code: string (nullable = true)
 |-- CFTC_Commodity_Code: string (nullable = true)
 |-- Open_Interest_All: integer (nullable = true)
 |-- Dealer_Positions_Long_All: integer (nullable = true)
 |-- Dealer_Positions_Short_All: integer (nullable = true)
 |-- Dealer_Positions_Spread_All: integer (nullable = true)
 |-- Asset_Mgr_Positions_Long_All: integer (nullable = true)
 |-- Asset_Mgr_Positions_Short_All: integer (nullable = true)
 |-- Asset_Mgr_Positions_Spread_All: integer (nullable = true)
 |-- Lev_Money_Positions_Long_All: integer (nullable = true)
 |-- Lev_Money_Positions_Short_All: integer (nullable = true)
 |-- Lev_Money_Positions_Spread_All: integer (nullable = true)
 |-- Other_Rept_Positio

In [132]:
traders.select(cols_cleaned[:4]).show()

[Stage 63:>                                                         (0 + 1) / 1]

+-------------------------+-------------------------+-------------------------+-------------------------+
|Market_and_Exchange_Names|As_of_Date_In_Form_YYMMDD|Report_Date_as_YYYY-MM-DD|CFTC_Contract_Market_Code|
+-------------------------+-------------------------+-------------------------+-------------------------+
|     DOW JONES INDUSTR...|                   201208|               2020-12-08|                   124603|
|     DOW JONES INDUSTR...|                   201201|               2020-12-01|                   124603|
|     DOW JONES INDUSTR...|                   201124|               2020-11-24|                   124603|
|     DOW JONES INDUSTR...|                   201117|               2020-11-17|                   124603|
|     DOW JONES INDUSTR...|                   201110|               2020-11-10|                   124603|
|     DOW JONES INDUSTR...|                   201103|               2020-11-03|                   124603|
|     DOW JONES INDUSTR...|                   

                                                                                