In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import functions as F

import os
import pandas as pd
from dotenv import load_dotenv

Configuration

In [7]:
# Pull settings from a local .env file (if present) into the process environment.
load_dotenv()

# Path to the GCP service-account key file, and the bucket that holds the raw data.
credentials_location = os.getenv('GCP_CREDENTIALS_LOCATION')
gcp_bucket_name = os.getenv('GCP_BUCKET')

# Fail fast: os.getenv returns None for an unset variable, which would otherwise
# flow silently into the Spark configuration and the gs:// path used further down.
_missing = [
    name
    for name, value in (
        ('GCP_CREDENTIALS_LOCATION', credentials_location),
        ('GCP_BUCKET', gcp_bucket_name),
    )
    if not value
]
if _missing:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing)
    )

In [3]:
# Location of the GCS connector jar. It can be overridden through the
# GCS_CONNECTOR_JAR environment variable so the notebook is not tied to one
# machine; the previous hard-coded path remains the default.
gcs_connector_jar = os.getenv(
    'GCS_CONNECTOR_JAR',
    "/Users/Manu/lib/gcs-connector-hadoop3-2.2.5.jar",
)

# Local Spark configuration: all local cores, the GCS connector on the classpath,
# and service-account authentication using the key file from the environment.
conf = (
    SparkConf()
    .setMaster('local[*]')
    .setAppName('test')
    .set("spark.jars", gcs_connector_jar)
    .set("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", credentials_location)
)

Context

In [4]:
# getOrCreate keeps this cell re-runnable: constructing SparkContext directly
# raises if a context is already active in the kernel.
sc = SparkContext.getOrCreate(conf=conf)

# Hadoop-level settings for the gs:// scheme.
# NOTE: _jsc is a private PySpark attribute; it is used here to reach the JVM-side
# Hadoop configuration object.
hadoop_conf = sc._jsc.hadoopConfiguration()

# Register the GCS connector classes as the handlers for gs:// paths.
hadoop_conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hadoop_conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")

# Authenticate with the service-account key file.
hadoop_conf.set("fs.gs.auth.service.account.json.keyfile", credentials_location)
hadoop_conf.set("fs.gs.auth.service.account.enable", 'true')

22/04/11 04:33:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


Session

In [5]:
# Build the session on top of the existing context's configuration.
# The earlier version also passed ("spark.jars.packages", "com.crealytics:spark-excel_2.11:0.12.2")
# in the same config() call, but Builder.config ignores key/value whenever `conf`
# is supplied, so that setting was never applied. It is dropped here to keep the
# cell honest; nothing else in this notebook section uses that package.
spark = (
    SparkSession.builder
    .config(conf=sc.getConf())
    .getOrCreate()
)

Reading data from Google Cloud Storage

In [8]:
# Read everything under the bucket's raw/ prefix as plain text;
# each input line becomes one row in a single string column.
cot = spark.read.text(
    f'gs://{gcp_bucket_name}/raw/*'
)

In [9]:
# Peek at the first five raw lines.
cot.show(n=5)

[Stage 0:>                                                          (0 + 1) / 1]

+--------------------+
|               value|
+--------------------+
|"Market_and_Excha...|
|"CANADIAN DOLLAR ...|
|"CANADIAN DOLLAR ...|
|"CANADIAN DOLLAR ...|
|"CANADIAN DOLLAR ...|
+--------------------+
only showing top 5 rows



                                                                                

In [8]:
# Number of raw text lines read (the header line is still included at this point).
cot.count()

                                                                                

24140

In [9]:
# Inspect the first two raw rows. take(2) fetches only those rows, whereas
# collect()[:2] would pull every row to the driver before slicing.
cot.take(2)

                                                                                

[Row(value='"Market_and_Exchange_Names","As_of_Date_In_Form_YYMMDD","Report_Date_as_YYYY-MM-DD","CFTC_Contract_Market_Code","CFTC_Market_Code","CFTC_Region_Code","CFTC_Commodity_Code","Open_Interest_All","Dealer_Positions_Long_All","Dealer_Positions_Short_All","Dealer_Positions_Spread_All","Asset_Mgr_Positions_Long_All","Asset_Mgr_Positions_Short_All","Asset_Mgr_Positions_Spread_All","Lev_Money_Positions_Long_All","Lev_Money_Positions_Short_All","Lev_Money_Positions_Spread_All","Other_Rept_Positions_Long_All","Other_Rept_Positions_Short_All","Other_Rept_Positions_Spread_All","Tot_Rept_Positions_Long_All","Tot_Rept_Positions_Short_All","NonRept_Positions_Long_All","NonRept_Positions_Short_All","Change_in_Open_Interest_All","Change_in_Dealer_Long_All","Change_in_Dealer_Short_All","Change_in_Dealer_Spread_All","Change_in_Asset_Mgr_Long_All","Change_in_Asset_Mgr_Short_All","Change_in_Asset_Mgr_Spread_All","Change_in_Lev_Money_Long_All","Change_in_Lev_Money_Short_All","Change_in_Lev_Money_S

1. All columns have been combined into one column called `value`
2. Need to split the column

In [10]:
# Header line as it appears (quoted) at the start of the raw text.
header_prefix = '"Market_and_Exchange_Names"'

# Split each raw line on commas.
# NOTE(review): this is a plain split, so surrounding double quotes are kept in the
# values and a quoted field containing a comma would be split apart - confirm the
# data has none, or switch to a CSV-aware parser.
split_cols = F.split(F.col('value'), ',')

# Drop header lines by content rather than by monotonically_increasing_id():
# those ids are only guaranteed to be increasing, not consecutive, so an
# `index >= 1` filter removes just the first row of the first partition and
# would leave headers from any other file or partition matched by raw/*.
# NOTE(review): the third column keeps its original name for compatibility,
# although the source header and the values are in YYYY-MM-DD form.
df_split = (
    cot
    .filter(~F.col('value').startswith(header_prefix))
    .withColumn('Market_and_Exchange_Names', split_cols.getItem(0))
    .withColumn('As_of_Date_In_Form_YYMMDD', split_cols.getItem(1))
    .withColumn('Report_Date_as_MM_DD_YYYY', split_cols.getItem(2))
    .drop('value')
)

# show() returns None, so it is called separately to keep df_split bound to the DataFrame.
df_split.show()


[Stage 5:>                                                          (0 + 1) / 1]

+-------------------------+-------------------------+-------------------------+
|Market_and_Exchange_Names|As_of_Date_In_Form_YYMMDD|Report_Date_as_MM_DD_YYYY|
+-------------------------+-------------------------+-------------------------+
|     "CANADIAN DOLLAR ...|                   211228|               2021-12-28|
|     "CANADIAN DOLLAR ...|                   211221|               2021-12-21|
|     "CANADIAN DOLLAR ...|                   211214|               2021-12-14|
|     "CANADIAN DOLLAR ...|                   211207|               2021-12-07|
|     "CANADIAN DOLLAR ...|                   211130|               2021-11-30|
|     "CANADIAN DOLLAR ...|                   211123|               2021-11-23|
|     "CANADIAN DOLLAR ...|                   211116|               2021-11-16|
|     "CANADIAN DOLLAR ...|                   211109|               2021-11-09|
|     "CANADIAN DOLLAR ...|                   211102|               2021-11-02|
|     "CANADIAN DOLLAR ...|             

                                                                                