# Data manipulation for ARDL analysis

In this notebook, we will use Pyspark and aggregate monthly data to quarterly data so that we can merge with GDP data. 

In [1]:
%%configure -f
{
    "conf": {
        "spark.pyspark.python": "python3",
        "spark.pyspark.virtualenv.enabled": "true",
        "spark.pyspark.virtualenv.type":"native",
        "spark.pyspark.virtualenv.bin.path":"/usr/bin/virtualenv"
    }
}

In [21]:
from pyspark.sql import functions as F

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Load exchange rate and trade data

In [36]:
ex = spark.read.parquet("s3://trade-final-project-bucket/dataset/exchangerate2.parquet").persist()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [39]:
ex = (ex.withColumnRenamed("Code","country")
        .withColumnRenamed("Time","year_month")
        .withColumnRenamed("value","ex")
     )

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [40]:
ex.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- Country_name: string (nullable = true)
 |-- country: long (nullable = true)
 |-- Country_name2: string (nullable = true)
 |-- Area: string (nullable = true)
 |-- year_month: string (nullable = true)
 |-- ex: double (nullable = true)

In [41]:
ex.show(2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+-------+-------------+--------------------+----------+------------------+
|        Country_name|country|Country_name2|                Area|year_month|                ex|
+--------------------+-------+-------------+--------------------+----------+------------------+
|Afghanistan, Isla...|    130|  Afghanistan|                Asia|    198801|0.3092632225995748|
|             Albania|    229|      Albania|Central_and_East_...|    198801|              null|
+--------------------+-------+-------------+--------------------+----------+------------------+
only showing top 2 rows

In [42]:
data = spark.read.parquet("s3://trade-final-project-bucket/dataset/trades.parquet")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [43]:
data.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- year_month: long (nullable = true)
 |-- export_import: long (nullable = true)
 |-- country: long (nullable = true)
 |-- hs9: long (nullable = true)
 |-- q1: long (nullable = true)
 |-- q2: long (nullable = true)
 |-- value: long (nullable = true)
 |-- hs6: string (nullable = true)
 |-- hs2: string (nullable = true)

In [44]:
data = data.join(ex, on=['country', 'year_month'], how='inner')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [45]:
data = data.withColumn('year_month_str', F.col('year_month').cast('string'))
data = (data.withColumn('quarter', F.quarter(F.to_date('year_month_str', 'yyyyMM')))
            .withColumn('year', F.year(F.to_date('year_month_str', 'yyyyMM')))
       )

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [46]:
data.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- country: long (nullable = true)
 |-- year_month: long (nullable = true)
 |-- export_import: long (nullable = true)
 |-- hs9: long (nullable = true)
 |-- q1: long (nullable = true)
 |-- q2: long (nullable = true)
 |-- value: long (nullable = true)
 |-- hs6: string (nullable = true)
 |-- hs2: string (nullable = true)
 |-- Country_name: string (nullable = true)
 |-- Country_name2: string (nullable = true)
 |-- Area: string (nullable = true)
 |-- ex: double (nullable = true)
 |-- year_month_str: string (nullable = true)
 |-- quarter: integer (nullable = true)
 |-- year: integer (nullable = true)

## Aggregate from monthly data to quarterly data. 

In [97]:
data_g = (data.groupby("year", "quarter", "export_import", "country", "Country_name", "hs2", "hs6", "hs9")
          .agg(F.mean("ex").alias("mean_ex"), F.sum("value").alias("sum_value")))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [98]:
data_g.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- year: integer (nullable = true)
 |-- quarter: integer (nullable = true)
 |-- export_import: long (nullable = true)
 |-- country: long (nullable = true)
 |-- Country_name: string (nullable = true)
 |-- hs2: string (nullable = true)
 |-- hs6: string (nullable = true)
 |-- hs9: long (nullable = true)
 |-- mean_ex: double (nullable = true)
 |-- sum_value: long (nullable = true)

In [99]:
data_g.head(2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[Row(year=1989, quarter=1, export_import=2, country=304, Country_name='United States', hs2='61', hs6='610610', hs9=610610012, mean_ex=0.007783161720169594, sum_value=4274), Row(year=1990, quarter=2, export_import=1, country=113, Country_name='Malaysia', hs2='76', hs6='760810', hs9=760810000, mean_ex=0.017544401698794846, sum_value=22435)]

## Load GDP data and merge with previous dataframe 

In [100]:
gdp = spark.read.parquet("s3://trade-final-project-bucket/dataset/gdp_quarter.parquet").persist()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [101]:
gdp = (gdp.withColumn("quarter", F.substring(F.col("Time"), 2, 1).cast("integer"))
          .withColumn("year", F.substring(F.col("Time"), 4, 4).cast("integer"))
          .withColumnRenamed("value", "gdp")
      )

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [102]:
gdp.head(2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[Row(Country_name='Argentina', Code=413, Country_name2='Argentina', Area='Middle_and_South_America', Time='Q1_1988', gdp=None, quarter=1, year=1988), Row(Country_name='Armenia, Rep. of', Code=151, Country_name2='Armenia', Area='Central_and_East_Europe_Russia', Time='Q1_1988', gdp=None, quarter=1, year=1988)]

In [103]:
gdp = gdp.withColumnRenamed("Code","country")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [104]:
data_g = data_g.join(gdp, on=['country', 'quarter', 'year'], how='inner')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [105]:
data_g = (data_g.withColumnRenamed("mean_ex","ex")
            .withColumnRenamed("sum_value","value")
         )

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [106]:
data_g = data_g.withColumn('year_quarter', F.concat(F.col("year"), F.lit("_"), F.col("quarter")))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [107]:
data_g.head(2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[Row(country=143, quarter=4, year=2013, export_import=2, Country_name='Israel', hs2='85', hs6='853290', hs9=853290000, ex=0.03572323232323232, value=2352, Country_name='Israel', Country_name2='Israel', Area='Middle_East', Time='Q4_2013', gdp=274198.0, year_quarter='2013_4'), Row(country=304, quarter=2, year=2014, export_import=1, Country_name='United States', hs2='29', hs6='290629', hs9=290629000, ex=0.009836277745421602, value=1761, Country_name='United States', Country_name2='United_States_of_America', Area='North_America', Time='Q2_2014', gdp=4365675.8, year_quarter='2014_2')]

In [108]:
data_g.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- country: long (nullable = true)
 |-- quarter: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- export_import: long (nullable = true)
 |-- Country_name: string (nullable = true)
 |-- hs2: string (nullable = true)
 |-- hs6: string (nullable = true)
 |-- hs9: long (nullable = true)
 |-- ex: double (nullable = true)
 |-- value: long (nullable = true)
 |-- Country_name: string (nullable = true)
 |-- Country_name2: string (nullable = true)
 |-- Area: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- gdp: double (nullable = true)
 |-- year_quarter: string (nullable = true)

In [124]:
data_g = (data_g.withColumn("value_log", F.log(F.col("value")))
                .withColumn("ex_log", F.log(F.col("ex")))
                .withColumn("gdp_log", F.log(F.col("gdp")))
         )

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Extract column and save

In [125]:
data_sub = (data_g[["country", "year_quarter", "export_import",  "hs2", "hs6", 
                  "hs9", "value_log", "ex_log", "gdp_log"]])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [126]:
data_sub.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- country: long (nullable = true)
 |-- year_quarter: string (nullable = true)
 |-- export_import: long (nullable = true)
 |-- hs2: string (nullable = true)
 |-- hs6: string (nullable = true)
 |-- hs9: long (nullable = true)
 |-- value_log: double (nullable = true)
 |-- ex_log: double (nullable = true)
 |-- gdp_log: double (nullable = true)

In [128]:
data_sub.write.parquet("s3://trade-final-project-bucket/dataset/trades_quarter.parquet", mode="overwrite")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…