In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from multiprocessing import cpu_count

In [2]:
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [3]:
# Host of the Spark master; the same address is used for HDFS further down.
# Remember to set the real IP!
IP = "192.168.1.1"
# CPU count of the machine running this notebook, used as the executor core count.
cores = cpu_count()

In [4]:
# Build (or reuse) the Spark session for this notebook, connected to the
# master at spark://IP:7077.
spark_session = (
    SparkSession.builder
    .master(f"spark://{IP}:7077")
    .appName("ECAtemp")
    # Dynamic allocation with shuffle tracking and without the external
    # shuffle service; idle executors are released after 30 seconds.
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    # Cores per executor, taken from the CPU count computed above.
    .config("spark.executor.cores", cores)
    # Fixed ports for the driver and the block manager.
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# Handle to the underlying SparkContext, needed for the RDD API.
sc = spark_session.sparkContext

# Only report errors from here on.
sc.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/31 21:23:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


All the data should have been extracted to HDFS. Load the data:

In [5]:
# HDFS directory that contains one sub-directory per dataset id.
data_root = f"hdfs://{IP}:9000/data/"

# Map each dataset id to an RDD of the raw text lines under its directory.
rdds = {
    data_id: sc.textFile(data_root + data_id)
    for data_id in ("tx", "tn")
}

Preprocess the data:

In [6]:
def Preprocess(rdd, header_lines=20):
    """Turn an RDD of raw text lines into a DataFrame of integer columns.

    The first ``header_lines`` lines of ``rdd`` are skipped, the next line is
    read as a comma-separated list of column names, and every later line is
    split on commas into one column per name, each cast with ``cast("int")``.

    Header and data rows are selected by their ``zipWithIndex`` position
    rather than by ``monotonically_increasing_id``: those IDs are only unique
    and increasing, so "id > 0" is not a reliable way to drop the first row
    (e.g. when the first partition ends up empty).

    NOTE(review): ``zipWithIndex`` numbers lines across the whole RDD, so if
    the RDD was read from a directory holding several files, only the very
    first preamble is skipped -- confirm each dataset directory is one file.

    Args:
        rdd: RDD of text lines, such as the result of ``sc.textFile``.
        header_lines: Number of leading lines to skip before the column-name
            row. Defaults to 20, the value previously hard-coded here.

    Returns:
        A DataFrame with one ``int`` column per entry of the column-name row
        and one row per line that follows it.

    Raises:
        ValueError: If ``rdd`` has no line at index ``header_lines``.
    """
    # Number every line once; the positions drive both selections below.
    indexed_lines = rdd.zipWithIndex()

    # The column names sit on the line right after the skipped preamble.
    header = (
        indexed_lines
        .filter(lambda pair: pair[1] == header_lines)
        .map(lambda pair: pair[0])
        .take(1)
    )
    if not header:
        raise ValueError(
            f"expected a column-name row at line index {header_lines}, "
            "but the input has fewer lines"
        )
    column_names = [name.strip() for name in header[0].split(",")]

    # Everything after the column-name row is data. An explicit schema keeps
    # the single raw-text column named "_1" and avoids schema inference, which
    # fails when there are no data rows.
    line_schema = StructType([StructField("_1", StringType(), True)])
    lines = (
        indexed_lines
        .filter(lambda pair: pair[1] > header_lines)
        .map(lambda pair: (pair[0],))
        .toDF(line_schema)
    )

    # Split each line once and project all columns in a single select instead
    # of stacking two withColumn calls per column.
    fields = split(lines["_1"], ",")
    return lines.select(
        [
            fields.getItem(i).cast("int").alias(name)
            for i, name in enumerate(column_names)
        ]
    )

In [7]:
# Replace each raw RDD with its preprocessed DataFrame and preview it.
# This overwrites the entries of `rdds` in place, so re-running this cell
# without re-running the load cell would pass DataFrames to Preprocess.
for data_id in ("tx", "tn"):
    preprocessed = Preprocess(rdds[data_id])
    rdds[data_id] = preprocessed
    preprocessed.show()

                                                                                

+-----+-----+--------+----+----+
|STAID|SOUID|    DATE|  TX|Q_TX|
+-----+-----+--------+----+----+
|    2|36438|19000101|  18|   0|
|    2|36438|19000102| -10|   0|
|    2|36438|19000103| -20|   0|
|    2|36438|19000104| -30|   0|
|    2|36438|19000105| -60|   0|
|    2|36438|19000106|-150|   0|
|    2|36438|19000107| -90|   0|
|    2|36438|19000108| -40|   0|
|    2|36438|19000109| -40|   0|
|    2|36438|19000110| -10|   0|
|    2|36438|19000111| -24|   0|
|    2|36438|19000112|-120|   0|
|    2|36438|19000113|-100|   0|
|    2|36438|19000114| -60|   0|
|    2|36438|19000115| -70|   0|
|    2|36438|19000116| -10|   0|
|    2|36438|19000117| -20|   0|
|    2|36438|19000118| -20|   0|
|    2|36438|19000119| -34|   0|
|    2|36438|19000120| -20|   0|
+-----+-----+--------+----+----+
only showing top 20 rows



                                                                                

+-----+-----+--------+----+----+
|STAID|SOUID|    DATE|  TN|Q_TN|
+-----+-----+--------+----+----+
|    2|36439|19000101| -30|   0|
|    2|36439|19000102| -50|   0|
|    2|36439|19000103| -70|   0|
|    2|36439|19000104|-110|   0|
|    2|36439|19000105|-190|   0|
|    2|36439|19000106|-220|   0|
|    2|36439|19000107|-200|   0|
|    2|36439|19000108|-110|   0|
|    2|36439|19000109| -80|   0|
|    2|36439|19000110| -60|   0|
|    2|36439|19000111|-160|   0|
|    2|36439|19000112|-210|   0|
|    2|36439|19000113|-220|   0|
|    2|36439|19000114|-130|   0|
|    2|36439|19000115|-190|   0|
|    2|36439|19000116|-110|   0|
|    2|36439|19000117| -60|   0|
|    2|36439|19000118| -60|   0|
|    2|36439|19000119|-130|   0|
|    2|36439|19000120|-120|   0|
+-----+-----+--------+----+----+
only showing top 20 rows



In [8]:
# Stop the Spark session now that the notebook is done with it.
spark_session.stop()