In [None]:
## 0.Question  

- Use this dataset: ` https://github.com/erkansirin78/datasets/raw/master/dirty_store_transactions.csv ` 
do the following tasks:

### 1. 
Clean data

### 2. 
Write clean data to hive  `test1.clean_transactions`, format should be orc.

### 3. 
Write clean data to Postgresql `traindb.public.clean_transactions`. 

### 4. 
Write clean data to hdfs `/user/train/spark_odev_transaction` in parquet format.


In [None]:
### 1. Start Jupter Lab

```
[train@trainvm ~]$ source ~/venvspark/bin/activate
```

```
[train@trainvm ~]$ jupyter lab
```

### 2. Start Hadoop services

```
[train@trainvm ~]$ start-all.sh
```


In [96]:
import findspark
import configparser
from pyspark.sql import SparkSession,functions as F
from pyspark.sql.functions import *
from pyspark.sql.functions import regexp_replace
from pyspark.sql.types import *

In [97]:
findspark.init("/opt/manual/spark")

In [98]:
spark = (SparkSession.builder 
         .appName("homework3") 
         .master("yarn") 
         .enableHiveSupport() 
         .getOrCreate()
)

In [99]:
'''
[train@trainvm ~]$ ls

[train@trainvm ~]$ cd datasets

[train@trainvm datasets]$ ls

'''

'\n[train@trainvm ~]$ ls\n\n[train@trainvm ~]$ cd datasets\n\n[train@trainvm datasets]$ ls\n\n\n'

In [100]:
! wget -P /home/train/datasets/ https://github.com/erkansirin78/datasets/raw/master/dirty_store_transactions.csv 

--2023-04-03 16:47:04--  https://github.com/erkansirin78/datasets/raw/master/dirty_store_transactions.csv
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/erkansirin78/datasets/master/dirty_store_transactions.csv [following]
--2023-04-03 16:47:05--  https://raw.githubusercontent.com/erkansirin78/datasets/master/dirty_store_transactions.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2609524 (2.5M) [text/plain]
Saving to: ‘/home/train/datasets/dirty_store_transactions.csv’


2023-04-03 16:47:06 (2.54 MB/s) - ‘/home/train/datasets/dirty_store_transactions.csv’ saved [2609524/2609524]



In [101]:
df = spark.read \
     .format("csv") \
     .option("header", True) \
     .option("sep", ",") \
     .option("inferSchema", True) \
     .load("file:///home/train/datasets/dirty_store_transactions.csv")

In [102]:
df.show(5)

+--------+--------------+----------------+----------+---+------+--------+------+----------+
|STORE_ID|STORE_LOCATION|PRODUCT_CATEGORY|PRODUCT_ID|MRP|    CP|DISCOUNT|    SP|      Date|
+--------+--------------+----------------+----------+---+------+--------+------+----------+
|  YR7220|     New York(|     Electronics|  12254943|$31|$20.77|   $1.86|$29.14|2019-11-26|
|  YR7220|     New York+|       Furniture| 72619323C|$15| $9.75|    $1.5| $13.5|2019-11-26|
|  YR7220|     New York |     Electronics| 34161682B|$88|$62.48|    $4.4| $83.6|2019-11-26|
|  YR7220|     New York!|         Kitchen|  79411621|$91|$58.24|   $3.64|$87.36|2019-11-26|
|  YR7220|      New York|         Fashion| 39520263T|$85|   $51|   $2.55|$82.45|2019-11-26|
+--------+--------------+----------------+----------+---+------+--------+------+----------+
only showing top 5 rows



In [103]:
df.limit(5).toPandas()

Unnamed: 0,STORE_ID,STORE_LOCATION,PRODUCT_CATEGORY,PRODUCT_ID,MRP,CP,DISCOUNT,SP,Date
0,YR7220,New York(,Electronics,12254943,$31,$20.77,$1.86,$29.14,2019-11-26
1,YR7220,New York+,Furniture,72619323C,$15,$9.75,$1.5,$13.5,2019-11-26
2,YR7220,New York,Electronics,34161682B,$88,$62.48,$4.4,$83.6,2019-11-26
3,YR7220,New York!,Kitchen,79411621,$91,$58.24,$3.64,$87.36,2019-11-26
4,YR7220,New York,Fashion,39520263T,$85,$51,$2.55,$82.45,2019-11-26


In [104]:
df.select('STORE_LOCATION').distinct().collect()

                                                                                

[Row(STORE_LOCATION="Miami'"),
 Row(STORE_LOCATION='New York""'),
 Row(STORE_LOCATION='Washington""'),
 Row(STORE_LOCATION='Miami)'),
 Row(STORE_LOCATION='Houston%'),
 Row(STORE_LOCATION='Miami&'),
 Row(STORE_LOCATION='Houston$'),
 Row(STORE_LOCATION='New York$'),
 Row(STORE_LOCATION="New York'"),
 Row(STORE_LOCATION='New York+'),
 Row(STORE_LOCATION='New York('),
 Row(STORE_LOCATION='New York!'),
 Row(STORE_LOCATION='Miami#'),
 Row(STORE_LOCATION='Houston*'),
 Row(STORE_LOCATION='New York '),
 Row(STORE_LOCATION='New York&'),
 Row(STORE_LOCATION='Miami$'),
 Row(STORE_LOCATION='Denver$'),
 Row(STORE_LOCATION="Houston'"),
 Row(STORE_LOCATION='Denver+'),
 Row(STORE_LOCATION='Houston('),
 Row(STORE_LOCATION='Washington'),
 Row(STORE_LOCATION='New York)'),
 Row(STORE_LOCATION='Houston)'),
 Row(STORE_LOCATION="Denver'"),
 Row(STORE_LOCATION='Miami%'),
 Row(STORE_LOCATION='Miami '),
 Row(STORE_LOCATION='Washington$'),
 Row(STORE_LOCATION='Denver*'),
 Row(STORE_LOCATION='Miami+'),
 Row(STORE_

In [105]:
df2 = df.withColumn("MRP",F.regexp_replace(F.col("MRP"),"\$","").cast(IntegerType())) \
        .withColumn("CP",F.regexp_replace(F.col("CP"),"\$", "").cast(FloatType())) \
        .withColumn("DISCOUNT", F.regexp_replace(F.col("DISCOUNT"), "\$", "").cast(FloatType())) \
        .withColumn("SP",F.regexp_replace(F.col("SP"),"\$", "").cast(FloatType())) \
        .withColumn("Date",F.to_date(F.col("Date"), "yyyy-MM-dd")) \
        .withColumn("STORE_LOCATION",F.regexp_replace(F.col("STORE_LOCATION"),"[^A-Z a-z]","")) \
        .withColumn("PRODUCT_ID", F.regexp_replace(F.col("PRODUCT_ID"),"[^0-9]","").cast(IntegerType()))

In [106]:
df2.limit(5).toPandas()

Unnamed: 0,STORE_ID,STORE_LOCATION,PRODUCT_CATEGORY,PRODUCT_ID,MRP,CP,DISCOUNT,SP,Date
0,YR7220,New York,Electronics,12254943,31,20.77,1.86,29.139999,2019-11-26
1,YR7220,New York,Furniture,72619323,15,9.75,1.5,13.5,2019-11-26
2,YR7220,New York,Electronics,34161682,88,62.48,4.4,83.599998,2019-11-26
3,YR7220,New York,Kitchen,79411621,91,58.240002,3.64,87.360001,2019-11-26
4,YR7220,New York,Fashion,39520263,85,51.0,2.55,82.449997,2019-11-26


In [107]:
df2.printSchema()

root
 |-- STORE_ID: string (nullable = true)
 |-- STORE_LOCATION: string (nullable = true)
 |-- PRODUCT_CATEGORY: string (nullable = true)
 |-- PRODUCT_ID: integer (nullable = true)
 |-- MRP: integer (nullable = true)
 |-- CP: float (nullable = true)
 |-- DISCOUNT: float (nullable = true)
 |-- SP: float (nullable = true)
 |-- Date: date (nullable = true)



In [None]:
## Q2

In [109]:
spark.sql("show databases;").show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [116]:
spark.sql("create database if not exists test1;")

DataFrame[]

In [118]:
spark.sql("show databases;").show()

+---------+
|namespace|
+---------+
|  default|
|    test1|
+---------+



In [117]:
spark.sql("use test1;")

DataFrame[]

In [130]:
df2.write.format("orc") \
.mode("overwrite") \
.saveAsTable("test1.clean_transactions")

                                                                                

In [None]:
/user/hive/warehouse/test1.db/clean_transactions

In [131]:
spark.sql("select * from test1.clean_transactions").limit(5).toPandas()

Unnamed: 0,STORE_ID,STORE_LOCATION,PRODUCT_CATEGORY,PRODUCT_ID,MRP,CP,DISCOUNT,SP,Date
0,YR7220,New York,Electronics,12254943,31,20.77,1.86,29.139999,2019-11-26
1,YR7220,New York,Furniture,72619323,15,9.75,1.5,13.5,2019-11-26
2,YR7220,New York,Electronics,34161682,88,62.48,4.4,83.599998,2019-11-26
3,YR7220,New York,Kitchen,79411621,91,58.240002,3.64,87.360001,2019-11-26
4,YR7220,New York,Fashion,39520263,85,51.0,2.55,82.449997,2019-11-26


In [133]:
! hdfs dfs -ls /user/hive/warehouse/test1.db/

Found 5 items
drwxr-xr-x   - train hive          0 2023-01-31 14:23 /user/hive/warehouse/test1.db/adv_lk
drwxr-xr-x   - train hive          0 2023-01-31 14:16 /user/hive/warehouse/test1.db/advertising
drwxr-xr-x   - train hive          0 2023-01-31 14:50 /user/hive/warehouse/test1.db/advertising_sales_gt_20
drwxr-xr-x   - train hive          0 2023-04-03 17:18 /user/hive/warehouse/test1.db/clean_transactions
drwxr-xr-x   - train hive          0 2023-01-31 14:11 /user/hive/warehouse/test1.db/mytable


In [None]:
# Q3

In [137]:
config = configparser.RawConfigParser()

config.read("./db_conn")
user_name = config.get('DB', 'user_name')
password = config.get('DB', 'password')
db_ip = config.get('DB', 'db_ip')

In [139]:
df2.write.format("jdbc") \
   .mode("overwrite") \
   .option("driver", "org.postgresql.Driver") \
   .option("url", f"jdbc:postgresql://{db_ip}:5432/traindb") \
   .option("user", user_name) \
   .option("password", password) \
   .option("dbtable", "clean_transactions") \
   .save()

                                                                                

In [143]:
spark.read.format("jdbc") \
     .option("driver", "org.postgresql.Driver") \
     .option("url", f"jdbc:postgresql://{db_ip}:5432/traindb") \
     .option("user", user_name) \
     .option("password", password) \
     .option("dbtable", "clean_transactions") \
     .load().limit(10).toPandas()

                                                                                

Unnamed: 0,STORE_ID,STORE_LOCATION,PRODUCT_CATEGORY,PRODUCT_ID,MRP,CP,DISCOUNT,SP,Date
0,YR7220,New York,Electronics,12254943,31,20.77,1.86,29.139999,2019-11-26
1,YR7220,New York,Furniture,72619323,15,9.75,1.5,13.5,2019-11-26
2,YR7220,New York,Electronics,34161682,88,62.48,4.4,83.599998,2019-11-26
3,YR7220,New York,Kitchen,79411621,91,58.240002,3.64,87.360001,2019-11-26
4,YR7220,New York,Fashion,39520263,85,51.0,2.55,82.449997,2019-11-26
5,YR7220,New York,Kitchen,93809204,37,24.049999,0.74,36.259998,2019-11-26
6,YR7220,New York,Cosmetics,86610412,80,48.799999,6.4,73.599998,2019-11-26
7,YR7220,New York,Kitchen,52503356,71,42.599998,5.68,65.32,2019-11-26
8,YR7220,New York,Kitchen,77516479,92,56.119999,3.68,88.32,2019-11-26
9,YR7220,New York,Cosmetics,47334289,16,10.72,0.96,15.04,2019-11-26


In [None]:
# Q4

In [145]:
df2.write.format("parquet") \
   .mode("overwrite") \
   .option("compression", "snappy") \
   .save("hdfs://localhost:9000/user/train/spark_odev_transaction")

                                                                                

In [146]:
! hdfs dfs -ls /user/train/spark_odev_transaction

Found 2 items
-rw-r--r--   1 train supergroup          0 2023-04-03 18:08 /user/train/spark_odev_transaction/_SUCCESS
-rw-r--r--   1 train supergroup     229002 2023-04-03 18:08 /user/train/spark_odev_transaction/part-00000-d63ce226-90d2-4e2a-9a05-7e914fa0259b-c000.snappy.parquet
