In [1]:
#!pip3 install pandas
#!pip3 install PyArrow
from pyspark.sql.functions import col

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import TimestampType

In [3]:
import os
import time
import json
import requests
import xml.etree.ElementTree as ET
import datetime
import subprocess

In [4]:
if os.path.exists("/etc/hadoop/conf/hive-site.xml"):
    tree = ET.parse("/etc/hadoop/conf/hive-site.xml")
    root = tree.getroot()
    for prop in root.findall("property"):
        if prop.find("name").text == "hive.metastore.warehouse.dir":
            storage = (
                prop.find("value").text.split("/")[0]
                + "//"
                + prop.find("value").text.split("/")[2]
            )

In [5]:
os.environ["STORAGE"] = storage

In [6]:
print(storage)

s3a://go01-demo


In [30]:
spark = SparkSession\
    .builder\
    .appName("PythonSQL")\
    .config("spark.hadoop.fs.s3a.s3guard.ddb.region","us-east-2")\
    .config("spark.yarn.access.hadoopFileSystems",os.environ["STORAGE"])\
    .config("spark.rpc.message.maxSize", "1024")\
    .config("spark.dynamicAllocation.enabled", "true")\
    .getOrCreate()
 #   .config("spark.driver.cores", 4)\
 #   .config("spark.driver.memory", "8g")\
    

In [31]:
customers_df = spark.read.csv(os.environ["STORAGE"]+'/cde-workshop/clickthrough/customers/data', header=True)

                                                                                

In [9]:
customers_df.select("customer_id").show()

+-----------+
|customer_id|
+-----------+
|          1|
|          2|
|          3|
|          4|
|          5|
|          6|
|          7|
|          8|
|          9|
|         10|
|         11|
|         12|
|         13|
|         14|
|         15|
|         16|
|         17|
|         18|
|         19|
|         20|
+-----------+
only showing top 20 rows



In [10]:
!pip3 install faker



In [11]:
customers_df.dtypes

[('customer_id', 'string'),
 ('username', 'string'),
 ('name', 'string'),
 ('gender', 'string'),
 ('email', 'string'),
 ('occupation', 'string'),
 ('birthdate', 'string'),
 ('address', 'string')]

In [12]:
max_current_cust_id = int(customers_df.select(F.max("customer_id")).collect()[0]['max(customer_id)'])

In [13]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

In [14]:
from faker import Faker
fake = Faker(seed=3)

In [15]:
!pip3 install pandas
import pandas as pd



In [16]:
appends = []
for i in range(5):
    appends.append(fake.profile())

In [17]:
appends_df = pd.DataFrame(appends)[["username", "name", "sex", "mail", "job", "birthdate", "address"]]

In [18]:
import random
import numpy as np

In [19]:
def make_batch_df(max_current_cust_id):
    batch_size = random.randint(100, 1000)
    appends = []
    for i in range(batch_size):
        appends.append(fake.profile())
    appends_df = pd.DataFrame(appends)[["username", "name", "sex", "mail", "job", "birthdate", "address"]]
    appends_df['customer_id'] = list(range(max_current_cust_id, max_current_cust_id+batch_size))
    return appends_df

In [20]:
make_batch_df(9999)

Unnamed: 0,username,name,sex,mail,job,birthdate,address,customer_id
0,mavila,Rhonda Johnson,F,johnwilliams@yahoo.com,"Therapist, drama",2000-09-12,Unit 3313 Box 5140\nDPO AA 02879,9999
1,kevin36,Carmen Griffin,F,dalemullins@yahoo.com,Trade mark attorney,1946-04-15,"991 Lauren Fort\nMossville, WA 57733",10000
2,jpineda,Allen Torres,M,kevinjenkins@gmail.com,Water engineer,1972-09-09,"3400 Charles Plain Apt. 285\nMorganbury, SC 04316",10001
3,krystalellis,Barbara Mitchell,F,jamesrios@gmail.com,Soil scientist,1908-02-10,"2788 Castillo Fall\nRebeccaview, HI 20055",10002
4,hancockrichard,George Parker,M,imooney@yahoo.com,"Scientist, research (maths)",1996-03-03,"1549 Conway Valleys\nSouth Johnfort, AL 43825",10003
...,...,...,...,...,...,...,...,...
354,qrice,Robert Hernandez,M,brandon09@gmail.com,Waste management officer,1912-05-13,"2263 Tina Springs\nNew Melissa, IN 52637",10353
355,djones,Kristi Henderson,F,isabelhowe@gmail.com,Heritage manager,1911-02-26,"429 Desiree Stream\nNew Jessicashire, SD 68761",10354
356,williamsjustin,Jeffrey Anderson,M,jcordova@yahoo.com,Translator,1937-01-13,"43439 Hodge Motorway\nMillerstad, NC 70033",10355
357,nsilva,Jennifer Jackson,F,ifoley@hotmail.com,Dietitian,1918-05-27,"5876 Brittany Cove\nNguyenbury, DE 38422",10356


In [37]:
ct_hist_df = spark.read.option("header","true").parquet("s3a://go01-demo/cde-workshop/clickthrough/historical")

                                                                                

In [86]:
hist_ded_ids = ct_hist_df.select("device_id").sample(.01).toPandas()

                                                                                

In [87]:
unique_arr = np.unique(hist_ded_ids.device_id)

In [88]:
unique_arr

array(['000032d7', '000070cc', '00038618', ..., 'ffff60f9', 'ffff9249',
       'ffffe321'], dtype=object)

In [90]:
hist_ded_ids = unique_arr[0:10000]
len(hist_ded_ids)

10000

In [32]:
customers_pd_df = customers_df.toPandas()

In [46]:
hist_ded_ids.count()

device_id    10000
dtype: int64

In [95]:
#customers_pd_df = customers_pd_df.drop(["devide_id"])
customers_pd_df["device_id"] = hist_ded_ids

In [98]:
customers_pd_df = customers_pd_df.drop("devide_id", axis=1)

In [100]:
final_spark_df_customers = spark.createDataFrame(customers_pd_df)

In [101]:
final_spark_df_customers.write.mode("overwrite").csv(os.environ["STORAGE"]+'/cde-workshop/clickthrough/customers/data', header=True)

                                                                                

In [102]:
verify_df = spark.read.csv(os.environ["STORAGE"]+'/cde-workshop/clickthrough/customers/data', header=True)

                                                                                

In [103]:
verify_df.show()

+-----------+--------------+--------------------+------+--------------------+--------------------+----------+--------------------+---------+
|customer_id|      username|                name|gender|               email|          occupation| birthdate|             address|device_id|
+-----------+--------------+--------------------+------+--------------------+--------------------+----------+--------------------+---------+
|          1|       robin48|       Jesse Spencer|     M|   udalton@yahoo.com|Pharmacist, commu...|1975-09-24|10305 Scott River...| 000032d7|
|          2|cynthiajackson|     Savannah Daniel|     F|walkerchristopher...|        Set designer|1934-09-28|70884 Andrew Plaz...| 000070cc|
|          3|       ydurham|     Alexander Davis|     M|annlindsey@yahoo.com|Plant breeder/gen...|1975-11-09|0365 Carrie Point...| 00038618|
|          4| murphymichael|      Patrick Cortez|     M| freeves@hotmail.com|Scientist, audiol...|1911-01-20|9864 Brian Walk S...| 000727b1|
|          5|

In [104]:
hist_ded_ids = ct_hist_df.select("device_id").sample(.01).toPandas()

                                                                                

In [105]:
unique_arr2 = np.unique(hist_ded_ids.device_id)

In [107]:
sample_device_ids = [i for i in unique_arr2 if i not in unique_arr]

In [108]:
len(sample_device_ids)

29261

In [113]:
array_df = pd.DataFrame(sample_device_ids, columns=['device_id'])
device_id_spark_df = spark.createDataFrame(array_df)

In [114]:
device_id_spark_df.write.csv(os.environ["STORAGE"]+'/cde-workshop/clickthrough/customers/data/device_ids', header=True)

                                                                                

In [115]:
def make_batch_df(max_current_cust_id):
    batch_size = random.randint(100, 1000)
    appends = []
    for i in range(batch_size):
        appends.append(fake.profile())
    appends_df = pd.DataFrame(appends)[["username", "name", "sex", "mail", "job", "birthdate", "address"]]
    appends_df['customer_id'] = list(range(max_current_cust_id, max_current_cust_id+batch_size))
    return appends_df

In [121]:
new_batch_pd_df = make_batch_df(max_current_cust_id)
new_batch_pd_df_count = new_batch_pd_df.count()

In [122]:
new_batch_spark_df = spark.createDataFrame(new_batch_pd_df)

In [124]:
new_batch_pd_df_count[0]

304

In [125]:
in_device_id_spark_df = spark.read.csv(os.environ["STORAGE"]+'/cde-workshop/clickthrough/customers/data/device_ids', header=True)

                                                                                

In [127]:
in_device_id_spark_df.limit(5).show()

[Stage 20:>                                                         (0 + 1) / 1]

+---------+
|device_id|
+---------+
| 7e00e93d|
| 7e01296d|
| 7e033a05|
| 7e04addc|
| 7e062ffd|
+---------+



                                                                                

In [None]:
spark.createDataFrame(new_batch_pd_df)