In [1]:
# Echo the `sc` handle so the cell output shows it is available.
# NOTE(review): presumably the SparkContext injected by the PySpark kernel — confirm.
sc

In [3]:
from pyspark.sql.types import * # StructField, StringType, ....
from pyspark.sql.functions import * # from_json, col

In [2]:
# Inputs parsed with spark.read.json further down, whatever their extension says
filename = "data_august.txt"
filename2 = "ab_test_lifestyle.csv"
# Item-id list, loaded further down as plain text lines
filename3 = "itemIds_582734.csv"

# Regular CSV, loaded further down with a header row
filename4 = "ab_primary.csv"

# Read First Data
### 1. Read from json into Dataframe
### 2. Add and delete columns
### 3. Change one column from string type to array type and explode on that column

In [4]:
# Load the first JSON source into a DataFrame and print the schema Spark inferred
json_reader = spark.read
df = json_reader.json(filename)
df.printSchema()

root
 |-- attributes: struct (nullable = true)
 |    |-- item_id: string (nullable = true)
 |    |-- primary_image_url: string (nullable = true)
 |    |-- secondary_image_urls: string (nullable = true)
 |-- productId: string (nullable = true)
 |-- productType: string (nullable = true)
 |-- tagSource: string (nullable = true)



In [5]:
# Flatten the nested `attributes` struct into top-level columns, then drop the struct.
# The struct field is addressed by its exact schema name ("item_id") so the lookup
# does not depend on Spark's case-insensitive name resolution being enabled.
attrs = df['attributes']
df = (
    df.withColumn("itemId", attrs['item_id'])
      .withColumn("primary_image_url", attrs['primary_image_url'])
      .withColumn("secondary_image_url", attrs['secondary_image_urls'])
      .drop("attributes")
)

In [6]:
# Turn the bracketed, comma-separated url string into an array column,
# then emit one row per array element and strip the double quotes around each url.
url_col = "secondary_image_url"
without_brackets = regexp_replace(col(url_col), "[\\[\\] ]", "")
df = df.withColumn(url_col, split(without_brackets, ","))
print(df.count(), " rows")  # row count before exploding

df = df.withColumn(url_col, explode(col(url_col)))
df = df.withColumn(url_col, regexp_replace(col(url_col), "[\"]", ""))
print(df.count(), " rows")  # row count after exploding

582726  rows
1916234  rows


In [8]:
# display one example of secondary url
# df.select(df.secondary_image_url).take(1)

# Read Second Data
### 1. Read from json into Dataframe
### 2. Display how many Null values are there if any
### 3. Filter data by joining with another dataframe of ids

In [9]:
# Load the second JSON source, lift the two `assetBag` fields to top-level
# columns, drop the struct, and report the row count.
df2 = spark.read.json(filename2)
asset_bag = df2['assetBag']
df2 = (
    df2.withColumn("assetUrl", asset_bag['assetUrl'])
       .withColumn("assetType", asset_bag['assetType'])
       .drop("assetBag")
)
print(df2.count())

999419


In [10]:
# Count missing (NULL) entries per column of df2.
# `isnan` only flags floating-point NaN and never matches NULL, so it cannot
# answer the "any None value?" question; `isNull` is the check that does.
print("Any none value?")
df2.select([count(when(col(c).isNull(), c)).alias(c) for c in df2.columns]).show()

Any none value?
+------+---------+--------+---------+
|itemId|productId|assetUrl|assetType|
+------+---------+--------+---------+
|     0|        0|       0|        0|
+------+---------+--------+---------+



In [11]:
# Read the id list as text (each line lands in a single `value` column) and keep
# only the df2 rows whose itemId appears in it; the last expression shows the count.
itemids = spark.read.text(filename3)
id_matches = col('a.itemId') == col('b.value')
df2 = df2.alias('a').join(itemids.alias('b'), id_matches, 'inner')
df2.count()

582734

# Read Third Standard CSV data

In [13]:
# Load the CSV using its first line as the header, and rename `assetType`
# to `assetType2` so it does not clash with the column of the same name in df2.
csv_reader = spark.read.option("header", True)
df3 = csv_reader.csv(filename4).withColumnRenamed('assetType', 'assetType2')

# Join all three dataframes together 
### 1. Filter by columns matching
### 2. Save unique item ids to txt

In [14]:
# Left-join the three frames on the shared (productId, itemId) key pair.
join_keys = ['productId', 'itemId']
df_joined = (
    df.join(df2, on=join_keys, how='left')
      .join(df3, on=join_keys, how='left')
)

In [15]:
# Keep rows where the secondary url equals the SECONDARY asset url from df2
# and the primary url equals the PRIMARY url from df3.
secondary_matches = (
    (df_joined['secondary_image_url'] == df_joined['assetUrl'])
    & (df_joined['assetType'] == 'SECONDARY')
)
primary_matches = (
    (df_joined['primary_image_url'] == df_joined['url'])
    & (df_joined['assetType2'] == 'PRIMARY')
)
final = df_joined.filter(secondary_matches & primary_matches)

In [16]:
# Write the matching item ids to disk as a single-column file.
# `df` was exploded to one row per secondary image url, so the same itemId can
# pass the filter several times; distinct() keeps each id once in the output.
final.select('itemId').distinct().toPandas().to_csv('unchanged_itmes.txt', index=False)