In [43]:
from pyspark.sql import SparkSession
import getpass
# Resolve the current OS login so the per-user HDFS path below is not hard-coded.
username = getpass.getuser()

# Build (or reuse) the Hive-enabled Spark session for this notebook, running on YARN.
spark = (
    SparkSession.builder
    .appName("week_9")
    # Port "0" asks Spark to bind the UI to any free port.
    .config("spark.ui.port", "0")
    # Warehouse directory is derived from the logged-in user; the original
    # f-string had no placeholder and pinned a single user id.
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    # 146800640 bytes = 140 MiB maximum per file-read partition.
    .config("spark.sql.files.maxPartitionBytes", "146800640")
    .config("spark.shuffle.useOldFetchProtocol", "true")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

In [2]:
#creating schema for reading the json file
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType

In [3]:
# Explicit schema for the users JSON files: five scalar columns, an array of
# phone-number strings, and a nested address struct with four string fields.
# NOTE(review): the previewed data contains rows whose address fields are all
# null even though they are declared nullable=False here -- confirm whether
# the JSON reader is expected to enforce these flags.
users_schema = (
    StructType()
    .add("user_id", IntegerType(), nullable=False)
    .add("user_first_name", StringType(), nullable=False)
    .add("user_last_name", StringType(), nullable=False)
    .add("user_email", StringType(), nullable=False)
    .add("user_gender", StringType(), nullable=False)
    .add("user_phone_numbers", ArrayType(StringType()), nullable=True)
    .add(
        "user_address",
        StructType()
        .add("street", StringType(), nullable=False)
        .add("city", StringType(), nullable=False)
        .add("state", StringType(), nullable=False)
        .add("postal_code", StringType(), nullable=False),
        nullable=False,
    )
)

In [4]:
# Read every file under the users directory as JSON, applying the explicit
# schema rather than letting Spark infer one.
df = spark.read.schema(users_schema).json("/public/sms/users/*")

In [5]:
# Preview the loaded rows without truncating long column values.
df.show(truncate = False)

+-------+---------------+--------------+--------------------------------+-----------+------------------------------------------------------------+----------------------------------------------------------+
|user_id|user_first_name|user_last_name|user_email                      |user_gender|user_phone_numbers                                          |user_address                                              |
+-------+---------------+--------------+--------------------------------+-----------+------------------------------------------------------------+----------------------------------------------------------+
|200001 |Eirena         |Cutsforth     |ecutsforth0@wisc.edu            |Female     |[4197404036, 9173828061, 4027467939, 3371963735]            |{8 Warrior Drive, Dallas, Texas, 75358}                   |
|200002 |Marja          |Shopcott      |mshopcott1@hexun.com            |Female     |[9542037028, 2128289053, 4068088779, 2621495368, 9038414778]|{66 Prairieview Terrace, Jolie

In [6]:
# Number of partitions of the RDD underlying the users DataFrame.
df.rdd.getNumPartitions()

3

# Question 2
#### Do some basic analysis and find out the following

a. total number of records in the dataframe

In [7]:
# Count every row in the users DataFrame (triggers a full Spark job) and print it.
print("Total records in the dataframe : ", df.count())

Total records in the dataframe :  1000000


b. how many users are from the state New York

In [8]:
# Inspect the nested address struct on its own to see which fields it carries.
df.select("user_address").show(truncate = False)

+----------------------------------------------------------+
|user_address                                              |
+----------------------------------------------------------+
|{8 Warrior Drive, Dallas, Texas, 75358}                   |
|{66 Prairieview Terrace, Joliet, Illinois, 60435}         |
|{18 Ronald Regan Hill, Shawnee Mission, Kansas, 66225}    |
|{7696 Calypso Junction, Saint Paul, Minnesota, 55166}     |
|{942 Emmet Park, Albuquerque, New Mexico, 87110}          |
|{664 Ridge Oak Circle, Virginia Beach, Virginia, 23454}   |
|{4 Continental Drive, Young America, Minnesota, 55557}    |
|{44698 Arapahoe Park, Tucson, Arizona, 85725}             |
|{null, null, null, null}                                  |
|{9430 Reinke Park, North Las Vegas, Nevada, 89087}        |
|{688 Granby Point, Warren, Ohio, 44485}                   |
|{3997 Russell Parkway, North Little Rock, Arkansas, 72199}|
|{89 Fair Oaks Terrace, Shawnee Mission, Kansas, 66220}    |
|{3 Green Pass, Fort Way

In [9]:
from pyspark.sql.functions import col,size

# Unnesting the JSON file

In [10]:
# Flatten the nested address struct into top-level columns and add a count of
# phone numbers per user. Field paths use the schema's exact lowercase names
# ("user_address", not "user_Address") so they also resolve when
# spark.sql.caseSensitive is enabled. All original columns are kept, followed
# by the new ones in the same order as before.
df_unnest = df.select(
    "*",
    col("user_address.street").alias("user_street"),
    col("user_address.city").alias("user_city"),
    col("user_address.state").alias("user_state"),
    col("user_address.postal_code").alias("user_postal_code"),
    # Number of entries in the phone-number array. In this notebook's recorded
    # run, users without a phone list show up as -1 here.
    size(col("user_phone_numbers")).alias("num_phn_numbers"),
)

In [11]:
# Preview the flattened DataFrame, including the derived address and phone-count columns.
df_unnest.show(truncate = False)

+-------+---------------+--------------+--------------------------------+-----------+------------------------------------------------------------+----------------------------------------------------------+----------------------+-----------------+----------+----------------+---------------+
|user_id|user_first_name|user_last_name|user_email                      |user_gender|user_phone_numbers                                          |user_address                                              |user_street           |user_city        |user_state|user_postal_code|num_phn_numbers|
+-------+---------------+--------------+--------------------------------+-----------+------------------------------------------------------------+----------------------------------------------------------+----------------------+-----------------+----------+----------------+---------------+
|200001 |Eirena         |Cutsforth     |ecutsforth0@wisc.edu            |Female     |[4197404036, 9173828061, 4027467939, 33719

In [12]:
# Keep only the state column, then count the rows whose state is exactly "New York".
df_state = df_unnest.select("user_state")
df_state.where(col("user_state") == "New York").count()

49576

# c. which state has the maximum number of postal codes

In [14]:
from pyspark.sql.functions import countDistinct

In [15]:
# Distinct postal codes per state, largest first.
df_postal = df_unnest.select("user_state", "user_postal_code")

# Alias the aggregate explicitly instead of sorting/renaming by Spark's
# auto-generated column name, which is not the same across Spark versions
# (e.g. "count(user_postal_code)" vs "count(DISTINCT user_postal_code)").
temp = (
    df_postal.groupBy("user_state")
    .agg(countDistinct("user_postal_code").alias("zip_codes"))
    .sort("zip_codes", ascending=False)
)
temp.show()

+--------------------+---------+
|          user_state|zip_codes|
+--------------------+---------+
|          California|      206|
|               Texas|      205|
|             Florida|      155|
|            New York|      104|
|                Ohio|       68|
|            Virginia|       66|
|District of Columbia|       60|
|        Pennsylvania|       60|
|             Georgia|       54|
|            Illinois|       47|
|      North Carolina|       46|
|            Colorado|       43|
|             Indiana|       40|
|             Arizona|       40|
|            Missouri|       40|
|             Alabama|       39|
|           Minnesota|       39|
|           Tennessee|       37|
|          Washington|       37|
|           Louisiana|       36|
+--------------------+---------+
only showing top 20 rows



# d. which city has the largest number of users

In [16]:
# Distinct users per city, largest first, ignoring rows with no city.
users = df_unnest.select("user_id", "user_city")

userrs = (
    # Drop missing cities with an explicit null check. The previous
    # `user_city != "null"` compared against the *string* "null" and only
    # removed real nulls as a side effect of SQL three-valued logic.
    users.filter(col("user_city").isNotNull())
    .groupBy("user_city")
    # Alias the aggregate directly rather than relying on Spark's
    # auto-generated column name, which varies between Spark versions.
    # NOTE(review): the label says "per_state" but the grouping is per city;
    # the name is kept unchanged so the output schema stays the same.
    .agg(countDistinct("user_id").alias("no_of_people_per_state"))
    .sort("no_of_people_per_state", ascending=False)
)
userrs.show()

+-------------+----------------------+
|    user_city|no_of_people_per_state|
+-------------+----------------------+
|   Washington|                 28504|
|      Houston|                 18098|
|New York City|                 15546|
|      El Paso|                 14740|
|       Dallas|                 11842|
|      Atlanta|                 11634|
|   Sacramento|                  9983|
|  Los Angeles|                  9679|
|  Kansas City|                  9020|
|        Miami|                  9016|
|      Chicago|                  8553|
|  Springfield|                  8101|
| Philadelphia|                  8091|
|  San Antonio|                  8026|
|      Phoenix|                  7652|
|Oklahoma City|                  7609|
|       Austin|                  7574|
|    Charlotte|                  7550|
|    San Diego|                  7396|
|     Richmond|                  7214|
+-------------+----------------------+
only showing top 20 rows



# e. how many users have email domain as bizjournals.com

In [17]:
# Users whose e-mail *domain* is exactly bizjournals.com.
# Anchoring on "@bizjournals.com" at the end of the address avoids the false
# positives of a substring match ("%bizjournals.com%"), which would also accept
# addresses such as x@notbizjournals.com or x@bizjournals.com.au.
email = df_unnest.select("user_id", "user_email").filter(
    col("user_email").endswith("@bizjournals.com")
)
email.show(truncate = False)

+-------+-----------------------------+
|user_id|user_email                   |
+-------+-----------------------------+
|200385 |crowntreeao@bizjournals.com  |
|200490 |kgallymoredl@bizjournals.com |
|200524 |okegginsej@bizjournals.com   |
|201220 |banchorxv@bizjournals.com    |
|201904 |mfraczak1gv@bizjournals.com  |
|203138 |ohaskur2f5@bizjournals.com   |
|203271 |drae2iu@bizjournals.com      |
|203517 |mtwallin2po@bizjournals.com  |
|203577 |dkendrew2rc@bizjournals.com  |
|203700 |jmerriment2ur@bizjournals.com|
|203806 |hsherrum2xp@bizjournals.com  |
|205841 |xpelfer4i8@bizjournals.com   |
|205912 |ccowden4k7@bizjournals.com   |
|208610 |hrackstraw6n5@bizjournals.com|
|209004 |rsodor6y3@bizjournals.com    |
|209440 |ktownson7a7@bizjournals.com  |
|209602 |tlenoury7ep@bizjournals.com  |
|210206 |zjohnstone7vh@bizjournals.com|
|210341 |kaskell7z8@bizjournals.com   |
|210399 |jstolli80u@bizjournals.com   |
+-------+-----------------------------+
only showing top 20 rows



In [18]:
# Number of users matched by the bizjournals.com e-mail filter.
email.count()

2015

# f. how many users have 4 phone numbers mentioned

In [19]:
# Users whose phone-number list holds exactly four entries.
ph_num = df_unnest.where(col("num_phn_numbers") == 4).select("user_id", "num_phn_numbers")
ph_num.count()

179041

# g. how many users do not have any phone number mentioned

In [20]:
# Users with no phone number at all.
# size() reports a missing (null) array as -1 only under Spark's legacy
# setting; with spark.sql.legacy.sizeOfNull=false or ANSI mode it yields NULL
# instead, and an empty array yields 0. Treat all three as "no phone number"
# so the count does not depend on session configuration.
ph_num = df_unnest.select("user_id", "num_phn_numbers").filter(
    col("num_phn_numbers").isNull() | (col("num_phn_numbers") <= 0)
)
ph_num.count()

108981

# Question 3
## Write the data from the base dataframe as it is to the disk, but write in parquet format. Observe the number of files created, also the size of files

In [21]:
# customers_folder_write

In [None]:
# df_unnest.write.format("parquet").mode("overwrite").partitionBy("user_state", "user_city").option("path", "/user/itv009033/customers_folder_write").save()

# Question 5
* Create a pivot where states are the rows and user_gender values are the columns.

In [22]:
# Input for the pivot: gender and state of every user that has at least one phone number.
df_pvt = df_unnest.where(col("num_phn_numbers") > 0).select("user_gender", "user_state")

In [23]:
# Preview the gender/state pairs that feed the pivot.
df_pvt.show()

+-----------+----------+
|user_gender|user_state|
+-----------+----------+
|     Female|     Texas|
|     Female|  Illinois|
|     Female|    Kansas|
|     Female| Minnesota|
|       Male|New Mexico|
|     Female|  Virginia|
|       Male| Minnesota|
|       Male|   Arizona|
|       Male|    Nevada|
|       Male|      Ohio|
|     Female|  Arkansas|
|     Female|    Kansas|
|     Female|   Indiana|
|     Female|California|
|     Female|   Florida|
|     Female|   Montana|
|     Female|     Texas|
|       Male|  Missouri|
|     Female|California|
|       Male| Minnesota|
+-----------+----------+
only showing top 20 rows



In [24]:
# Row count for each (state, gender) pair, largest first, exposed as column "MF".
df_count = (
    df_pvt.groupBy("user_state", "user_gender")
    .count()
    .orderBy("count", ascending=False)
    .withColumnRenamed("count", "MF")
)
df_count.show()

+--------------------+-----------+-----+
|          user_state|user_gender|   MF|
+--------------------+-----------+-----+
|          California|       Male|49120|
|               Texas|       Male|48786|
|          California|     Female|48716|
|               Texas|     Female|48450|
|             Florida|       Male|36692|
|             Florida|     Female|36688|
|            New York|       Male|25078|
|            New York|     Female|24498|
|                Ohio|       Male|16322|
|                Ohio|     Female|16239|
|            Virginia|       Male|15849|
|            Virginia|     Female|15607|
|District of Columbia|     Female|14292|
|        Pennsylvania|       Male|14270|
|        Pennsylvania|     Female|14237|
|District of Columbia|       Male|14212|
|             Georgia|     Female|13028|
|             Georgia|       Male|13008|
|            Illinois|     Female|11267|
|            Illinois|       Male|11178|
+--------------------+-----------+-----+
only showing top

In [25]:
# Check column names and types of the per-state/per-gender counts before pivoting.
df_count.printSchema()

root
 |-- user_state: string (nullable = true)
 |-- user_gender: string (nullable = true)
 |-- MF: long (nullable = false)



In [26]:
# Pivot: one row per state, one column per distinct user_gender value,
# each cell holding the summed "MF" count for that state/gender.
result = (
    df_count
    .groupBy("user_state")
    .pivot("user_gender")
    .sum("MF")
)
result.show()

+--------------------+------+-----+
|          user_state|Female| Male|
+--------------------+------+-----+
|                Utah|  4171| 4073|
|              Hawaii|  2062| 2172|
|           Minnesota|  9250| 9371|
|                Ohio| 16239|16322|
|            Arkansas|  2416| 2420|
|              Oregon|  3841| 3899|
|               Texas| 48450|48786|
|        North Dakota|   940|  981|
|        Pennsylvania| 14237|14270|
|         Connecticut|  5917| 5797|
|             Vermont|   237|  227|
|            Nebraska|  3688| 3501|
|              Nevada|  6495| 6317|
|          Washington|  8755| 8812|
|            Illinois| 11267|11178|
|            Oklahoma|  6913| 6888|
|District of Columbia| 14292|14212|
|            Delaware|  1654| 1651|
|              Alaska|  1938| 1882|
|          New Mexico|  2745| 2804|
+--------------------+------+-----+
only showing top 20 rows



In [30]:
# Persist the pivot as CSV, one sub-directory per state, replacing any previous output.
(
    result.write
    .partitionBy("user_state")
    .mode("overwrite")
    .csv("/user/itv009033/pivot_assignment_result")
)

In [44]:
# Load the airlines dataset as CSV with reader defaults (no header option, no
# schema), so columns come back with auto-generated names (_c0, _c1, ...).
new_df = spark.read.csv("/public/airlines_all/airlines")

In [45]:
# Preview the airlines rows as loaded.
new_df.show()

+----+---+---+---+----+----+----+----+---+----+------+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
| _c0|_c1|_c2|_c3| _c4| _c5| _c6| _c7|_c8| _c9|  _c10|_c11|_c12|_c13|_c14|_c15|_c16|_c17|_c18|_c19|_c20|_c21|_c22|_c23|_c24|_c25|_c26|_c27|_c28|_c29|_c30|
+----+---+---+---+----+----+----+----+---+----+------+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
|2001|  8|  2|  4|  NA|1047|  NA|1222| AA|1056|�NKNO�|  NA|  95|  NA|  NA|  NA| MCI| ORD| 403|   0|   0|   1|  NA|   0|  NA|  NA|  NA|  NA|  NA| YES| YES|
|2001|  8|  3|  5|1048|1047|1210|1222| AA|1056|N274A1|  82|  95|  66| -12|   1| MCI| ORD| 403|   6|  10|   0|  NA|   0|  NA|  NA|  NA|  NA|  NA|  NO| YES|
|2001|  8|  4|  6|1043|1047|1159|1222| AA|1056|N513A1|  76|  95|  61| -23|  -4| MCI| ORD| 403|   4|  11|   0|  NA|   0|  NA|  NA|  NA|  NA|  NA|  NO|  NO|
|2001|  8|  5|  7|1043|1047|1203|1222| AA|1056|N532A1|  80|  95|  65| 

In [46]:
# Number of partitions of the RDD underlying the airlines DataFrame.
new_df.rdd.getNumPartitions()

960