### PySpark Project


In [1]:
#configuring spark
import findspark
findspark.init()
findspark.find()

'C:\\Program Files\\spark\\spark-3.4.1-bin-hadoop3'

In [2]:
from pyspark.sql import SparkSession
#starting application with two worker nodes
spark=SparkSession.builder.appName('FbPostAnalysis').master('local[2]').getOrCreate()
spark

In [3]:
df=spark\
    .read\
    .option('mode','FAILFAST')\
    .csv('../data/dataset_Facebook_cos.csv',header=True,inferSchema=True)
df.show()

+----------------+------+--------+----------+------------+---------+----+-------------------------+-------------------------------+----------------------+-----------------------+--------------------------+------------------------------------------------------------+------------------------------------------------+-------------------------------------------------------------------+-------+----+-----+------------------+
|Page total likes|  Type|Category|Post Month|Post Weekday|Post Hour|Paid|Lifetime Post Total Reach|Lifetime Post Total Impressions|Lifetime Engaged Users|Lifetime Post Consumers|Lifetime Post Consumptions|Lifetime Post Impressions by people who have liked your Page|Lifetime Post reach by people who like your Page|Lifetime People who have liked your Page and engaged with your post|comment|like|share|Total Interactions|
+----------------+------+--------+----------+------------+---------+----+-------------------------+-------------------------------+----------------------+--

In [4]:
df.printSchema()

root
 |-- Page total likes: integer (nullable = true)
 |-- Type: string (nullable = true)
 |-- Category: integer (nullable = true)
 |-- Post Month: integer (nullable = true)
 |-- Post Weekday: integer (nullable = true)
 |-- Post Hour: integer (nullable = true)
 |-- Paid: integer (nullable = true)
 |-- Lifetime Post Total Reach: integer (nullable = true)
 |-- Lifetime Post Total Impressions: integer (nullable = true)
 |-- Lifetime Engaged Users: integer (nullable = true)
 |-- Lifetime Post Consumers: integer (nullable = true)
 |-- Lifetime Post Consumptions: integer (nullable = true)
 |-- Lifetime Post Impressions by people who have liked your Page: integer (nullable = true)
 |-- Lifetime Post reach by people who like your Page: integer (nullable = true)
 |-- Lifetime People who have liked your Page and engaged with your post: integer (nullable = true)
 |-- comment: integer (nullable = true)
 |-- like: integer (nullable = true)
 |-- share: integer (nullable = true)
 |-- Total Interactio

In [5]:
# 1. The total number of posts made
df.count()

500

In [72]:
# 2. The percentage of the growth or decline of the page, in terms of likes
# (subscriptions on the page), from the first post to the latest post
import pyspark.sql.functions as f
first_post_likes=df.select('Page total likes').orderBy('Post Month','Post Weekday','Post Hour').first()['Page total likes']
last_post_likes=df.select('Page total likes').orderBy('Post Month','Post Weekday','Post Hour',ascending=False).first()['Page total likes']

# type(first_post_likes)
growth_percentage=f'{round((last_post_likes-first_post_likes)/last_post_likes*100)}%'
growth_percentage

'38%'

In [7]:
# 3. Which month, on average, has the highest number of post interactions?
df.groupBy('Post Month').agg(f.avg('Total Interactions').alias('Average Monthly Interaction')).orderBy(f.desc('Average Monthly Interaction')).show(3)

+----------+---------------------------+
|Post Month|Average Monthly Interaction|
+----------+---------------------------+
|         7|                      328.5|
|         9|                      278.5|
|         5|          256.2972972972973|
+----------+---------------------------+
only showing top 3 rows



In [8]:
# 5. Which hour of the day, on average, has the highest number of post interactions?
df.groupBy('Post Hour').agg(f.avg('Total Interactions').alias('Average Hourly Interaction')).orderBy(f.desc('Average Hourly Interaction')).show(3)

+---------+--------------------------+
|Post Hour|Average Hourly Interaction|
+---------+--------------------------+
|        5|         684.3076923076923|
|       14|        307.15384615384613|
|       20|                     280.0|
+---------+--------------------------+
only showing top 3 rows



In [9]:
# 6. Determine if paid (promoted) posts have a higher correlation with a large number
# of post shares when compared to the post shares of organic (non-promoted) posts.
# This is to determine the commercial viability of investing in paid posts for
# promoting cosmetic products. Answer with either a Yes or a No, and provide the
# methodology of how you reached your conclusion
df.groupBy('Paid').agg(
    f.avg('Lifetime Post Total Reach').alias('Total Lifetime Reach'),
    f.avg('Lifetime Post Consumers').alias('Total Lifetime Consumers'),
    f.avg('Lifetime Engaged Users').alias('Total Lifetime Engaged'),
    f.avg('share').alias('share'),
).show()

# df.where(df['Paid']==1).corr('Paid','share')
'''
Ans: Yes, Promoted posts have more shares than non-promoted shares.
'''

+----+--------------------+------------------------+----------------------+------------------+
|Paid|Total Lifetime Reach|Total Lifetime Consumers|Total Lifetime Engaged|             share|
+----+--------------------+------------------------+----------------------+------------------+
|null|              4188.0|                   524.0|                 564.0|              28.0|
|   1|  19288.158273381294|       938.0359712230215|     1106.525179856115| 32.50359712230216|
|   0|  11851.216666666667|       745.7638888888889|     849.4472222222222|25.219101123595507|
+----+--------------------+------------------------+----------------------+------------------+



'\nAns: Yes, Promoted posts have more shares than non-promoted shares.\n'

In [10]:
# 7. Which post type (photo, video, status, or link) is the most attractive to people who
# have subscribed to your page (people who have liked the page)?
df.groupBy('Type')\
    .agg(
    f.sum('Lifetime Post reach by people who like your Page').alias('Number of people who liked page')
)\
    .orderBy(f.desc('Number of people who liked page'))\
    .show()

+------+-------------------------------+
|  Type|Number of people who liked page|
+------+-------------------------------+
| Photo|                        2581178|
|Status|                         445886|
|  Link|                         143976|
| Video|                         121704|
+------+-------------------------------+



In [75]:
# 8. Which hour of the day is ideal for posting photographic content? Arrange the
# hours of the day according to the order of the Lifetime Post Impressions column?
df.groupBy('Post Hour')\
    .agg(
    f.avg('Lifetime Post Total Impressions').alias('Average Impressions')
)\
    .orderBy(f.desc('Average Impressions'))\
    .show()

+---------+-------------------+
|Post Hour|Average Impressions|
+---------+-------------------+
|        7|  98918.15384615384|
|       13| 52173.403846153844|
|       14|  41605.38461538462|
|        5|  40636.07692307692|
|        2| 35104.230769230766|
|       10|  32398.46153846154|
|        3| 26657.219047619048|
|       12| 25351.724137931036|
|        6|         24706.3125|
|       22|            24112.0|
|        4| 19454.428571428572|
|       17|            15683.0|
|       11| 12825.386363636364|
|        9| 12820.233333333334|
|       15|            10525.5|
|       20|             9970.0|
|       16|             9238.0|
|        1|            6632.75|
|       18|  6195.666666666667|
|        8|  5427.583333333333|
+---------+-------------------+
only showing top 20 rows



In [12]:
# 9. Create an additional column with the name Likes-to-comment Ratio, with the
# column values having the equation: likes to comment ratio = like/comment
# Hint: Make sure the ratio is in a decimal format, and correct it to 2 decimal places

df.select("*",f.round(f.col('like')/f.col('comment'),2).alias('Likes-to-comment Ratio')).show()

+----------------+------+--------+----------+------------+---------+----+-------------------------+-------------------------------+----------------------+-----------------------+--------------------------+------------------------------------------------------------+------------------------------------------------+-------------------------------------------------------------------+-------+----+-----+------------------+----------------------+
|Page total likes|  Type|Category|Post Month|Post Weekday|Post Hour|Paid|Lifetime Post Total Reach|Lifetime Post Total Impressions|Lifetime Engaged Users|Lifetime Post Consumers|Lifetime Post Consumptions|Lifetime Post Impressions by people who have liked your Page|Lifetime Post reach by people who like your Page|Lifetime People who have liked your Page and engaged with your post|comment|like|share|Total Interactions|Likes-to-comment Ratio|
+----------------+------+--------+----------+------------+---------+----+-------------------------+-----------

In [13]:
# 10. Arrange post categories (1,2,3) in the descending order of the reach that they
# can accumulate on average
df.groupBy('Type').agg(
   f.ceil(f.avg('Lifetime Post Total Reach')).alias('Average Reach')
)\
    .orderBy(f.desc('Average Reach'))\
    .show()

+------+-------------+
|  Type|Average Reach|
+------+-------------+
| Video|        51206|
|  Link|        18545|
| Photo|        13138|
|Status|        13079|
+------+-------------+



In [14]:
# 11. Determine the standard deviation of the average post reach for each of the day
# hours. This is to determine if the time of the day is an ideal criterion to identify
# when to create posts
df.groupBy('Post hour').agg(
    f.stddev(f.col('Lifetime Post Total Reach'))
).show()

+---------+--------------------------------------+
|Post hour|stddev_samp(Lifetime Post Total Reach)|
+---------+--------------------------------------+
|       12|                    16929.348611911955|
|       22|                                  null|
|        1|                    1668.8739467077794|
|       13|                    31605.113444715214|
|        6|                     19384.93338953391|
|       16|                                  null|
|        3|                     20062.48506691216|
|       20|                                  null|
|        5|                     48900.71604032409|
|       19|                                  null|
|       15|                     1875.010826635409|
|        9|                    12813.220059900956|
|       17|                    6172.8230170643965|
|        4|                    16179.952578173177|
|        8|                      2586.13118676364|
|       23|                                  null|
|        7|                    

In [15]:
# 12. Is there any correlation between the number of post consumptions and the
# total interactions on the post?
df.corr('Lifetime Post Consumptions','Total Interactions')
'''
Ans: There is very weak relationship.
'''

'\nAns: There is very weak relationship.\n'

In [78]:
# 13. Determine the two best days in a week to create posts, when people are
# extremely active on social media, based on the data that you have
# Hint: Question 13 can have a subjective answer. You are free to choose your own
# approach to determine the best days to post in a week. Make sure to validate your
# claims with the relevant code and explanation of your approach.
df.groupBy('Post Weekday').agg(
    f.count('Post Hour').alias('Average Hours'),
    f.avg('Total Interactions').alias('Average Interaction'),
    f.avg('comment').alias('Average Comment'),
    f.avg('like').alias('Average like'),
    f.avg('share').alias('Average share'),
).orderBy(f.desc('Average Hours'))\
    .show()
'''
Ans: According to results Monday and Thursday are two days best for social media posts interactions.
'''

+------------+-------------+-------------------+------------------+------------------+------------------+
|Post Weekday|Average Hours|Average Interaction|   Average Comment|      Average like|     Average share|
+------------+-------------+-------------------+------------------+------------------+------------------+
|           7|           82| 153.58536585365854| 4.012195121951219|128.78048780487805|           21.3125|
|           6|           81| 162.80246913580248| 5.296296296296297|          135.9375|           23.5375|
|           4|           72| 260.52777777777777|10.208333333333334|            218.75|31.569444444444443|
|           1|           68| 237.02941176470588| 7.926470588235294|204.33823529411765|24.764705882352942|
|           5|           67|  205.3134328358209|  6.08955223880597| 172.1492537313433|27.484848484848484|
|           2|           66| 200.45454545454547|  6.53030303030303|167.21212121212122| 26.71212121212121|
|           3|           64|         287.76562

'\nAns: According to results Monday and Thursday are two days best for social media posts interactions.\n'