<a href="https://colab.research.google.com/github/maheshbabu-r/BIG_DATA/blob/main/FaceBookPosts_Analysis_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

!wget -q https://dlcdn.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz  # supress download output use -q

!tar -zxvf spark-3.1.2-bin-hadoop3.2.tgz | grep "something" 2>/dev/null #Suppress tar output ---| grep "something" 2>/dev/null--- add after file_name"

!pip -q install findspark

Collecting findspark
  Downloading findspark-1.4.2-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: findspark
Successfully installed findspark-1.4.2


In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"

import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark import SparkContext

# or import like this
from pyspark.sql.functions import *


sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))
spark = SparkSession.builder.appName("Analysis of Facebook Posts").getOrCreate()
print(spark.sparkContext.appName)

pyspark-shell


In [3]:
!wget -q https://raw.githubusercontent.com/maheshbabu-r/BIG_DATA/main/Hadoop%20Datasets/dataset_Facebook_cos.csv

In [4]:
df=spark.read.csv(header=True,inferSchema=True,path="/content/dataset_Facebook_cos.csv")
df.printSchema()
df.show(5)

root
 |-- Page total likes: integer (nullable = true)
 |-- Type: string (nullable = true)
 |-- Category: integer (nullable = true)
 |-- Post Month: integer (nullable = true)
 |-- Post Weekday: integer (nullable = true)
 |-- Post Hour: integer (nullable = true)
 |-- Paid: integer (nullable = true)
 |-- Lifetime Post Total Reach: integer (nullable = true)
 |-- Lifetime Post Total Impressions: integer (nullable = true)
 |-- Lifetime Engaged Users: integer (nullable = true)
 |-- Lifetime Post Consumers: integer (nullable = true)
 |-- Lifetime Post Consumptions: integer (nullable = true)
 |-- Lifetime Post Impressions by people who have liked your Page: integer (nullable = true)
 |-- Lifetime Post reach by people who like your Page: integer (nullable = true)
 |-- Lifetime People who have liked your Page and engaged with your post: integer (nullable = true)
 |-- comment: integer (nullable = true)
 |-- like: integer (nullable = true)
 |-- share: integer (nullable = true)
 |-- Total Interactio

In [5]:
# 1.	The total number of posts made
print("total number of posts : %a"%df.count())

total number of posts : 500


In [6]:
# 2.	The percentage of the growth or decline of the page, in terms of likes (subscriptions on the page),
#  from the first post to the latest post

present=df.collect()[0]["Page total likes"]
print("subscriptions at recent post : %a"%present)

past=df.collect()[-1]["Page total likes"]
print("subscriptions at first post : %a"%past)

growth=(present-past)*100/past

print("growth from first post  to recent post : %a"%growth)

subscriptions at recent post : 139441
subscriptions at first post : 81370
growth from first post  to recent post : 71.36659702593093


In [7]:
# 3.	Which month, on average, has the highest number of post interactions?
df.groupBy("Post Month").agg(round(mean("Total Interactions"),2).alias("avg_Interactions"))\
                                                    .orderBy(desc("avg_Interactions")).show()

+----------+----------------+
|Post Month|avg_Interactions|
+----------+----------------+
|         7|           328.5|
|         9|           278.5|
|         5|           256.3|
|         2|          242.04|
|         8|          225.38|
|         4|          217.52|
|        12|          201.34|
|        11|          185.76|
|        10|           182.9|
|         1|           160.6|
|         6|          157.71|
|         3|           97.06|
+----------+----------------+



In [8]:
# 4.	Which day of the week, on average, has the highest number of post interactions?
df.groupBy("Post Weekday").agg(round(mean("Total Interactions"),2).alias("avg_Interactions"))\
                                                    .orderBy(desc("avg_Interactions")).show()

+------------+----------------+
|Post Weekday|avg_Interactions|
+------------+----------------+
|           3|          287.77|
|           4|          260.53|
|           1|          237.03|
|           5|          205.31|
|           2|          200.45|
|           6|           162.8|
|           7|          153.59|
+------------+----------------+



In [9]:
# 5.Which hour of the day, on average, has the highest number of post interactions?
df.groupBy("Post Hour").agg(round(mean("Total Interactions"),2).alias("avg_Interactions"))\
                                                    .orderBy(desc("avg_Interactions")).show()

+---------+----------------+
|Post Hour|avg_Interactions|
+---------+----------------+
|        5|          684.31|
|       14|          307.15|
|       20|           280.0|
|       10|          250.91|
|       13|          245.02|
|        3|          228.59|
|        2|           191.1|
|        1|           181.0|
|       12|          179.48|
|        4|          168.09|
|       17|          157.33|
|        6|          157.13|
|        7|          148.46|
|       11|           146.2|
|       23|           135.0|
|        9|           133.1|
|       22|           125.0|
|        8|           90.25|
|       16|            84.0|
|       15|           62.67|
+---------+----------------+
only showing top 20 rows



In [10]:
# 6.	Determine if paid (promoted) posts have a higher correlation with a large number of post shares when compared to 
# the post shares of organic (non-promoted) posts. This is to determine the commercial viability of investing in 
# paid posts for promoting cosmetic products. Answer with either a Yes or a No, 
# and provide the methodology of how you reached your conclusion

paid=df.filter(col("Paid")==1).corr("share","Lifetime Post Impressions by people who have liked your Page","pearson")
print("correlation for Paid : %a"%paid)

organic=df.filter(col("Paid")==0).corr("share","Lifetime Post Impressions by people who have liked your Page","pearson")
print("correlation for Organic : %a"%organic)


correlation for Paid : 0.4366305485653235
correlation for Organic : 0.2400598323464855


In [11]:
# 7.	Which post type (photo, video, status, or link) is the most attractive to people who have
# subscribed to your page (people who have liked the page)?
df.groupBy("Type").agg(sum("Lifetime People who have liked your Page and engaged with your post").alias("Liked"))\
                                                    .orderBy(desc("Liked")).show()


+------+------+
|  Type| Liked|
+------+------+
| Photo|216112|
|Status| 77393|
| Video|  6856|
|  Link|  4632|
+------+------+



In [12]:
# 8.	Which hour of the day is ideal for posting photographic content? Arrange the hours of 
# the day according to the order of the Lifetime Post Impressions column?
df.filter(col("Type")=='Photo').groupBy("Post Hour").agg(round(mean("Lifetime Post Total Impressions"),2).alias("avg_Impressions"))\
                                                    .orderBy(desc("avg_Impressions")).show(24)

+---------+---------------+
|Post Hour|avg_Impressions|
+---------+---------------+
|        7|      111260.45|
|       13|       48764.96|
|       14|       46405.18|
|        5|       40636.08|
|        2|       39888.63|
|       10|       33738.12|
|       12|       25247.67|
|       22|        24112.0|
|        3|       22784.06|
|        4|       19796.96|
|        6|       19738.33|
|       17|        15683.0|
|        9|        11118.0|
|       20|         9970.0|
|       15|         9678.8|
|       16|         9238.0|
|       11|        8649.12|
|        1|        6632.75|
|       18|        6195.67|
|        8|        5427.58|
|       23|         5058.0|
|       19|          570.0|
+---------+---------------+



In [13]:
# 9.	Create an additional column with the name Likes-to-comment Ratio, 
# with the column values having the equation:
df.withColumn("Likes-to-comment Ratio",(round((df.like/df.comment),2))).show(5)

+----------------+------+--------+----------+------------+---------+----+-------------------------+-------------------------------+----------------------+-----------------------+--------------------------+------------------------------------------------------------+------------------------------------------------+-------------------------------------------------------------------+-------+----+-----+------------------+----------------------+
|Page total likes|  Type|Category|Post Month|Post Weekday|Post Hour|Paid|Lifetime Post Total Reach|Lifetime Post Total Impressions|Lifetime Engaged Users|Lifetime Post Consumers|Lifetime Post Consumptions|Lifetime Post Impressions by people who have liked your Page|Lifetime Post reach by people who like your Page|Lifetime People who have liked your Page and engaged with your post|comment|like|share|Total Interactions|Likes-to-comment Ratio|
+----------------+------+--------+----------+------------+---------+----+-------------------------+-----------

In [14]:
# 10.	Arrange post categories (1,2,3) in the descending order of the reach that they can accumulate on average
df.groupBy("Category").agg(round(mean("Lifetime Post Total Reach"),2).alias("avg_reach"))\
                                                    .orderBy(desc("avg_reach")).show()

+--------+---------+
|Category|avg_reach|
+--------+---------+
|       1| 18320.98|
|       3| 11162.15|
|       2|  9865.65|
+--------+---------+



In [15]:
# 11.	Determine the standard deviation of the average post reach for each of the day hours. 
# This is to determine if the time of the day is an ideal criterion to identify when to create posts
df.groupBy("Post Weekday","Post Hour").agg(round(stddev("Lifetime Post Total Reach"),2).alias("stddev_reach"))\
                                                    .orderBy(desc("stddev_reach")).show(7*24)


+------------+---------+------------+
|Post Weekday|Post Hour|stddev_reach|
+------------+---------+------------+
|           3|        5|   124546.96|
|           1|       14|    71689.31|
|           7|        2|    59325.91|
|           7|       14|    55508.16|
|           6|       13|    54104.77|
|           2|        4|    44556.21|
|           2|       13|    39102.89|
|           5|        9|    38468.53|
|           5|       13|    33614.37|
|           1|        2|    33410.99|
|           6|       10|    31941.17|
|           2|        3|    30660.76|
|           2|        6|    30108.81|
|           4|       10|     29408.5|
|           6|       12|    26002.98|
|           5|        3|    25241.08|
|           4|        7|     25075.3|
|           2|       10|    24740.35|
|           3|        3|     24109.5|
|           3|       10|    23685.78|
|           2|       12|    23681.55|
|           4|        5|    21693.55|
|           4|        3|    21292.99|
|           

In [16]:
# 12.Is there any correlation between the number of post consumptions and the total interactions on the post? 
cor_rel=df.corr("Lifetime Post Consumptions","Total Interactions","pearson")
print("positive relationship : %a" %cor_rel)

positive relationship : 0.2380821957904635
