In [1]:
# Configure the environment variables PySpark reads when it starts up.
import os
import sys

# Use the local Spark install only as a fallback: a SPARK_HOME that is already
# configured on this machine is respected instead of being overwritten by a
# path that exists only on the original author's Windows box.
os.environ.setdefault('SPARK_HOME', r"C:\spark\spark-3.4.2-bin-hadoop3")
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
# Point Spark's Python workers at the interpreter running this kernel so the
# driver and workers cannot end up on different Python versions. A bare
# 'python' is resolved through PATH and may be a different interpreter.
# Falls back to 'python' if the interpreter path is unavailable.
os.environ['PYSPARK_PYTHON'] = sys.executable or 'python'

In [2]:
# Import PySpark
from pyspark.sql import SparkSession

In [3]:
# Build the SparkSession used by every later cell in this notebook.
spark = (
    SparkSession.builder
    .appName("social_media_Processing")
    .getOrCreate()
)

In [4]:
# Path to the source CSV (relative to the notebook's working directory).
dataset_path = "data/Average Time Spent By A User On Social Media.csv"

# Load the CSV into a DataFrame: the first row supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.options(header=True, inferSchema=True).csv(dataset_path)

In [5]:
# Print the column names and inferred types of the loaded DataFrame.
df.printSchema()

# Preview the first 5 rows without truncating long cell values.
df.show(n=5, truncate=False)

root
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- time_spent: integer (nullable = true)
 |-- platform: string (nullable = true)
 |-- interests: string (nullable = true)
 |-- location: string (nullable = true)
 |-- demographics: string (nullable = true)
 |-- profession: string (nullable = true)
 |-- income: integer (nullable = true)
 |-- indebt: boolean (nullable = true)
 |-- isHomeOwner: boolean (nullable = true)
 |-- Owns_Car: boolean (nullable = true)

+---+----------+----------+---------+---------+--------------+------------+-----------------+------+------+-----------+--------+
|age|gender    |time_spent|platform |interests|location      |demographics|profession       |income|indebt|isHomeOwner|Owns_Car|
+---+----------+----------+---------+---------+--------------+------------+-----------------+------+------+-----------+--------+
|56 |male      |3         |Instagram|Sports   |United Kingdom|Urban       |Software Engineer|19774 |true  |false      |f

In [6]:
from pyspark.sql.functions import sum

In [7]:
# Sum `time_spent` per platform and rank platforms from most to least time.
# Note: `sum` here is pyspark.sql.functions.sum (imported above), not the builtin.
total_time_spent_df = (
    df.groupBy("platform")
    .agg(sum("time_spent").alias("total_hours_spent"))
    .orderBy("total_hours_spent", ascending=False)  # largest total first
)


In [8]:
# Display the per-platform totals (show()'s defaults, spelled out).
total_time_spent_df.show(n=20, truncate=True)

+---------+-----------------+
| platform|total_hours_spent|
+---------+-----------------+
|Instagram|             1870|
|  YouTube|             1607|
| Facebook|             1552|
+---------+-----------------+



In [9]:
from pyspark.sql.functions import count

In [10]:
# Count the rows for every (platform, interest) pair.
# count("*") counts rows, so a group whose `interests` key is null is reported
# with its real size; count("interests") skips nulls and would report 0 for it.
# Each row is treated as one user — TODO confirm the dataset has one row per user.
number_of_users_per_interest = (
    df.groupBy("platform", "interests")
    .agg(count("*").alias("no_of_users"))
    # Sort by interest, then by user count, both in descending order.
    .orderBy("interests", "no_of_users", ascending=False)
)

In [11]:
# Display the user counts per platform and interest (show()'s defaults, spelled out).
number_of_users_per_interest.show(n=20, truncate=True)

+---------+---------+-----------+
| platform|interests|no_of_users|
+---------+---------+-----------+
|  YouTube|   Travel|        124|
|Instagram|   Travel|        108|
| Facebook|   Travel|         96|
|Instagram|   Sports|        125|
| Facebook|   Sports|        119|
|  YouTube|   Sports|         87|
|Instagram|Lifestlye|        130|
|  YouTube|Lifestlye|        119|
| Facebook|Lifestlye|         92|
+---------+---------+-----------+



In [12]:
# Count the rows in each demographics group.
# count("*") counts rows, so a group whose `demographics` key is null is
# reported with its real size; count("demographics") skips nulls and would
# report 0 for it.
# Each row is treated as one user — TODO confirm the dataset has one row per user.
number_of_users_per_demographics = (
    df.groupBy("demographics")
    .agg(count("*").alias("no_of_users"))
    .orderBy("no_of_users", ascending=False)  # largest group first
)

In [13]:
# Display the user counts per demographics group (show()'s defaults, spelled out).
number_of_users_per_demographics.show(n=20, truncate=True)

+------------+-----------+
|demographics|no_of_users|
+------------+-----------+
|       Rural|        340|
|   Sub_Urban|        335|
|       Urban|        325|
+------------+-----------+



In [14]:
# Stop the Spark session now that the analysis is finished. Cells that use
# `spark` or DataFrames built from it will not run again until the session
# cell has been re-executed.
spark.stop()