In [33]:
pip install pyspark

Note: you may need to restart the kernel to use updated packages.


In [4]:
try:
    import pyspark
    print("pysprk is installed")
except ImportError as e:
    print("pyspark is not installed")

pysprk is installed


In [5]:
# Import necessary package for Spark Session
from pyspark.sql import SparkSession

In [6]:
# Import functions for DataFrame operations
from pyspark.sql import functions as F

In [7]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DoubleType, TimestampType

In [8]:
# Create a Spark Session
# SparkSession is the entry point for spark applications. It allows you to run SQL queries on database tables.
spark = SparkSession.builder.appName("VerifyInstallation").getOrCreate()

In [9]:
# To check whether spark is created.
print("Spark created successfully.", spark)

Spark created successfully. <pyspark.sql.session.SparkSession object at 0x000001E25D9F6720>


In [10]:
df = spark.read.csv("data.csv", header=True, inferSchema=True)

In [11]:
df.show()

+-------+------------------+
|user_id|transaction_amount|
+-------+------------------+
|      5|            545.92|
|     98|            386.07|
|     30|            577.02|
|     14|            479.49|
|     34|            834.78|
|     29|            531.64|
|     72|            275.41|
|     79|            272.48|
|     38|            365.55|
|     47|            902.76|
|      8|            816.28|
|     82|            610.81|
|     29|             68.13|
|     34|            568.34|
|     26|            367.73|
|     98|             118.9|
|     54|            179.38|
|     78|             31.64|
|     43|            743.28|
|     61|             667.6|
+-------+------------------+
only showing top 20 rows



In [12]:
#DataFrame Operations: Perform operations like filtering, selecting, grouping, and aggregating.

# Select specific columns
df_selected = df.select("user_id", "transaction_amount")


In [13]:
# Filter rows
df_filtered = df.filter(df["user_id"] > 10)

# Group by and aggregate
df_grouped = df.groupBy("user_id").agg(F.sum("transaction_amount").alias("sum_transaction_amount"))
df_grouped.show()

+-------+----------------------+
|user_id|sum_transaction_amount|
+-------+----------------------+
|     31|    2734.0200000000004|
|     85|               4844.78|
|     65|               5884.35|
|     53|               4328.05|
|     78|               4119.06|
|     34|               8613.88|
|     81|               2671.48|
|     28|     9781.330000000002|
|     76|               5682.46|
|     26|               5087.62|
|     27|               5184.84|
|     44|                2529.5|
|     12|     4981.290000000001|
|     91|    3996.2000000000007|
|     22|               6635.94|
|     93|     7866.179999999999|
|     47|    1936.5099999999998|
|      1|               5392.26|
|     52|               4924.38|
|     13|     5036.950000000001|
+-------+----------------------+
only showing top 20 rows



In [14]:
# SQL Queries: Register DataFrame as a temporary table and run SQL queries.

# Register the DataFrame as a temporary table
df.createOrReplaceTempView("my_table")

# Run a SQL query
sql_df = spark.sql("SELECT user_id, COUNT(*) FROM my_table GROUP BY user_id")
sql_df.show()

+-------+--------+
|user_id|count(1)|
+-------+--------+
|     31|       7|
|     85|       9|
|     65|      11|
|     53|      11|
|     78|       9|
|     34|      17|
|     81|       7|
|     28|      21|
|     76|       8|
|     26|      10|
|     27|       8|
|     44|       9|
|     12|       8|
|     91|       7|
|     22|      13|
|     93|      14|
|     47|       4|
|      1|       9|
|     52|       9|
|     13|       9|
+-------+--------+
only showing top 20 rows



In [15]:
spark.stop()