In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType
from pyspark.sql import functions as F

In [3]:
# Create a Spark session
spark = SparkSession.builder \
    .appName("TransformationsAndActions") \
    .getOrCreate()

In [4]:
# Define schema for the data
schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("transaction_amount", DoubleType(), True)
])

In [5]:
# Load the CSV file into a DataFrame
file_path = 'data.csv'
df = spark.read.csv(file_path, schema=schema, header=True)

In [7]:
# Show the DataFrame
print("Original DataFrame:")
df.show()

Original DataFrame:
+-------+------------------+
|user_id|transaction_amount|
+-------+------------------+
|      5|            545.92|
|     98|            386.07|
|     30|            577.02|
|     14|            479.49|
|     34|            834.78|
|     29|            531.64|
|     72|            275.41|
|     79|            272.48|
|     38|            365.55|
|     47|            902.76|
|      8|            816.28|
|     82|            610.81|
|     29|             68.13|
|     34|            568.34|
|     26|            367.73|
|     98|             118.9|
|     54|            179.38|
|     78|             31.64|
|     43|            743.28|
|     61|             667.6|
+-------+------------------+
only showing top 20 rows



In [8]:
# Transformations
print("Selected Columns:")
df_selected = df.select("user_id", "transaction_amount")
df_selected.show()

Selected Columns:
+-------+------------------+
|user_id|transaction_amount|
+-------+------------------+
|      5|            545.92|
|     98|            386.07|
|     30|            577.02|
|     14|            479.49|
|     34|            834.78|
|     29|            531.64|
|     72|            275.41|
|     79|            272.48|
|     38|            365.55|
|     47|            902.76|
|      8|            816.28|
|     82|            610.81|
|     29|             68.13|
|     34|            568.34|
|     26|            367.73|
|     98|             118.9|
|     54|            179.38|
|     78|             31.64|
|     43|            743.28|
|     61|             667.6|
+-------+------------------+
only showing top 20 rows



In [9]:
print("Filtered Rows (transaction_amount > 500):")
df_filtered = df.filter(df["transaction_amount"] > 500)
df_filtered.show()

Filtered Rows (transaction_amount > 500):
+-------+------------------+
|user_id|transaction_amount|
+-------+------------------+
|      5|            545.92|
|     30|            577.02|
|     34|            834.78|
|     29|            531.64|
|     47|            902.76|
|      8|            816.28|
|     82|            610.81|
|     34|            568.34|
|     43|            743.28|
|     61|             667.6|
|      5|            829.57|
|     63|            812.76|
|     39|            891.23|
|     23|            533.22|
|     79|             631.1|
|     66|             964.8|
|     27|            999.26|
|     20|            536.66|
|     61|            830.55|
|     42|            533.76|
+-------+------------------+
only showing top 20 rows



In [10]:
print("New Column with Tax (transaction_amount * 1.1):")
df_with_new_column = df.withColumn("transaction_with_tax", df["transaction_amount"] * 1.1)
df_with_new_column.show()

New Column with Tax (transaction_amount * 1.1):
+-------+------------------+--------------------+
|user_id|transaction_amount|transaction_with_tax|
+-------+------------------+--------------------+
|      5|            545.92|   600.5120000000001|
|     98|            386.07|             424.677|
|     30|            577.02|             634.722|
|     14|            479.49|   527.4390000000001|
|     34|            834.78|             918.258|
|     29|            531.64|   584.8040000000001|
|     72|            275.41|   302.9510000000001|
|     79|            272.48|  299.72800000000007|
|     38|            365.55|             402.105|
|     47|            902.76|   993.0360000000001|
|      8|            816.28|             897.908|
|     82|            610.81|             671.891|
|     29|             68.13|              74.943|
|     34|            568.34|   625.1740000000001|
|     26|            367.73|  404.50300000000004|
|     98|             118.9|  130.79000000000002|
| 

In [11]:
print("Grouped by user_id and Aggregated Sum of transaction_amount:")
df_grouped = df.groupBy("user_id").agg(F.sum("transaction_amount").alias("total_amount"))
df_grouped.show()

Grouped by user_id and Aggregated Sum of transaction_amount:
+-------+------------------+
|user_id|      total_amount|
+-------+------------------+
|     31|2734.0200000000004|
|     85|           4844.78|
|     65|           5884.35|
|     53|           4328.05|
|     78|           4119.06|
|     34|           8613.88|
|     81|           2671.48|
|     28| 9781.330000000002|
|     76|           5682.46|
|     26|           5087.62|
|     27|           5184.84|
|     44|            2529.5|
|     12| 4981.290000000001|
|     91|3996.2000000000007|
|     22|           6635.94|
|     93| 7866.179999999999|
|     47|1936.5099999999998|
|      1|           5392.26|
|     52|           4924.38|
|     13| 5036.950000000001|
+-------+------------------+
only showing top 20 rows



In [12]:
print("Sorted by transaction_amount Descending:")
df_sorted = df.orderBy("transaction_amount", ascending=False)
df_sorted.show()

Sorted by transaction_amount Descending:
+-------+------------------+
|user_id|transaction_amount|
+-------+------------------+
|     27|            999.26|
|     80|            998.23|
|     24|            997.41|
|     60|            996.26|
|     45|             995.6|
|     27|            993.26|
|     41|            992.91|
|     23|            990.36|
|     80|             990.2|
|     70|            989.24|
|     25|            989.11|
|     53|            987.16|
|     74|            986.07|
|     32|            985.87|
|     20|            985.44|
|     40|            985.34|
|     65|            985.21|
|     49|            982.93|
|     43|             982.3|
|    100|            981.91|
+-------+------------------+
only showing top 20 rows



In [13]:
# Actions
print("Collected Data:")
collected_data = df.collect()
print(collected_data)

Collected Data:
[Row(user_id=5, transaction_amount=545.92), Row(user_id=98, transaction_amount=386.07), Row(user_id=30, transaction_amount=577.02), Row(user_id=14, transaction_amount=479.49), Row(user_id=34, transaction_amount=834.78), Row(user_id=29, transaction_amount=531.64), Row(user_id=72, transaction_amount=275.41), Row(user_id=79, transaction_amount=272.48), Row(user_id=38, transaction_amount=365.55), Row(user_id=47, transaction_amount=902.76), Row(user_id=8, transaction_amount=816.28), Row(user_id=82, transaction_amount=610.81), Row(user_id=29, transaction_amount=68.13), Row(user_id=34, transaction_amount=568.34), Row(user_id=26, transaction_amount=367.73), Row(user_id=98, transaction_amount=118.9), Row(user_id=54, transaction_amount=179.38), Row(user_id=78, transaction_amount=31.64), Row(user_id=43, transaction_amount=743.28), Row(user_id=61, transaction_amount=667.6), Row(user_id=28, transaction_amount=487.26), Row(user_id=87, transaction_amount=178.81), Row(user_id=5, transa

In [14]:
print(f"Row Count: {df.count()}")

Row Count: 1000


In [15]:
print("First Two Rows:")
first_two_rows = df.take(2)
print(first_two_rows)

First Two Rows:
[Row(user_id=5, transaction_amount=545.92), Row(user_id=98, transaction_amount=386.07)]


In [16]:
print("Describe the DataFrame:")
df.describe().show()

Describe the DataFrame:
+-------+----------------+------------------+
|summary|         user_id|transaction_amount|
+-------+----------------+------------------+
|  count|            1000|              1000|
|   mean|            49.8| 502.6222700000001|
| stddev|28.4594789471266|283.06873920421117|
|    min|               1|              1.22|
|    max|             100|            999.26|
+-------+----------------+------------------+



In [17]:
print("First Row:")
first_row = df.first()
print(first_row)

First Row:
Row(user_id=5, transaction_amount=545.92)


In [18]:
print("Schema:")
df.printSchema()

Schema:
root
 |-- user_id: integer (nullable = true)
 |-- transaction_amount: double (nullable = true)



In [19]:
spark.stop()