In [0]:
from pyspark import SparkContext

sc = SparkContext.getOrCreate()

data = [
    ["C00295", "Rajesh", "Kapur", 38, "Artist"],
    ["C00296", "Ben", "Smith", 39, "Banker"],
    ["C00297", "Rohn", "Couper", 30, "Painter"],
    ["C00298", "Rohn", "Couper", 30, "Painter"],
    ["C00299", "John", "Doe", 45, "Engineer"],
    ["C00299", "Roman", "Adam", 45, "Accountant"],
    ["C00299", "Robert", "Allen", 45, "Teacher"],
    ["C00299", "Rowan", "Anderson", 45, "Driver"],
    ["C00299", "River", "Baker", 45, "Driver"]

 ]

#Prints Number of records
rdd = sc.parallelize(data)

filtered_rdd = rdd.filter(lambda name: name[1].startswith("R"))

count = filtered_rdd.count()

print(count)

7


In [0]:
#Prints all name starting with R
rdd = sc.parallelize(data)

filtered_rdd = rdd.filter(lambda name: name[1].startswith("R"))

# Collect the filtered RDD into a list
collected_rdd = filtered_rdd.collect()

# Print each name starting with R
for name in collected_rdd:
    print(name[1])

Rajesh
Rohn
Rohn
Roman
Robert
Rowan
River


In [0]:
!pip install tabulate

Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0
You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-6fdf71db-c2a7-46ea-aff2-58fbb7888afc/bin/python -m pip install --upgrade pip' command.[0m


In [0]:
#Prints entire row where name starts with R
import tabulate
# Print the entire row
#rdd = sc.parallelize(data)
filtered_rdd = rdd.filter(lambda name: name[1].startswith("R"))
headers = ["CustID", "Fname", "Lname", "Age", "Profession"]
# Print the filtered data rows in a tabular format
print(tabulate.tabulate(filtered_rdd.collect(), headers=headers))

CustID    Fname    Lname       Age  Profession
--------  -------  --------  -----  ------------
C00295    Rajesh   Kapur        38  Artist
C00297    Rohn     Couper       30  Painter
C00298    Rohn     Couper       30  Painter
C00299    Roman    Adam         45  Accountant
C00299    Robert   Allen        45  Teacher
C00299    Rowan    Anderson     45  Driver
C00299    River    Baker        45  Driver


In [0]:
#Joining and Preparing DataFrames for Student and Marks Data in PySpark
from pyspark.sql import SparkSession

# Create SparkSession
spark = SparkSession.builder.appName("StudentMarksJoin").getOrCreate()

In [0]:
# Create Student DataFrame from data
student_data = [
    (1, "John", "Smith", "Male", 18),
    (2, "Emma", "Johnson", "Female", 19),
    (3, "Michael", "Davis", "Male", 20),
    (4, "Sophia", "Brown", "Female", 18),
    (5, "Daniel", "Miller", "Male", 19),
]

student_df = spark.createDataFrame(student_data,
                                  schema=["StudentID", "FirstName", "LastName", "Gender", "Age"])

In [0]:
# Create Marks DataFrame from data
marks_data = [
    (1, "Math", 85),
    (2, "Science", 92),
    (3, "English", 78),
    (4, "Math", 89),
    (5, "Science", 75),
    (6, "English", 94),
    (7, "Math", 80),
    (8, "Science", 88),
]

marks_df = spark.createDataFrame(marks_data, schema=["StudentID", "Subject", "Marks"])

In [0]:
#Join DataFrames:
# Inner Join: Students with their marks
joined_df = student_df.join(marks_df, "StudentID", "inner")
joined_df.show()

+---------+---------+--------+------+---+-------+-----+
|StudentID|FirstName|LastName|Gender|Age|Subject|Marks|
+---------+---------+--------+------+---+-------+-----+
|        1|     John|   Smith|  Male| 18|   Math|   85|
|        2|     Emma| Johnson|Female| 19|Science|   92|
|        3|  Michael|   Davis|  Male| 20|English|   78|
|        4|   Sophia|   Brown|Female| 18|   Math|   89|
|        5|   Daniel|  Miller|  Male| 19|Science|   75|
+---------+---------+--------+------+---+-------+-----+



In [0]:
#Select Specific Columns
filtered_df = joined_df.select("StudentID", "FirstName", "LastName", "Subject", "Marks")
filtered_df.show()

+---------+---------+--------+-------+-----+
|StudentID|FirstName|LastName|Subject|Marks|
+---------+---------+--------+-------+-----+
|        1|     John|   Smith|   Math|   85|
|        2|     Emma| Johnson|Science|   92|
|        3|  Michael|   Davis|English|   78|
|        4|   Sophia|   Brown|   Math|   89|
|        5|   Daniel|  Miller|Science|   75|
+---------+---------+--------+-------+-----+



In [0]:
#Filter Rows Based on Conditions
filtered_df = joined_df.filter((joined_df["Marks"] > 80) & (joined_df["Subject"] == "Math"))
filtered_df.show()

+---------+---------+--------+------+---+-------+-----+
|StudentID|FirstName|LastName|Gender|Age|Subject|Marks|
+---------+---------+--------+------+---+-------+-----+
|        1|     John|   Smith|  Male| 18|   Math|   85|
|        4|   Sophia|   Brown|Female| 18|   Math|   89|
+---------+---------+--------+------+---+-------+-----+



In [0]:
# Calculate the average marks for each student
from pyspark.sql.functions import col, avg  # Import avg function
average_marks_df = joined_df.groupBy("StudentID", "FirstName", "LastName", "Gender", "Age") \
                            .agg(avg("Marks").alias("AverageMarks"))

# Show the result
average_marks_df.show()

+---------+---------+--------+------+---+------------+
|StudentID|FirstName|LastName|Gender|Age|AverageMarks|
+---------+---------+--------+------+---+------------+
|        1|     John|   Smith|  Male| 18|        85.0|
|        2|     Emma| Johnson|Female| 19|        92.0|
|        3|  Michael|   Davis|  Male| 20|        78.0|
|        4|   Sophia|   Brown|Female| 18|        89.0|
|        5|   Daniel|  Miller|  Male| 19|        75.0|
+---------+---------+--------+------+---+------------+



In [0]:
#Reading A text file in the dataframe
# Import libraries
from pyspark.sql import SparkSession

# Create SparkSession
spark = SparkSession.builder.appName("ReadCustsFile").getOrCreate()

# Define file path
file_path = "/FileStore/tables/custs.txt"

# Read the file as a DataFrame
custs_df = spark.read.csv(file_path, header=True)

# Show the first 5 rows
custs_df.show(5)

+---+--------+-------+---+--------------------+
|  1|Kristina|  Chung| 55|               Pilot|
+---+--------+-------+---+--------------------+
|  2|   Paige|   Chen| 74|             Teacher|
|  3|  Sherri| Melton| 34|         Firefighter|
|  4|Gretchen|   Hill| 66|Computer hardware...|
|  5|   Karen|Puckett| 74|              Lawyer|
|  6| Patrick|   Song| 42|        Veterinarian|
+---+--------+-------+---+--------------------+
only showing top 5 rows

