In [1]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.appName("DataFrame Example").getOrCreate()

# Create a DataFrame from a list of tuples
data = [("Alice", 25), ("Bob", 30), ("Charlie", 35)]
columns = ["Name", "Age"]
df = spark.createDataFrame(data, columns)

# Show the DataFrame
df.show()

+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 35|
+-------+---+



In [3]:
# Read a CSV file into a DataFrame
df1 = spark.read.csv("Employee.csv", header=True, inferSchema=True)

# Show the DataFrame
df1.show()

+---+-------+----------+------+-----+
| id|   name|department|salary|bonus|
+---+-------+----------+------+-----+
|  1|  Alice|     Sales|  5000|  200|
|  2|    Bob|   Finance|  6000|  300|
|  3|Charlie|     Sales|  5500|  250|
|  4|  David|   Finance|  7000|  400|
|  5|    Eva|        HR|  4500|  150|
|  6|  Frank|        HR|  4800| NULL|
|  7|  Grace|     Sales|  5200|  220|
|  8|   Hank|   Finance|  6500|  350|
+---+-------+----------+------+-----+



In [4]:
# Select the "Name" column
df.select("Name").show()

+-------+
|   Name|
+-------+
|  Alice|
|    Bob|
|Charlie|
+-------+



In [5]:
# Filter rows where Age > 30
df.filter(df.Age > 30).show()

+-------+---+
|   Name|Age|
+-------+---+
|Charlie| 35|
+-------+---+



In [6]:
from pyspark.sql.functions import sum, avg

# Group by "Name" and calculate the sum of "Age"
df.groupBy("Name").agg(sum("Age").alias("Total Age")).show()

+-------+---------+
|   Name|Total Age|
+-------+---------+
|  Alice|       25|
|    Bob|       30|
|Charlie|       35|
+-------+---------+



In [7]:
from pyspark.sql.functions import lit

# Add a new column "Country" with value "USA"
df = df.withColumn("Country", lit("USA"))
df.show()

+-------+---+-------+
|   Name|Age|Country|
+-------+---+-------+
|  Alice| 25|    USA|
|    Bob| 30|    USA|
|Charlie| 35|    USA|
+-------+---+-------+



In [9]:
# New row to insert
new_row = [("Charlie", 35, "USA")]

# Create a DataFrame for the new row
new_df = spark.createDataFrame(new_row, columns)

# Show the new row DataFrame
print("New Row DataFrame:")
new_df.show()

New Row DataFrame:
+-------+---+---+
|   Name|Age| _3|
+-------+---+---+
|Charlie| 35|USA|
+-------+---+---+



In [10]:
combined_df = df.union(new_df)
combined_df.show()

+-------+---+-------+
|   Name|Age|Country|
+-------+---+-------+
|  Alice| 25|    USA|
|    Bob| 30|    USA|
|Charlie| 35|    USA|
|Charlie| 35|    USA|
+-------+---+-------+



In [11]:
# Drop duplicate rows
combined_df.dropDuplicates().show()

+-------+---+-------+
|   Name|Age|Country|
+-------+---+-------+
|  Alice| 25|    USA|
|    Bob| 30|    USA|
|Charlie| 35|    USA|
+-------+---+-------+



In [12]:
df = df.withColumnRenamed("Age", "Years")
df.show()

+-------+-----+-------+
|   Name|Years|Country|
+-------+-----+-------+
|  Alice|   25|    USA|
|    Bob|   30|    USA|
|Charlie|   35|    USA|
+-------+-----+-------+



In [None]:
# Create another DataFrame with additional information
data3 = [("Alice", "Sales"), ("Bob", "Finance"), ("Charlie", "HR")]
columns2 = ["Name", "Department"]
df3 = spark.createDataFrame(data3, columns2)

# Join the DataFrames on the "Name" column
df_join = df.join(df3, on="Name", how="inner")
df_join.show()

+-------+-----+-------+----------+
|   Name|Years|Country|Department|
+-------+-----+-------+----------+
|  Alice|   25|    USA|     Sales|
|    Bob|   30|    USA|   Finance|
|Charlie|   35|    USA|        HR|
+-------+-----+-------+----------+



: 