# **Using PySpark in Google Colab**

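This notebook installs Java and PySpark in Google Colab, starts a Spark session, builds a small sample DataFrame, and runs a few basic DataFrame operations on it.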

In [None]:
# Step 1: Install Java and PySpark
# Spark runs on the JVM, so a JDK has to be present before a session can start.
!apt-get install -y -qq openjdk-11-jdk
# Pin PySpark to a 3.5 release: PySpark 4.x requires Java 17 or newer, so an
# unpinned install would pull a version that cannot start on the JDK 11 above.
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install -q pyspark==3.5.1

In [None]:
# Step 2: Import PySpark and Create a Spark Session
from pyspark.sql import SparkSession

# Configure a builder with this notebook's application name, then obtain the
# session from it (getOrCreate reuses an existing session if there is one).
session_builder = SparkSession.builder.appName("ColabSpark")
spark = session_builder.getOrCreate()

# Report the Spark release the session is running on
print("Apache Spark version:", spark.version)

In [None]:
# Step 3: Create a Sample DataFrame
from pyspark.sql import Row

# Expand (id, name, age) tuples into Row objects with named fields
data = [
    Row(id=person_id, name=person_name, age=person_age)
    for person_id, person_name, person_age in (
        (1, "Alice", 25),
        (2, "Bob", 30),
        (3, "Charlie", 35),
    )
]

# Hand the rows to the active session to obtain a Spark DataFrame
df = spark.createDataFrame(data)

# Print the DataFrame contents as a text table
df.show()

In [None]:
# Step 4: Perform Basic Data Operations

# Display the column names and types of the DataFrame
df.printSchema()

# Project only the name and age columns
names_and_ages = df.select("name", "age")
names_and_ages.show()

# Keep only the rows whose age is strictly greater than 30
older_than_30 = df.where(df["age"] > 30)
older_than_30.show()

# Count how many rows share each distinct age value
rows_per_age = df.groupBy("age").count()
rows_per_age.show()

In [None]:
# Step 5: Stop the Spark Session
# Run this cell last: it shuts down the session bound to `spark`, so any cell
# that uses `spark` or `df` has to be executed before it.
spark.stop()