In [None]:
pip install pyspark

In [None]:
from pyspark import SparkContext

# Create a SparkContext running locally for the RDD examples.
sc = SparkContext("local", "RDDOperationsExample")

# All work happens inside try/finally so the context is stopped even when a
# step fails. Without this, a failed run leaves the context alive and
# re-running the cell errors out because only one SparkContext may be active.
try:
    # Create an RDD from a local data source
    data = [1, 2, 3, 4, 5]
    rdd = sc.parallelize(data)

    # --- Transformations (build new RDDs from `rdd`) ---

    # Map transformation: square each element
    squared_rdd = rdd.map(lambda x: x * x)

    # Filter transformation: keep only the even numbers
    even_rdd = rdd.filter(lambda x: x % 2 == 0)

    # --- Actions (return values to the driver) ---

    # Reduce action: sum of all elements
    sum_of_elements = rdd.reduce(lambda x, y: x + y)

    # Aggregate action: (sum, count) in a single pass.
    #   zero value  : (0, 0) -> running (sum, count)
    #   1st function: folds one element into an accumulator
    #   2nd function: merges two accumulators
    sum_count = rdd.aggregate(
        (0, 0),
        lambda acc, value: (acc[0] + value, acc[1] + 1),
        lambda acc1, acc2: (acc1[0] + acc2[0], acc1[1] + acc2[1]),
    )

    # Print the results
    print("Original RDD:")
    for element in rdd.collect():
        print(element)

    print("Squared RDD:")
    for element in squared_rdd.collect():
        print(element)

    print("Even RDD:")
    for element in even_rdd.collect():
        print(element)

    print("Sum of Elements:", sum_of_elements)
    print("Sum and Count:", sum_count)
finally:
    # Stop the SparkContext, on success and on failure alike
    sc.stop()


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Create (or reuse) a SparkSession for the DataFrame examples.
spark = SparkSession.builder.appName("DataFrameOperations").getOrCreate()

# The session is stopped in `finally` so a missing file or a failing query
# does not leave it running behind the notebook.
try:
    # Load the CSV file into a DataFrame.
    # NOTE: placeholder path - replace with a real file before running.
    # The operations below assume the file has `age`, `gender` and `id`
    # columns - TODO confirm against the actual data.
    csv_file_path = "path_to_your_csv_file.csv"
    df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

    # Filtering: rows whose `age` is greater than 30
    filtered_df = df.filter(col("age") > 30)

    # Grouping: number of rows per `gender`
    grouped_df = df.groupBy("gender").count()

    # Joining: second CSV (placeholder path) joined on the shared `id` column
    other_data_path = "path_to_other_data_file.csv"
    other_data_df = spark.read.csv(other_data_path, header=True, inferSchema=True)
    joined_df = df.join(other_data_df, "id")

    # Display the results
    print("Filtered DataFrame:")
    filtered_df.show()

    print("Grouped DataFrame:")
    grouped_df.show()

    print("Joined DataFrame:")
    joined_df.show()

    # Register the DataFrame as a temporary view so it can be queried with SQL
    df.createOrReplaceTempView("my_table")

    # Apply a Spark SQL query: average age per gender
    sql_query = "SELECT gender, AVG(age) AS average_age FROM my_table GROUP BY gender"
    result_df = spark.sql(sql_query)

    print("Result of Spark SQL Query:")
    result_df.show()
finally:
    # Stop the SparkSession, on success and on failure alike
    spark.stop()


In [None]:
from pyspark.streaming import StreamingContext
from pyspark import SparkContext

# Create a SparkContext with two local worker threads ("local[2]").
sc = SparkContext("local[2]", "SparkStreamingExample")

# Create a StreamingContext with a batch interval of 1 second
ssc = StreamingContext(sc, 1)

# awaitTermination() blocks until the stream is stopped or fails, so the
# usual way out of this cell is a kernel interrupt. The finally clause makes
# sure the streaming context AND the underlying SparkContext are shut down in
# that case instead of being left running for later cells.
try:
    # Configure the StreamingContext to consume text lines from a socket.
    # Something must be listening on this host/port (e.g. `nc -lk 9999`).
    hostname = "localhost"
    port = 9999
    lines = ssc.socketTextStream(hostname, port)

    # Split each line into words
    words = lines.flatMap(lambda line: line.split(" "))

    # Count the occurrences of each word within each batch
    word_counts = words.map(lambda word: (word, 1)).reduceByKey(lambda x, y: x + y)

    # Print the word counts of every batch
    word_counts.pprint()

    # Start the streaming computation
    ssc.start()

    # Block until the streaming computation terminates
    ssc.awaitTermination()
finally:
    # Stop streaming and release the SparkContext as well
    ssc.stop(stopSparkContext=True)


In [None]:
from pyspark.sql import SparkSession

# Needed only to read the database credentials from the environment.
import os

# Create (or reuse) a SparkSession
spark = SparkSession.builder \
    .appName("SparkSQLDataSources") \
    .getOrCreate()

# The session is stopped in `finally` so a failed connection or read does not
# leave it running behind the notebook.
try:
    # Connect Spark with a relational database.
    # Credentials come from the DB_USER / DB_PASSWORD environment variables so
    # real secrets never have to be written into the notebook; the original
    # placeholder values are used when the variables are not set.
    # NOTE(review): the PostgreSQL JDBC driver jar presumably has to be on
    # Spark's classpath for "org.postgresql.Driver" to load - confirm setup.
    database_url = "jdbc:postgresql://localhost:5432/mydatabase"
    database_properties = {
        "user": os.environ.get("DB_USER", "your_username"),
        "password": os.environ.get("DB_PASSWORD", "your_password"),
        "driver": "org.postgresql.Driver",
    }

    # Read the table over JDBC; the user/password/driver options are taken
    # from `database_properties` in one go.
    df = spark.read \
        .format("jdbc") \
        .option("url", database_url) \
        .option("dbtable", "your_table_name") \
        .options(**database_properties) \
        .load()

    # Perform SQL operations on the DataFrame through a temporary view.
    # Assumes the table has an `age` column - TODO confirm.
    df.createOrReplaceTempView("my_table")
    result = spark.sql("SELECT * FROM my_table WHERE age > 30")

    # Print the result
    result.show()

    # Explore integration capabilities with other data sources
    # (placeholder locations - replace before running).
    hdfs_file_path = "hdfs://localhost:9000/path/to/your/file.csv"
    s3_file_path = "s3a://your_bucket/path/to/your/file.csv"

    # Read data from HDFS
    hdfs_df = spark.read.csv(hdfs_file_path, header=True, inferSchema=True)

    # Read data from Amazon S3
    s3_df = spark.read.csv(s3_file_path, header=True, inferSchema=True)

    # Perform operations on the DataFrames
    # ...
finally:
    # Stop the SparkSession, on success and on failure alike
    spark.stop()
