In [1]:
import pyspark
import os
import sys
from pyspark import SparkContext
from pyspark import SparkConf
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [9]:
#1  WORD COUNT

sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

# Load the text file
lines = sc.textFile("text.txt")

counts = lines.flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda x, y: x + y)

output = counts.collect()

for (word, count) in output:
    print("%s: %i" % (word, count))

# Stop the SparkContext
sc.stop()

to: 2
be: 2
or: 1
not: 1
me: 1


In [10]:
#2 Filter and With column

from pyspark.sql import SparkSession
# Create a SparkSession
spark = SparkSession.builder.getOrCreate()
# Create a DataFrame
df = spark.createDataFrame([
(1, "Alice"),
(2, "Bob"),
(3, "Carol"),
(4, "Dave"),
(5, "Eve")
], ["id", "name"])
print(df.count())
df.show()
#df.write.format("csv").mode('overwrite').save('output')
# Filter the DataFrame to only include rows where the name starts with "A"
df = df.filter(df["name"].startswith("A"))
# Add a new column to the DataFrame called "age"
df = df.withColumn("age", df["id"] * 10)
# Print the DataFrame
df.show()

                                                                                

5
+---+-----+
| id| name|
+---+-----+
|  1|Alice|
|  2|  Bob|
|  3|Carol|
|  4| Dave|
|  5|  Eve|
+---+-----+

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 10|
+---+-----+---+



In [11]:
#3  Count and Show

spark = SparkSession.builder.appName("count_show").getOrCreate()

# Sample data for DataFrame
data = [("John", 28), ("Alice", 22), ("Bob", 32), ("Eve", 25)]

# Define the schema for the DataFrame
schema = ["Name", "Age"]

# Create a DataFrame
df = spark.createDataFrame(data, schema=schema)

# Show the DataFrame
print("Original DataFrame:")
df.show()

# Count the number of rows in the DataFrame
row_count = df.count()
print(f"Number of rows: {row_count}")

# Perform some other actions on the DataFrame as needed

# Stop the Spark session
spark.stop()

24/01/12 14:26:55 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Original DataFrame:
+-----+---+
| Name|Age|
+-----+---+
| John| 28|
|Alice| 22|
|  Bob| 32|
|  Eve| 25|
+-----+---+

Number of rows: 4


In [19]:
#4  Aggregations

from pyspark.sql.functions import col, sum, avg

spark = SparkSession.builder.appName("aggregations_example").getOrCreate()

# Sample data for DataFrame
data = [("John", "Sales", 1000),
        ("Alice", "Marketing", 1500),
        ("Bob", "Sales", 1200),
        ("Eve", "Marketing", 800),
        ("Charlie", "Sales", 2000)]

# Define the schema for the DataFrame
schema = ["Name", "Department", "Salary"]

# Create a DataFrame
df = spark.createDataFrame(data, schema=schema)

# Show the original DataFrame
print("Original DataFrame:")
df.show()

# Perform basic aggregations
total_salary = df.groupBy("Department") \
                 .agg(sum("Salary").alias("TotalSalary"), avg("Salary").alias("AverageSalary"))

# Show the aggregated results
print("Aggregated DataFrame:")
total_salary.show()

# Stop the Spark session
spark.stop()

Original DataFrame:
+-------+----------+------+
|   Name|Department|Salary|
+-------+----------+------+
|   John|     Sales|  1000|
|  Alice| Marketing|  1500|
|    Bob|     Sales|  1200|
|    Eve| Marketing|   800|
|Charlie|     Sales|  2000|
+-------+----------+------+

Aggregated DataFrame:
+----------+-----------+-------------+
|Department|TotalSalary|AverageSalary|
+----------+-----------+-------------+
|     Sales|       4200|       1400.0|
| Marketing|       2300|       1150.0|
+----------+-----------+-------------+



In [21]:
#5 CSV

spark = SparkSession.builder.appName("write_to_csv_example").getOrCreate()

# Sample data for DataFrame
data = [("John", "Sales", 1000),
        ("Alice", "Marketing", 1500),
        ("Bob", "Sales", 1200),
        ("Eve", "Marketing", 800),
        ("Charlie", "Sales", 2000)]

# Define the schema for the DataFrame
schema = ["Name", "Department", "Salary"]

# Create a DataFrame
df = spark.createDataFrame(data, schema=schema)

# Show the original DataFrame
print("Original DataFrame:")
df.show()

# Define the output CSV file path
output_path = "/home/lplab/Desktop/15B BDAL/output.csv"

# Write the DataFrame to a CSV file
df.write.csv(output_path, header=True, mode="overwrite")

# Stop the Spark session
spark.stop()

Original DataFrame:
+-------+----------+------+
|   Name|Department|Salary|
+-------+----------+------+
|   John|     Sales|  1000|
|  Alice| Marketing|  1500|
|    Bob|     Sales|  1200|
|    Eve| Marketing|   800|
|Charlie|     Sales|  2000|
+-------+----------+------+

