In [None]:
# Import SparkSession from pyspark.sql
from pyspark.sql import SparkSession

# Configure a builder for an application named "my_spark", then obtain the
# session from it (getOrCreate reuses an already-running session if there is one)
session_builder = SparkSession.builder.appName("my_spark")
my_spark = session_builder.getOrCreate()

# Show the session object
print(my_spark)

<pyspark.sql.session.SparkSession object at 0x7a3582e1e1d0>


In [None]:
# Load the census extract: the first line holds the column names and the
# column types are inferred from the data
census_reader = (
    my_spark.read
    .option("header", True)
    .option("inferSchema", True)
)
census_adult = census_reader.csv("adult_reduced.csv")

# Preview the first rows of the DataFrame
census_adult.show()

+---+------------+------+-------------+--------------------+------+
|age|   education|   sex| relationship|      marital-status|income|
+---+------------+------+-------------+--------------------+------+
| 39|   Bachelors|  Male|Not-in-family|       Never-married| <=50K|
| 50|   Bachelors|  Male|      Husband|  Married-civ-spouse| <=50K|
| 38|     HS-grad|  Male|Not-in-family|            Divorced| <=50K|
| 53|        11th|  Male|      Husband|  Married-civ-spouse| <=50K|
| 28|   Bachelors|Female|         Wife|  Married-civ-spouse| <=50K|
| 37|     Masters|Female|         Wife|  Married-civ-spouse| <=50K|
| 49|         9th|Female|Not-in-family|Married-spouse-ab...| <=50K|
| 52|     HS-grad|  Male|      Husband|  Married-civ-spouse|  >50K|
| 31|     Masters|Female|Not-in-family|       Never-married|  >50K|
| 42|   Bachelors|  Male|      Husband|  Married-civ-spouse|  >50K|
| 37|Some-college|  Male|      Husband|  Married-civ-spouse|  >50K|
| 30|   Bachelors|  Male|      Husband|  Married

In [None]:
# Print the schema of census_adult as a tree: one line per column with its
# name, inferred type and nullability
census_adult.printSchema()

root
 |-- age: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- income: string (nullable = true)



# Basic analytics on PySpark DataFrames

In [None]:
# .count() is an action: it returns the total number of rows in the DataFrame
row_count = census_adult.count()
print(f"Number of rows: {row_count}")

Number of rows: 32561


In [None]:
# groupBy() followed by agg() allows SQL-like aggregations:
# here, the average age within each value of the "sex" column
avg_age_by_sex = census_adult.groupBy("sex").agg({"age": "avg"})
avg_age_by_sex.show()

+------+-----------------+
|   sex|         avg(age)|
+------+-----------------+
|Female|36.85823043357163|
|  Male|39.43354749885268|
+------+-----------------+



In [None]:
# Narrow the DataFrame down: keep only the rows with age above 50,
# then keep only the age and education columns
is_over_fifty = census_adult["age"] > 50
filtered_census_adult = (
    census_adult
    .where(is_over_fifty)
    .select("age", "education")
)
filtered_census_adult.show()

+---+------------+
|age|   education|
+---+------------+
| 53|        11th|
| 52|     HS-grad|
| 54|     HS-grad|
| 59|     HS-grad|
| 56|   Bachelors|
| 54|Some-college|
| 53|   Bachelors|
| 57|   Bachelors|
| 53|     HS-grad|
| 53|     HS-grad|
| 79|Some-college|
| 67|        10th|
| 52|   Bachelors|
| 59|     HS-grad|
| 53|     HS-grad|
| 57|   Assoc-voc|
| 76|     Masters|
| 56|     HS-grad|
| 53|         9th|
| 56|Some-college|
+---+------------+
only showing top 20 rows



In [None]:
# Load the salaries file: the first line holds the column names and the
# column types are inferred from the data
salaries_reader = (
    my_spark.read
    .option("header", True)
    .option("inferSchema", True)
)
salaries_df = salaries_reader.csv("Salaries.csv")

# Count the total number of rows
# NOTE: this rebinds row_count, which previously held the census row count
row_count = salaries_df.count()
print(f"Total rows: {row_count}")

# Print the schema, then preview the first rows
salaries_df.printSchema()
salaries_df.show()

# Group by academic rank and compute the average salary within each rank
avg_salary_by_rank = salaries_df.groupBy("rank").agg({"salary": "avg"})
avg_salary_by_rank.show()

Total rows: 397
root
 |-- rank: string (nullable = true)
 |-- discipline: string (nullable = true)
 |-- yrs.since.phd: integer (nullable = true)
 |-- yrs.service: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- salary: integer (nullable = true)

+---------+----------+-------------+-----------+------+------+
|     rank|discipline|yrs.since.phd|yrs.service|   sex|salary|
+---------+----------+-------------+-----------+------+------+
|     Prof|         B|           19|         18|  Male|139750|
|     Prof|         B|           20|         16|  Male|173200|
| AsstProf|         B|            4|          3|  Male| 79750|
|     Prof|         B|           45|         39|  Male|115000|
|     Prof|         B|           40|         41|  Male|141500|
|AssocProf|         B|            6|          6|  Male| 97000|
|     Prof|         B|           30|         23|  Male|175000|
|     Prof|         B|           45|         45|  Male|147765|
|     Prof|         B|           21|       

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Explicit schema for the census file: two integer columns followed by three
# string columns, listed as (column name, Spark type) pairs
census_columns = [
    ("age", IntegerType),
    ("education_num", IntegerType),
    ("marital_status", StringType),
    ("occupation", StringType),
    ("income", StringType),
]
schema = StructType(
    [StructField(column_name, spark_type()) for column_name, spark_type in census_columns]
)

# Read in the CSV, using the schema defined above
# census_adult = my_spark.read.csv("adult_reduced_100.csv", sep=',', header=False, schema=schema)

# Print out the schema
# census_adult.printSchema()

In [None]:
# `col` is needed below and is not imported by any earlier cell
from pyspark.sql.functions import col

# NOTE(review): `df` is not created by any earlier visible cell; bind it to a
# DataFrame (one that has an "age" column) before running this cell.

# Drop rows with any nulls.
# Stored under its own name so the next assignment does not discard the result.
df_no_nulls = df.na.drop()

# Keep only the rows where one particular column is not null.
# NOTE(review): "columnName" looks like a placeholder - substitute a column
# that actually exists in df.
df_cleaned = df.where(col("columnName").isNotNull())

# Fill nulls in the age column with the value 0
df_filled = df.na.fill({"age": 0})


In [None]:
# Three column edits applied as one chain:
#   1. add "age_plus_5", computed as the "age" column plus 5
#   2. rename the "age" column to "year"
#   3. drop the "department" column
# NOTE(review): df is rebound to the result, so running this cell a second time
# would look up an "age" column that has already been renamed.
df = (
    df.withColumn("age_plus_5", df["age"] + 5)
    .withColumnRenamed("age", "year")
    .drop("department")
)


In [None]:
# Examine both inputs: print a label, then the schema and the first rows
for frame_label, frame in (("airports", airports), ("flights", flights)):
    print(frame_label)
    frame.printSchema()
    frame.show()

# Rename the "faa" column of airports to "dest" so that both DataFrames
# expose the join column under the same name
airports = airports.withColumnRenamed("faa", "dest")

# Left outer join: every flight is kept, with airport columns attached
# wherever the "dest" values match
flights_with_airports = flights.join(airports, on="dest", how='leftouter')

# Examine the joined DataFrame
flights_with_airports.show()

In [None]:
# `udf` is needed below and is not imported by any earlier cell
from pyspark.sql.functions import udf

# NOTE(review): the function `age_category` and the DataFrame `age_category_df`
# are not defined in any visible cell; both must exist before this cell runs.

# Register the function age_category as a UDF that returns a string
age_category_udf = udf(age_category, StringType())

# Apply the UDF to the "age" column and store the result in a new "category" column
age_category_df_2 = age_category_df.withColumn(
    "category", age_category_udf(age_category_df["age"])
)

# Show the resulting DataFrame
age_category_df_2.show()