# Prerequisites

* Load users_001.csv into DataFrame



In [0]:
users_df = spark.read.csv(
    path="/FileStore/tables/users_001.csv", header=True, quote="'",inferSchema=True
)
users_df.limit(10).display()

id,name,dob,email,gender,country,region,city,asset,marital_status
1,Heather Gibbs,2024-10-31,heathergibbs6243@gmail.com,Female,United States,Virginia,Virginia Beach,734388,Married
2,Herrod Petersen,2024-02-19,herrodpetersen@yahoomail.com,Male,United States,Arizona,Phoenix,113506,Single
3,Ocean Workman,2024-10-10,oceanworkman2328@ymail.com,Male,United States,Tennessee,Clarksville,139985,Married
4,Xaviera Maxwell,2025-03-09,xavieramaxwell@gmail.com,Transgender,United States,Ohio,Cleveland,511409,married
5,Bo Underwood,2024-06-30,bounderwood@ymail.com,Male,India,Tamil Nadu,Madurai,366783,Married
6,Graiden Mcleod,2023-11-09,graidenmcleod1069@ymail.com,Male,United States,Oregon,Salem,638977,Divorced
7,Chantale Nixon,2023-09-06,chantalenixon404@ymail.com,Female,India,Andaman and Nicobar Islands,Port Blair,717994,Married
8,Ashton Willis,2025-02-22,ashtonwillis@gmail.com,Male,India,Dadra and Nagar Haveli,Silvassa,483841,Married
9,Mercedes Lawrence,2024-08-15,mercedeslawrence4115@ymail.com,Male,India,Punjab,Gujranwala,598378,Single
10,Norman Patton,2024-02-25,normanpatton6042@yahoomail.com,Transgender,India,Chhattisgarh,Durg,412938,Married


# Basic Operations

1. Display the total count of records excluding headers
2. Print the column names
3. Display top n records without truncating the data
4. Select 'full_name,email" columns

In [0]:
users_df.count()

Out[3]: 500

In [0]:
L = [1,2,3]
print(L)

[1, 2, 3]


## Print Column names

In [0]:
for column in users_df.columns:
  print(column)

id
name
dob
email
gender
country
region
city
asset
marital_status


## Select columns

In [0]:
from pyspark.sql.functions import col

users_df.select(col("name").alias("Full Name"), col("email")).limit(3).display()

Full Name,email
Heather Gibbs,heathergibbs6243@gmail.com
Herrod Petersen,herrodpetersen@yahoomail.com
Ocean Workman,oceanworkman2328@ymail.com


# Filter Operations

* Filter Records matching the country 'India'
* Fetch all names who belong to country 'India' and whose maritial_status is "Single"

In [0]:
filtered_records = users_df.filter(col("country") == "India")
print(f"Records beloing to India: {filtered_records.count()}")
filtered_records.limit(5).display()

Records beloing to India: 247


id,name,dob,email,gender,country,region,city,asset,marital_status
5,Bo Underwood,2024-06-30,bounderwood@ymail.com,Male,India,Tamil Nadu,Madurai,366783,Married
7,Chantale Nixon,2023-09-06,chantalenixon404@ymail.com,Female,India,Andaman and Nicobar Islands,Port Blair,717994,Married
8,Ashton Willis,2025-02-22,ashtonwillis@gmail.com,Male,India,Dadra and Nagar Haveli,Silvassa,483841,Married
9,Mercedes Lawrence,2024-08-15,mercedeslawrence4115@ymail.com,Male,India,Punjab,Gujranwala,598378,Single
10,Norman Patton,2024-02-25,normanpatton6042@yahoomail.com,Transgender,India,Chhattisgarh,Durg,412938,Married


In [0]:
# Fetch all names who belong to country 'India' and whose maritial_status is "Single"

from pyspark.sql.functions import lower, trim

df_01 = users_df.filter(trim(lower(col("marital_status"))) == "married").select(
    col("id"), col("name").alias("Full Name"), col("marital_status")
)
print(f"Count: {df_01.count()}")
df_01.limit(15).display()

Count: 123


id,Full Name,marital_status
1,Heather Gibbs,Married
3,Ocean Workman,Married
4,Xaviera Maxwell,married
5,Bo Underwood,Married
7,Chantale Nixon,Married
8,Ashton Willis,Married
10,Norman Patton,Married
12,Jana Barr,Married
17,Dominique Horton,Married
20,Dean Beard,Married


In [0]:
dbutils.fs.rm("/FileStore/tables/users_001.csv")

Out[52]: True