# Getting and Knowing your Data

This time we are going to pull data directly from the internet.
Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [None]:
from pyspark.sql import SparkSession

# Create the Spark session used by every later cell, or reuse one that is
# already running in this kernel.
spark = (
    SparkSession.builder
    .appName("Occupation Exercise")
    .getOrCreate()
)

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user).

In [None]:
!wget https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user -O u.user

--2025-06-20 08:41:39--  https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 22667 (22K) [text/plain]
Saving to: ‘u.user’


2025-06-20 08:41:39 (15.4 MB/s) - ‘u.user’ saved [22667/22667]



### Step 3. Assign it to a variable called users and use the 'user_id' as index

In [None]:
# Load the downloaded file: fields are separated by "|", the first line holds
# the column names, and Spark infers each column's type from the data.
users = (
    spark.read
    .format("csv")
    .options(delimiter="|", header="true", inferSchema="true")
    .load("u.user")
)

### Step 4. See the first 25 entries

In [None]:
# Print the first 25 rows as a text table.
users.show(n=25)

+-------+---+------+-------------+--------+
|user_id|age|gender|   occupation|zip_code|
+-------+---+------+-------------+--------+
|      1| 24|     M|   technician|   85711|
|      2| 53|     F|        other|   94043|
|      3| 23|     M|       writer|   32067|
|      4| 24|     M|   technician|   43537|
|      5| 33|     F|        other|   15213|
|      6| 42|     M|    executive|   98101|
|      7| 57|     M|administrator|   91344|
|      8| 36|     M|administrator|   05201|
|      9| 29|     M|      student|   01002|
|     10| 53|     M|       lawyer|   90703|
|     11| 39|     F|        other|   30329|
|     12| 28|     F|        other|   06405|
|     13| 47|     M|     educator|   29206|
|     14| 45|     M|    scientist|   55106|
|     15| 49|     F|     educator|   97301|
|     16| 21|     M|entertainment|   10309|
|     17| 30|     M|   programmer|   06355|
|     18| 35|     F|        other|   37212|
|     19| 40|     M|    librarian|   02138|
|     20| 42|     F|    homemake

### Step 5. See the last 10 entries

In [None]:
# tail() returns the last rows as a list of Row objects rather than printing
# a table, so the cell output is that list.
users.tail(num=10)

[Row(user_id=934, age=61, gender='M', occupation='engineer', zip_code='22902'),
 Row(user_id=935, age=42, gender='M', occupation='doctor', zip_code='66221'),
 Row(user_id=936, age=24, gender='M', occupation='other', zip_code='32789'),
 Row(user_id=937, age=48, gender='M', occupation='educator', zip_code='98072'),
 Row(user_id=938, age=38, gender='F', occupation='technician', zip_code='55038'),
 Row(user_id=939, age=26, gender='F', occupation='student', zip_code='33319'),
 Row(user_id=940, age=32, gender='M', occupation='administrator', zip_code='02215'),
 Row(user_id=941, age=20, gender='M', occupation='student', zip_code='97229'),
 Row(user_id=942, age=48, gender='F', occupation='librarian', zip_code='78209'),
 Row(user_id=943, age=22, gender='M', occupation='student', zip_code='77841')]

### Step 6. What is the number of observations in the dataset?

In [None]:
# Number of observations = number of rows in the DataFrame.
row_total = users.count()
row_total

943

### Step 7. What is the number of columns in the dataset?

In [None]:
# `columns` is a plain Python list of names, so its length is the column count.
column_total = len(users.columns)
print(column_total)

5


### Step 8. Print the name of all the columns.

In [None]:
# Column names, in the order they appear in the DataFrame.
column_names = users.columns
column_names

['user_id', 'age', 'gender', 'occupation', 'zip_code']

### Step 9. How is the dataset indexed?

In [None]:
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window  # needed for Window.orderBy below

# Spark DataFrames carry no built-in index, so build an explicit 1-based one:
# number the rows in ascending `user_id` order.
# NOTE: a window with no partitionBy makes Spark move every row into a single
# partition. That is fine for this small file but does not scale to large data.
user_id_order = Window.orderBy("user_id")
indexed_users = users.withColumn("row_index", row_number().over(user_id_order))
indexed_users.show()

+-------+---+------+-------------+--------+---------+
|user_id|age|gender|   occupation|zip_code|row_index|
+-------+---+------+-------------+--------+---------+
|      1| 24|     M|   technician|   85711|        1|
|      2| 53|     F|        other|   94043|        2|
|      3| 23|     M|       writer|   32067|        3|
|      4| 24|     M|   technician|   43537|        4|
|      5| 33|     F|        other|   15213|        5|
|      6| 42|     M|    executive|   98101|        6|
|      7| 57|     M|administrator|   91344|        7|
|      8| 36|     M|administrator|   05201|        8|
|      9| 29|     M|      student|   01002|        9|
|     10| 53|     M|       lawyer|   90703|       10|
|     11| 39|     F|        other|   30329|       11|
|     12| 28|     F|        other|   06405|       12|
|     13| 47|     M|     educator|   29206|       13|
|     14| 45|     M|    scientist|   55106|       14|
|     15| 49|     F|     educator|   97301|       15|
|     16| 21|     M|entertai

### Step 10. What is the data type of each column?

In [None]:
# (column name, type name) pairs describing the DataFrame's schema.
column_types = users.dtypes
column_types

[('user_id', 'int'),
 ('age', 'int'),
 ('gender', 'string'),
 ('occupation', 'string'),
 ('zip_code', 'string')]

### Step 11. Print only the occupation column

In [None]:
# Project only the `occupation` column; show() prints the first 20 rows by default.
# (The DataFrame is named `users`; the former `user` was undefined and raised NameError.)
users.select("occupation").show()

+-------------+
|   occupation|
+-------------+
|   technician|
|        other|
|       writer|
|   technician|
|        other|
|    executive|
|administrator|
|administrator|
|      student|
|       lawyer|
|        other|
|        other|
|     educator|
|    scientist|
|     educator|
|entertainment|
|   programmer|
|        other|
|    librarian|
|    homemaker|
+-------------+
only showing top 20 rows



### Step 12. How many different occupations are in this dataset?

In [None]:
# Reduce the occupation column to its unique values, then count them.
unique_occupations = users.select("occupation").distinct()
unique_occupations.count()

21

### Step 13. What is the most frequent occupation?

In [None]:
from pyspark.sql.functions import count, desc, col

# Count users per occupation.
occ_users = (
    users
    .groupBy("occupation")
    .agg(count("*").alias("Users_per_occ"))
)

# Sort by that count, largest first, and keep only the top row.
most_freq_occ = occ_users.orderBy(desc("Users_per_occ")).first()
print(most_freq_occ)

Row(occupation='student', Users_per_occ=196)


### Step 14. Summarize the DataFrame.

In [None]:
# describe() gives count, mean, stddev, min and max for every column.
users_description = users.describe()
users_description.show()

+-------+-----------------+-----------------+------+-------------+------------------+
|summary|          user_id|              age|gender|   occupation|          zip_code|
+-------+-----------------+-----------------+------+-------------+------------------+
|  count|              943|              943|   943|          943|               943|
|   mean|            472.0|34.05196182396607|  NULL|         NULL| 50868.78810810811|
| stddev|272.3649512449549|12.19273973305903|  NULL|         NULL|30891.373254138176|
|    min|                1|                7|     F|administrator|             00000|
|    max|              943|               73|     M|       writer|             Y1A6B|
+-------+-----------------+-----------------+------+-------------+------------------+



### Step 15. Summarize all the columns

In [None]:
# summary() reports the same statistics as describe() plus the 25%, 50% and
# 75% percentiles.
users_summary = users.summary()
users_summary.show()

+-------+-----------------+-----------------+------+-------------+------------------+
|summary|          user_id|              age|gender|   occupation|          zip_code|
+-------+-----------------+-----------------+------+-------------+------------------+
|  count|              943|              943|   943|          943|               943|
|   mean|            472.0|34.05196182396607|  NULL|         NULL| 50868.78810810811|
| stddev|272.3649512449549|12.19273973305903|  NULL|         NULL|30891.373254138176|
|    min|                1|                7|     F|administrator|             00000|
|    25%|              236|               25|  NULL|         NULL|           21227.0|
|    50%|              472|               31|  NULL|         NULL|           53711.0|
|    75%|              708|               43|  NULL|         NULL|           78741.0|
|    max|              943|               73|     M|       writer|             Y1A6B|
+-------+-----------------+-----------------+------+--

### Step 16. Summarize only the occupation column

In [None]:
# Restrict to the occupation column before summarising it.
occupation_only = users.select("occupation")
occupation_only.summary().show()

+-------+-------------+
|summary|   occupation|
+-------+-------------+
|  count|          943|
|   mean|         NULL|
| stddev|         NULL|
|    min|administrator|
|    25%|         NULL|
|    50%|         NULL|
|    75%|         NULL|
|    max|       writer|
+-------+-------------+



### Step 17. What is the mean age of users?

In [None]:
from pyspark.sql.functions import mean

# Aggregate over the whole DataFrame (no grouping) to get one mean-age value.
mean_age_column = mean("age").alias("Mean_users_age")
users.agg(mean_age_column).show()

+-----------------+
|   Mean_users_age|
+-----------------+
|34.05196182396607|
+-----------------+



### Step 18. What is the age with least occurrence?

In [None]:
# Count how many users have each age. `count` and `col` come from the
# pyspark.sql.functions import in the Step 13 cell.
count_ages = users.groupBy("age").agg(count("age").alias("Count_age"))

# Sort by frequency, rarest first. Several ages can share the lowest count, and
# ordering by the count alone leaves such ties in arbitrary order, so `age` is
# added as a secondary key to make the returned row reproducible.
count_ages.orderBy(col("Count_age").asc(), col("age").asc()).first()

Row(age=7, Count_age=1)