In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
from pyspark.sql.functions import lit
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

# Obtain the SparkSession used by every cell below.
spark = SparkSession.builder.appName("ColumnsOperation").getOrCreate()

# Employee table schema, built one column at a time; every column is nullable.
schema = (
    StructType()
    .add("Firstname", StringType(), True)
    .add("Lastname", StringType(), True)
    .add("Gender", StringType(), True)
    .add("Age", IntegerType(), True)
    .add("Department", StringType(), True)
    .add("Salary", IntegerType(), True)
)

# Sample rows; values are listed in the same order as the schema columns.
data = [
    ("James", "Smith", "M", 30, "Sales", 3000),
    ("Anna", "Rose", "F", 41, "Engineering", 4000),
    ("Robert", "Williams", "M", 62, "Logistics", 5000),
]

# Build the DataFrame, then print its schema followed by its contents.
df = spark.createDataFrame(data, schema)
df.printSchema()
df.show()

root
 |-- Firstname: string (nullable = true)
 |-- Lastname: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: integer (nullable = true)

+---------+--------+------+---+-----------+------+
|Firstname|Lastname|Gender|Age| Department|Salary|
+---------+--------+------+---+-----------+------+
|    James|   Smith|     M| 30|      Sales|  3000|
|     Anna|    Rose|     F| 41|Engineering|  4000|
|   Robert|Williams|     M| 62|  Logistics|  5000|
+---------+--------+------+---+-----------+------+



In [2]:
# ===== Appending rows: union
# Rows to append, wrapped in a DataFrame that shares `schema` with `df`.
extra_rows = spark.createDataFrame(
    [
        ("Michael", "Brown", "M", 34, "Sales", 3500),
        ("Maria", "Garcia", "F", 20, "Engineering", 2000),
        ("Mary", "Smith", "F", 30, "Accounting", 2500),
    ],
    schema,
)

# union() returns a new DataFrame, so `df` is rebound to the combined result.
# NOTE: because `df` is built from itself, re-running this cell appends the
# same three rows again.
df = df.union(extra_rows)
df.show()

+---------+--------+------+---+-----------+------+
|Firstname|Lastname|Gender|Age| Department|Salary|
+---------+--------+------+---+-----------+------+
|    James|   Smith|     M| 30|      Sales|  3000|
|     Anna|    Rose|     F| 41|Engineering|  4000|
|   Robert|Williams|     M| 62|  Logistics|  5000|
|  Michael|   Brown|     M| 34|      Sales|  3500|
|    Maria|  Garcia|     F| 20|Engineering|  2000|
|     Mary|   Smith|     F| 30| Accounting|  2500|
+---------+--------+------+---+-----------+------+



In [3]:
# ===== Selecting rows by condition: filter / where
# === filter
# Banner for the filter examples (previously mislabelled as "sort").
print("===== filter")
# Select rows using a Column expression
df_filter = df.filter(df.Age > 30)
df_filter.show()
# Select rows using a SQL-style condition string
df_filter = df.filter("Age > 30")
df_filter.show()

# === where (where is an alias of filter)
print("===== where")
# where with a Column expression
df_where = df.where(df.Age > 30)
df_where.show()

# where with a SQL-style condition string
df_where = df.where("Age > 30")
df_where.show()

===== sort
+---------+--------+------+---+-----------+------+
|Firstname|Lastname|Gender|Age| Department|Salary|
+---------+--------+------+---+-----------+------+
|     Anna|    Rose|     F| 41|Engineering|  4000|
|   Robert|Williams|     M| 62|  Logistics|  5000|
|  Michael|   Brown|     M| 34|      Sales|  3500|
+---------+--------+------+---+-----------+------+

+---------+--------+------+---+-----------+------+
|Firstname|Lastname|Gender|Age| Department|Salary|
+---------+--------+------+---+-----------+------+
|     Anna|    Rose|     F| 41|Engineering|  4000|
|   Robert|Williams|     M| 62|  Logistics|  5000|
|  Michael|   Brown|     M| 34|      Sales|  3500|
+---------+--------+------+---+-----------+------+

===== where
+---------+--------+------+---+-----------+------+
|Firstname|Lastname|Gender|Age| Department|Salary|
+---------+--------+------+---+-----------+------+
|     Anna|    Rose|     F| 41|Engineering|  4000|
|   Robert|Williams|     M| 62|  Logistics|  5000|
|  Mic

In [4]:
# ===== Removing rows: filter
# Rows are "removed" by keeping only those whose Firstname is not "Michael";
# `df` itself is left unchanged.
df_dropped = df.filter(df["Firstname"] != "Michael")
df_dropped.show()

+---------+--------+------+---+-----------+------+
|Firstname|Lastname|Gender|Age| Department|Salary|
+---------+--------+------+---+-----------+------+
|    James|   Smith|     M| 30|      Sales|  3000|
|     Anna|    Rose|     F| 41|Engineering|  4000|
|   Robert|Williams|     M| 62|  Logistics|  5000|
|    Maria|  Garcia|     F| 20|Engineering|  2000|
|     Mary|   Smith|     F| 30| Accounting|  2500|
+---------+--------+------+---+-----------+------+



In [5]:
from pyspark.sql.functions import col

# ===== Ordering rows: sort / orderBy
# === Ordering with sort
print("===== sort")
# Ascending by Age
df_sort = df.sort("Age")
# Equivalent: df.sort(df.Age)
df_sort.show()

# Descending by Age
df_sort = df.sort(col("Age").desc())
# Equivalent: df.sort(df.Age.desc())
df_sort.show()

# Several columns: Age ascending, then Salary descending
df_sort = df.sort("Age", col("Salary").desc())
# Equivalent: df.sort(df.Age, df.Salary.desc())
df_sort.show()

# === Ordering with orderBy
print("===== orderBy")
# Ascending by Age
df_orderby = df.orderBy("Age")
# Equivalent: df.orderBy(df.Age)
df_orderby.show()

# Descending by Age
df_orderby = df.orderBy(col("Age").desc())
# Equivalent: df.orderBy(df.Age.desc())
df_orderby.show()

# Several columns: Age ascending, then Salary descending
df_orderby = df.orderBy("Age", col("Salary").desc())
# Equivalent: df.orderBy(df.Age, df.Salary.desc())
df_orderby.show()

===== sort
+---------+--------+------+---+-----------+------+
|Firstname|Lastname|Gender|Age| Department|Salary|
+---------+--------+------+---+-----------+------+
|    Maria|  Garcia|     F| 20|Engineering|  2000|
|     Mary|   Smith|     F| 30| Accounting|  2500|
|    James|   Smith|     M| 30|      Sales|  3000|
|  Michael|   Brown|     M| 34|      Sales|  3500|
|     Anna|    Rose|     F| 41|Engineering|  4000|
|   Robert|Williams|     M| 62|  Logistics|  5000|
+---------+--------+------+---+-----------+------+

+---------+--------+------+---+-----------+------+
|Firstname|Lastname|Gender|Age| Department|Salary|
+---------+--------+------+---+-----------+------+
|   Robert|Williams|     M| 62|  Logistics|  5000|
|     Anna|    Rose|     F| 41|Engineering|  4000|
|  Michael|   Brown|     M| 34|      Sales|  3500|
|     Mary|   Smith|     F| 30| Accounting|  2500|
|    James|   Smith|     M| 30|      Sales|  3000|
|    Maria|  Garcia|     F| 20|Engineering|  2000|
+---------+--------

In [6]:
# ===== Grouping rows: groupBy
# Group once by Department and reuse the grouping for both aggregations.
by_department = df.groupBy("Department")

# Number of rows in each department
df_groupby_count = by_department.count()
df_groupby_count.show()

# max() without arguments: the result has one max(...) column per numeric
# column (Age and Salary for this schema).
df_groupby_max = by_department.max()
df_groupby_max.show()

+-----------+-----+
| Department|count|
+-----------+-----+
|      Sales|    2|
|Engineering|    2|
|  Logistics|    1|
| Accounting|    1|
+-----------+-----+

+-----------+--------+-----------+
| Department|max(Age)|max(Salary)|
+-----------+--------+-----------+
|      Sales|      34|       3500|
|Engineering|      41|       4000|
|  Logistics|      62|       5000|
| Accounting|      30|       2500|
+-----------+--------+-----------+



In [7]:
# ===== Removing duplicate rows: dropDuplicates
# Rows appended to `df` to produce a frame that contains duplicates.
duplicate_rows = [
    ("Michael", "Brown", "M", 34, "Sales", 3500),
    ("Maria", "Garcia", "F", 20, "Engineering", 2000),
    ("Mary", "Smith", "F", 30, "Accounting", 2500),
]
duplicates_df = spark.createDataFrame(duplicate_rows, schema)

# Work on a separate frame so `df` itself is left untouched.
df_temp = df.union(duplicates_df)
df_temp.show()

# ==== Drop the duplicates
print("===== dropDuplicates")
# No subset: rows count as duplicates only when every column matches
df_drop = df_temp.dropDuplicates()
df_drop.show()

# Duplicates judged on a single column
df_drop = df_temp.dropDuplicates(subset=["Lastname"])
df_drop.show()

# Duplicates judged on a combination of columns
df_drop = df_temp.dropDuplicates(subset=["Firstname", "Lastname"])
df_drop.show()

+---------+--------+------+---+-----------+------+
|Firstname|Lastname|Gender|Age| Department|Salary|
+---------+--------+------+---+-----------+------+
|    James|   Smith|     M| 30|      Sales|  3000|
|     Anna|    Rose|     F| 41|Engineering|  4000|
|   Robert|Williams|     M| 62|  Logistics|  5000|
|  Michael|   Brown|     M| 34|      Sales|  3500|
|    Maria|  Garcia|     F| 20|Engineering|  2000|
|     Mary|   Smith|     F| 30| Accounting|  2500|
|  Michael|   Brown|     M| 34|      Sales|  3500|
|    Maria|  Garcia|     F| 20|Engineering|  2000|
|     Mary|   Smith|     F| 30| Accounting|  2500|
+---------+--------+------+---+-----------+------+

===== dropDuplicates
+---------+--------+------+---+-----------+------+
|Firstname|Lastname|Gender|Age| Department|Salary|
+---------+--------+------+---+-----------+------+
|    James|   Smith|     M| 30|      Sales|  3000|
|     Anna|    Rose|     F| 41|Engineering|  4000|
|   Robert|Williams|     M| 62|  Logistics|  5000|
|  Michae

In [8]:
# Stop the SparkSession now that all examples have run
spark.stop()