In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

# Initialise the SparkSession (reuses an already-running session if there is one)
spark = (
    SparkSession.builder
    .appName("ColumnsOperation")
    .getOrCreate()
)

# Define the schema: one nullable field per column, added in display order
schema = (
    StructType()
    .add("Firstname", StringType(), True)
    .add("Lastname", StringType(), True)
    .add("Gender", StringType(), True)
    .add("Age", IntegerType(), True)
    .add("Department", StringType(), True)
    .add("Salary", IntegerType(), True)
)

# Sample rows; each tuple follows the schema's field order
data = [
    ("James", "Smith", "M", 30, "Sales", 3000),
    ("Anna", "Rose", "F", 41, "Engineering", 4000),
    ("Robert", "Williams", "M", 62, "Logistics", 5000),
]

# Build the DataFrame, then show its schema and contents
df = spark.createDataFrame(data=data, schema=schema)
df.printSchema()
df.show()

root
 |-- Firstname: string (nullable = true)
 |-- Lastname: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: integer (nullable = true)

+---------+--------+------+---+-----------+------+
|Firstname|Lastname|Gender|Age| Department|Salary|
+---------+--------+------+---+-----------+------+
|    James|   Smith|     M| 30|      Sales|  3000|
|     Anna|    Rose|     F| 41|Engineering|  4000|
|   Robert|Williams|     M| 62|  Logistics|  5000|
+---------+--------+------+---+-----------+------+



In [2]:
from pyspark.sql.functions import lit

# ===== Adding columns: withColumn
# Append two constant columns in one chain:
#   Country - the string literal "USA"
#   Rate    - the literal 1.1 cast to Spark's float type
df = (
    df
    .withColumn("Country", lit("USA"))
    .withColumn("Rate", lit(1.1).cast("float"))
)
df.printSchema()
df.show()

root
 |-- Firstname: string (nullable = true)
 |-- Lastname: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Country: string (nullable = false)
 |-- Rate: float (nullable = false)

+---------+--------+------+---+-----------+------+-------+----+
|Firstname|Lastname|Gender|Age| Department|Salary|Country|Rate|
+---------+--------+------+---+-----------+------+-------+----+
|    James|   Smith|     M| 30|      Sales|  3000|    USA| 1.1|
|     Anna|    Rose|     F| 41|Engineering|  4000|    USA| 1.1|
|   Robert|Williams|     M| 62|  Logistics|  5000|    USA| 1.1|
+---------+--------+------+---+-----------+------+-------+----+



In [3]:
# ===== Renaming a column: withColumnRenamed
# "Department" becomes "Dept"; the column's data and position are unchanged.
df = df.withColumnRenamed(existing="Department", new="Dept")
df.printSchema()
df.show()

root
 |-- Firstname: string (nullable = true)
 |-- Lastname: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Dept: string (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Country: string (nullable = false)
 |-- Rate: float (nullable = false)

+---------+--------+------+---+-----------+------+-------+----+
|Firstname|Lastname|Gender|Age|       Dept|Salary|Country|Rate|
+---------+--------+------+---+-----------+------+-------+----+
|    James|   Smith|     M| 30|      Sales|  3000|    USA| 1.1|
|     Anna|    Rose|     F| 41|Engineering|  4000|    USA| 1.1|
|   Robert|Williams|     M| 62|  Logistics|  5000|    USA| 1.1|
+---------+--------+------+---+-----------+------+-------+----+



In [4]:
# ===== Dropping a column: drop
# Rebind df to the result of drop("Country"), then print the schema and rows.
df = df.drop("Country")
df.printSchema()
df.show()

root
 |-- Firstname: string (nullable = true)
 |-- Lastname: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Dept: string (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Rate: float (nullable = false)

+---------+--------+------+---+-----------+------+----+
|Firstname|Lastname|Gender|Age|       Dept|Salary|Rate|
+---------+--------+------+---+-----------+------+----+
|    James|   Smith|     M| 30|      Sales|  3000| 1.1|
|     Anna|    Rose|     F| 41|Engineering|  4000| 1.1|
|   Robert|Williams|     M| 62|  Logistics|  5000| 1.1|
+---------+--------+------+---+-----------+------+----+



In [5]:
# ===== Selecting columns: select
# Keep only Firstname and Age in a separate DataFrame; df itself is not rebound.
df_selected = df.select("Firstname", "Age")
df_selected.printSchema()
df_selected.show()

root
 |-- Firstname: string (nullable = true)
 |-- Age: integer (nullable = true)

+---------+---+
|Firstname|Age|
+---------+---+
|    James| 30|
|     Anna| 41|
|   Robert| 62|
+---------+---+



In [6]:
from pyspark.sql.functions import expr

# ===== Applying an expression: expr
# expr takes a SQL expression string; here it multiplies each row's Salary by its Rate
# and the product is stored in a new "Increased Salary" column.
df = df.withColumn(
    colName="Increased Salary",
    col=expr("Salary * Rate"),
)
df.printSchema()
df.show()

root
 |-- Firstname: string (nullable = true)
 |-- Lastname: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Dept: string (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Rate: float (nullable = false)
 |-- Increased Salary: float (nullable = true)

+---------+--------+------+---+-----------+------+----+----------------+
|Firstname|Lastname|Gender|Age|       Dept|Salary|Rate|Increased Salary|
+---------+--------+------+---+-----------+------+----+----------------+
|    James|   Smith|     M| 30|      Sales|  3000| 1.1|          3300.0|
|     Anna|    Rose|     F| 41|Engineering|  4000| 1.1|          4400.0|
|   Robert|Williams|     M| 62|  Logistics|  5000| 1.1|          5500.0|
+---------+--------+------+---+-----------+------+----+----------------+



In [7]:
# Stop the SparkSession
spark.stop()