In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
#sparksession 드라이버 프로세스 얻기
spark = SparkSession.builder.appName("sample").master("local[*]").getOrCreate()
#클러스터모드의 경우 master에 local[*] 대신 yarn이 들어간다.
spark.conf.set("spark.sql.repl.eagerEval.enabled",True)
#jupyter환경에서만 가능한 config, .show()메소드를 사용할 필요없이 dataframe만 실행해도,정렬된 프린팅을 해준다.

In [2]:
csv_file_path = "C:/Users/imfks/Downloads/products.csv"
df = spark.read.csv(csv_file_path, header=True) 

In [3]:
df.printSchema

<bound method DataFrame.printSchema of +---+--------------------+---------------+--------+------+
| id|                name|       category|quantity| price|
+---+--------------------+---------------+--------+------+
|  1|           iPhone 12|    Electronics|      10|899.99|
|  2|     Nike Air Max 90|       Clothing|      25|119.99|
|  3|KitchenAid Stand ...|Home Appliances|       5|299.99|
|  4|    The Great Gatsby|          Books|      50| 12.99|
|  5|L'Oreal Paris Mas...|         Beauty|     100|  9.99|
|  6|            Yoga Mat|         Sports|      30| 29.99|
|  7| Samsung 4K Smart TV|    Electronics|       8|799.99|
|  8|        Levi's Jeans|       Clothing|      15| 49.99|
|  9|Dyson Vacuum Cleaner|Home Appliances|       3|399.99|
| 10| Harry Potter Series|          Books|      20| 15.99|
| 11|        MAC Lipstick|         Beauty|      75| 16.99|
| 12|Adidas Running Shoes|         Sports|      22| 59.99|
| 13|       PlayStation 5|    Electronics|      12|499.99|
| 14|   Hooded Sw

In [4]:
# import necessary types
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

In [5]:
# Define the schema
schema = StructType([
    StructField(name="id", dataType=IntegerType(), nullable=True),
    StructField(name="name", dataType=StringType(), nullable=True),
    StructField(name="category", dataType=StringType(), nullable=True),
    StructField(name="quantity", dataType=IntegerType(), nullable=True),
    StructField(name="price", dataType=DoubleType(), nullable=True)
])

In [7]:
csv_file_path = "C:/Users/imfks/Downloads/products.csv"
df = spark.read.csv(csv_file_path, header=True, schema=schema) 

In [8]:
df.printSchema

<bound method DataFrame.printSchema of +---+--------------------+---------------+--------+------+
| id|                name|       category|quantity| price|
+---+--------------------+---------------+--------+------+
|  1|           iPhone 12|    Electronics|      10|899.99|
|  2|     Nike Air Max 90|       Clothing|      25|119.99|
|  3|KitchenAid Stand ...|Home Appliances|       5|299.99|
|  4|    The Great Gatsby|          Books|      50| 12.99|
|  5|L'Oreal Paris Mas...|         Beauty|     100|  9.99|
|  6|            Yoga Mat|         Sports|      30| 29.99|
|  7| Samsung 4K Smart TV|    Electronics|       8|799.99|
|  8|        Levi's Jeans|       Clothing|      15| 49.99|
|  9|Dyson Vacuum Cleaner|Home Appliances|       3|399.99|
| 10| Harry Potter Series|          Books|      20| 15.99|
| 11|        MAC Lipstick|         Beauty|      75| 16.99|
| 12|Adidas Running Shoes|         Sports|      22| 59.99|
| 13|       PlayStation 5|    Electronics|      12|499.99|
| 14|   Hooded Sw

In [10]:
json_file_path = "C:/Users/imfks/Downloads/products_singleline.json"
df = spark.read.json(json_file_path)

In [12]:
df.printSchema
df.show(5)

+---------------+---+--------------------+------+--------+
|       category| id|                name| price|quantity|
+---------------+---+--------------------+------+--------+
|    Electronics|  1|           iPhone 12|899.99|      10|
|       Clothing|  2|     Nike Air Max 90|119.99|      25|
|Home Appliances|  3|KitchenAid Stand ...|299.99|       5|
|          Books|  4|    The Great Gatsby| 12.99|      50|
|         Beauty|  5|L'Oreal Paris Mas...|  9.99|     100|
+---------------+---+--------------------+------+--------+
only showing top 5 rows



In [13]:
json_file_path = "C:/Users/imfks/Downloads/products_multiline.json"
df = spark.read.json(json_file_path, multiLine=True)

df.printSchema

<bound method DataFrame.printSchema of +---------------+---+--------------------+------+--------+
|       category| id|                name| price|quantity|
+---------------+---+--------------------+------+--------+
|    Electronics|  1|           iPhone 12|899.99|      10|
|       Clothing|  2|     Nike Air Max 90|119.99|      25|
|Home Appliances|  3|KitchenAid Stand ...|299.99|       5|
|          Books|  4|    The Great Gatsby| 12.99|      50|
|         Beauty|  5|L'Oreal Paris Mas...|  9.99|     100|
|         Sports|  6|            Yoga Mat| 29.99|      30|
|    Electronics|  7| Samsung 4K Smart TV|799.99|       8|
|       Clothing|  8|        Levi's Jeans| 49.99|      15|
|Home Appliances|  9|Dyson Vacuum Cleaner|399.99|       3|
|          Books| 10| Harry Potter Series| 15.99|      20|
|         Beauty| 11|        MAC Lipstick| 16.99|      75|
|         Sports| 12|Adidas Running Shoes| 59.99|      22|
|    Electronics| 13|       PlayStation 5|499.99|      12|
|       Clothing|