In [1]:
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) a Hive-enabled SparkSession for the
# cells that follow.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .enableHiveSupport()
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# Load a text file into a Hive table

In [2]:
# Create the Hive table `src`; `if not exists` makes this statement safe to
# repeat when the cell is re-run.
spark.sql("create table if not exists src (key int, value string) using hive")
# OVERWRITE replaces the table contents instead of appending to them, so
# re-running this cell does not load a second copy of kv1.txt into `src`.
spark.sql("load data local inpath '/opt/spark/examples/src/main/resources/kv1.txt' overwrite into table src")

23/11/28 03:48:05 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
23/11/28 03:48:05 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist


DataFrame[]

23/11/28 03:48:07 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException


DataFrame[]

# Load a JSON file into a DataFrame

In [3]:
# Read the sample people.json file into a DataFrame and print its rows.
people_json_path = "file:///opt/spark/examples/src/main/resources/people.json"
df = spark.read.format("json").load(people_json_path)
df.show()

                                                                                

+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [4]:
# Print the DataFrame's schema as a tree.
df.printSchema()
# Show only the "name" column.
df.select(df.name).show()
# Show each name next to the age incremented by one.
df.select(df.name, df.age + 1).show()
# Keep only the rows whose age is greater than 21.
df.where(df.age > 21).show()
# Count the rows for each distinct age value.
df.groupBy(df.age).count().show()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



                                                                                

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+

+-------+---------+
|   name|(age + 1)|
+-------+---------+
|Michael|     NULL|
|   Andy|       31|
| Justin|       20|
+-------+---------+

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|NULL|    1|
|  30|    1|
+----+-----+



In [5]:
# Register the DataFrame as the temporary view "people" so SQL can query it.
df.createOrReplaceTempView("people")

# Run the query against the view and print the resulting rows.
people_query = "SELECT * FROM people"
sqlDF = spark.sql(people_query)
sqlDF.show()

# spark.newSession().sql("SELECT * FROM global_temp.people").show()  # would raise an error

+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [6]:
# Register the DataFrame as the global temporary view "people".
df.createOrReplaceGlobalTempView("people")

# Keep the query result in `sqlDF` and display it in a separate step:
# `.show()` only prints and returns None, so chaining it onto the assignment
# would leave `sqlDF` set to None instead of a DataFrame.
# NOTE(review): the unqualified name `people` presumably resolves to the
# temporary view registered in the previous cell, not the global one - confirm.
sqlDF = spark.sql("SELECT * FROM people")
sqlDF.show()

# Query the global view from a brand-new session through the `global_temp`
# database qualifier.
spark.newSession().sql("SELECT * FROM global_temp.people").show()

+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [7]:
# Stop the SparkSession used by the cells above.
spark.stop()

# Distribute a file with SparkContext.addFile

In [8]:
from pyspark import SparkContext, SparkFiles

tfile = './file.ipynb'

# Initialise the SparkContext inside a `with` block: the context is stopped
# when the block exits, even if addFile() or SparkFiles.get() raises, so a
# failed run does not leave a live context that blocks creating a new one.
with SparkContext("local", "AddFileExample") as sc:
    # Add the file so that it is made available on every node.
    sc.addFile(tfile)

    # Look up the local path of the added file and print it explicitly rather
    # than relying on a bare expression in the middle of the cell.
    tfile_local_path = SparkFiles.get(tfile)
    print(tfile_local_path)

'/tmp/spark-f7209b2c-e38f-4660-98ab-d017d2a1371d/userFiles-136b0d25-5707-4ff5-bc2d-4d4580492fa5/file.ipynb'

# Save a DataFrame as a table

In [1]:
from pyspark.sql.functions import split
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) a Hive-enabled SparkSession named
# "savetable" for the table-writing cells below.
spark = (
    SparkSession.builder
    .appName("savetable")
    .config("spark.some.config.option", "some-value")
    .enableHiveSupport()
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/04 09:17:04 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# Read the file as plain text; the next cell splits the `value` column of
# this DataFrame into individual fields.
adj = spark.read.format("text").load('file:///home/jupyter/data/test/adj_rs.csv')

In [3]:

# Output column names; position i names the i-th comma-separated field.
columns = [
    'time', 'app_id', 'store', 'adid', 'openid', 'activity_kind',
    'created_at', 'installed_at', 'reattributed_at', 'network_name',
    'country', 'device_name', 'device_type', 'os_name', 'timezone',
    'event_name', 'revenue_float', 'revenue', 'currency', 'revenue_usd',
    'reporting_revenue',
]
# Split every raw line on commas into a single array column named "split".
adj_split = adj.select(split(adj['value'], ',').alias('split'))
# Turn each array element into its own column, named after `columns`.
adj_final = adj_split.select(
    *[
        adj_split['split'].getItem(position).alias(column_name)
        for position, column_name in enumerate(columns)
    ]
)

In [4]:
# Write the split DataFrame to the table "adj" using overwrite mode.
adj_final.write.mode('overwrite').saveAsTable('adj')

23/12/04 09:18:01 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
23/12/04 09:18:01 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
23/12/04 09:18:08 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
23/12/04 09:18:08 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
23/12/04 09:18:08 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
23/12/04 09:18:08 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
23/12/04 09:18:08 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException


# Read the saved table back

In [13]:
# Load the "adj" table as a DataFrame and print its first row vertically
# (one field per line).
myadj = spark.read.table('adj')
myadj.show(n=1, vertical=True)

-RECORD 0---------------------------------
 time              | 2023-10-01 00:00:00  
 app_id            | 1456241577           
 store             | itunes               
 adid              | 041bf78c9dc6dd5f5... 
 openid            |                      
 activity_kind     | session              
 created_at        |                      
 installed_at      | 1636532102           
 reattributed_at   |                      
 network_name      | RWD-ady              
 country           | jp                   
 device_name       |                      
 device_type       |                      
 os_name           | ios                  
 timezone          | UTC+0900             
 event_name        |                      
 revenue_float     |                      
 revenue           |                      
 currency          |                      
 revenue_usd       |                      
 reporting_revenue |                      
only showing top 1 row



In [14]:
# Stop the SparkSession used by the table cells above.
spark.stop()