### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [1]:
import ads
# Use resource-principal authentication for subsequent ADS calls.
ads.set_auth("resource_principal") 

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session: pulls in the Snowflake connector package
# and turns on Arrow for PySpark <-> pandas conversion.
spark = (
    SparkSession.builder
    .appName("Spark Snowflake to OCI")
    .config('spark.jars.packages', 'net.snowflake:spark-snowflake_2.12:2.14.0-spark_3.2')
    .config('spark.sql.execution.arrow.pyspark.enabled', 'true')
    .enableHiveSupport()
    .getOrCreate()
)

# Object Storage target, assembled as oci://<bucket>@<namespace>/<folder>.
namespace = 'fro8fl9kuqli'
bucket = 'snowflake'
folder = 'electrodata'
output_path = f'oci://{bucket}@{namespace}/{folder}'

print(output_path)

In [3]:
# Uncomment to list the JARs attached to the Spark context:
#print(spark.sparkContext._jsc.sc().listJars())
# Show the default parallelism of the Spark context.
print(spark.sparkContext.defaultParallelism)

2


In [None]:
from pyspark.sql.functions import col


# Connection settings for the Snowflake source (placeholder values).
snowflake_options = {
    "dbtable": "tabLE",
    "sfUrl": "XXXXXXX.snowflakecomputing.com",
    "sfUser": "TEST",
    "sfPassword": "PASS",
    "sfDatabase": "TEST",
    "sfSchema": "TEST",
    "sfWarehouse": "COMPUTE_WH",
}

# `st` is the row count of the Snowflake table, not a DataFrame.
st = spark.read.format("snowflake").options(**snowflake_options).load().count()


In [5]:
# `st` holds the row count returned by .count() on the Snowflake read.
print(st)

695


In [14]:
# Alternative: derive the path from the bucket/namespace variables instead.
##inputpath='oci://'+bucket+'@'+namespace+'/ele_sample.json'

# Sample JSON document used only to infer the schema of the stream payload.
# (Previously assigned to the misspelled name `inoutpath`, which left
# `inputpath` undefined on a fresh kernel.)
inputpath = 'oci://streamdata@fro8fl9kuqli/ele_sample.json'
print(inputpath)

example = spark.read.json(inputpath)
# Keep the inferred schema as a JSON string; the Kafka cell passes it to from_json().
schema = example.schema.json()
example.printSchema()
# print(schema)
# example.show()

oci://streamdata@fro8fl9kuqli/ele_sample.json
root
 |-- phase: string (nullable = true)
 |-- times: string (nullable = true)
 |-- topik: string (nullable = true)
 |-- v: double (nullable = true)



In [None]:
from pyspark.sql.functions import expr, from_json, col, concat, explode, year, month, date_format, to_timestamp
from pyspark.sql import functions as F


# Read the raw Kafka stream and decode each record's value with the sample schema,
# keeping the Kafka timestamp and offset alongside the parsed payload.
raw_stream = spark.readStream.format('kafka').options(**raw_kafka_options).load()
kafka = raw_stream.select(
    from_json(col("value").cast("string"), schema).alias("data"),
    col("timestamp").cast("timestamp").alias("kafka_time"),
    col("offset").cast("int").alias("offset"),
)

# Flatten the payload and derive the partition columns from `times`.
# `times` is converted to a timestamp last, after the derived columns are built.
df = (
    kafka
    .select("*")
    .select("data.*", "kafka_time", "offset")
    .select("kafka_time", "offset", "phase", "v", "times", "topik")
    .withColumn('year', year("times"))
    .withColumn('month', date_format('times', 'yyyy-MM'))
    .withColumn('day', date_format('times', 'yyyy-MM-dd'))
    .withColumn('hour', date_format('times', 'HH'))
    .withColumn("times", to_timestamp("times"))
)

df.printSchema()


def _append_batch(batch_df, epoch_id):
    # Append one micro-batch as parquet, partitioned by year/month/day.
    batch_df.write.mode("append").partitionBy("year", "month", "day").parquet(f"{output_path}")


# Trigger a micro-batch every minute and hand it to the batch writer.
query = (
    df.writeStream
    .trigger(processingTime='1 minute')
    .foreachBatch(_append_batch)
    .start()
)

# Wait with a timeout of 200 (seconds) rather than blocking indefinitely.
query.awaitTermination(200)

In [20]:
# Stop the streaming query started by writeStream...start().
query.stop()

In [24]:
# Read back only the parquet files of one day partition; basePath keeps the
# partition columns (year/month/day) in the resulting DataFrame.
day_glob = output_path + "/year=*/month=*/day=2024-05-23/*.parquet"
dl = spark.read.option("basePath", output_path).parquet(day_glob)
#dl.printSchema()
dl.count()

                                                                                

15897

In [28]:
# Register `dl` under the view name the query reads from. This was commented
# out, so the cell only worked when the view was left over from an earlier run.
dl.createOrReplaceTempView("eletro")

# Total of `v` per hour, ordered by hour.
sql="select hour, sum(v) from eletro group by hour order by 1"
spark.sql(sql).show()



+----+-------------------+
|hour|             sum(v)|
+----+-------------------+
|  00| 115964.67999999959|
|  01|  123614.4700000002|
|  02| 128611.23999999996|
|  03| 160143.60000000024|
|  04|  253690.7400000005|
|  05| 153279.22999999995|
|  06| -71796.88999999997|
|  07|-11835.169999999998|
+----+-------------------+



                                                                                

In [None]:
# Count rows per day and action in the `bal_tra` view.
sql_str = (
    "select day, action, count(*)"
    " from bal_tra "
    " group by day, action"
)

# Run the query and display the result.
dc = spark.sql(sql_str)
dc.show()

In [None]:
# NOTE(review): `bigbet` is not defined anywhere in the visible notebook —
# presumably left over from an earlier session; confirm or remove this cell.
bigbet.count()

In [None]:
# Use the `F.` namespace rather than importing sum/max/count by name, which
# would shadow the Python builtins for every later cell in the kernel.
from pyspark.sql import functions as F

# Number of rows per action, ordered by action.
dl.groupBy("action").agg(F.count("*")).orderBy("action").show()

In [None]:
# Row count of the current `dl` DataFrame (shown as the cell result).
dl.count()

In [36]:
# Load every parquet file directly under output_path.
dl = spark.read.option("basePath", output_path).parquet(output_path+"/*.parquet") 
# Optional narrowing to a single day/hour; chain onto the read above if needed:
#         .filter(col("day")=='2024-04-22').filter(col("hour")=="9")

# A bare expression in the middle of a cell is discarded, so the count was
# computed but never shown; print it explicitly.
print(dl.count())

dl.show()

[Stage 68:>                                                         (0 + 1) / 1]

+------------+----------+-------------+--------------+-------------+----------+----------+
|WINDOW_START|WINDOW_END|LASTTIMESTAMP|FIRSTTIMESTAMP|DEPOSIT_COUNT|MAXDEPOSIT|MINDEPOSIT|
+------------+----------+-------------+--------------+-------------+----------+----------+
+------------+----------+-------------+--------------+-------------+----------+----------+



                                                                                

In [None]:
# Expose the DataFrame to Spark SQL under the view name "dl".
dl.createOrReplaceTempView("dl")

sql = "select * from bal_tra"
#spark.sql(sql).show(2)

# Number of rows per day in the "dl" view.
sql_str = (
    "select day, count(*)"
    " from dl "
    " group by day"
)

# Run the query and display the result.
dc = spark.sql(sql_str)
dc.show()



In [None]:
import pyspark.pandas as ps
from autovizwidget.widget.utils import display_dataframe

# Make sure Arrow is enabled for the toPandas() conversion below.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Convert the aggregated result `dc` to a pandas DataFrame and render it in the widget.
pdc = dc.toPandas()
display_dataframe(pdc)

In [None]:
from pyspark.sql import functions as F

# Deposits only, with a 1-minute watermark declared on Created_at.
deposits = dl.filter(col("action") == 'deposit').withWatermark("Created_at", "1 minute")

# One group per action and 1-minute window of Created_at.
windowed = deposits.groupBy(['action', F.window('Created_at', '1 minute')])

# Per-window statistics: amounts at the latest/earliest Created_at, the
# timestamp range, the transaction count and the max/min amount.
inter = windowed.agg(
    F.expr("max_by(amount_cents, Created_at)").alias('lastvalue'),
    F.expr("min_by(amount_cents, Created_at)").alias('firstvalue'),
    F.max('Created_at').alias('lastTimeStamp'),
    F.count('Created_at').alias('trans_qty'),
    F.min('Created_at').alias('firstTimeStamp'),
    F.max('amount_cents').alias('MaxBet'),
    F.min('amount_cents').alias('MinBet'),
).orderBy("lastTimeStamp", ascending=False)


inter.show(truncate=False)

In [None]:
# Number of rows in the aggregated `inter` DataFrame.
inter.count()

In [None]:
# Stop the Spark session.
# NOTE(review): the Oracle read cell further down still uses `spark`; it
# presumably needs the session re-created first — confirm the intended order.
spark.stop()

In [None]:
# Load the `dataflow.magics` IPython extension.
%load_ext dataflow.magics

In [None]:
import os
import requests

# Never commit the bot token: read it from the environment instead.
# NOTE: the token that used to be hard-coded here has been exposed and should
# be revoked and replaced.
TOKEN = os.environ["TELEGRAM_BOT_TOKEN"]
# url = f"https://api.telegram.org/bot{TOKEN}/getUpdates"
# print(requests.get(url).json())

chat_id = "844904100"
message = "OCI Python can send a message to your telegram chat!"
url = f"https://api.telegram.org/bot{TOKEN}/sendMessage"
# Pass the fields through `params` so the message text is URL-encoded, and set
# a timeout so a stalled connection cannot hang the kernel.
response = requests.get(url, params={"chat_id": chat_id, "text": message}, timeout=10)
print(response.json()) # this sends the message

In [None]:
import requests

def send_telegram_message(token, chat_id, message, timeout=10):
    """Send a text message to a Telegram chat through the Bot API.

    Args:
        token: Bot token used in the API URL.
        chat_id: Identifier of the chat that receives the message.
        message: Text to send.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout by default, so without it a stalled
            connection would block forever.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    url = f"https://api.telegram.org/bot{token}/sendMessage"
    payload = {
        'chat_id': chat_id,
        'text': message,
    }
    response = requests.post(url, data=payload, timeout=timeout)
    return response.json()

# Example usage:
import os

# Never commit the bot token: read it from the environment instead.
# NOTE: the token that used to be hard-coded here has been exposed and should
# be revoked and replaced.
bot_token = os.environ["TELEGRAM_BOT_TOKEN"]
chat_id = '844904100'
message = 'Hello message nr 5!'

a = send_telegram_message(bot_token, chat_id, message)
#print(a['ok'])
print(a)

In [None]:
import os

# Read the query result from the Oracle database through the wallet stored in
# Object Storage. The password is taken from the environment instead of being
# hard-coded; the previously committed password should be rotated.
ds = (
    spark.read.format("oracle")
    .option("walletUri", "oci://dataflow_app@" + namespace + "/Adw_Forza_wallet.zip")
    .option("connectionId", "db201909271450_high")
    .option("query", "select * from car.kafka_stream_dv")
    .option("user", "CAR")
    .option("password", os.environ["ADW_PASSWORD"])
    .load()
)

