### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [None]:
!wget https://users.itk.ppke.hu/~pasda2/Books_rating.csv.zip
!unzip Books_rating.csv.zip
!pip install pyspark
!pip install sparkmeasure

--2024-05-06 13:21:41--  https://users.itk.ppke.hu/~pasda2/Books_rating.csv.zip
Resolving users.itk.ppke.hu (users.itk.ppke.hu)... 193.225.109.33
Connecting to users.itk.ppke.hu (users.itk.ppke.hu)|193.225.109.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1079521208 (1.0G) [application/zip]
Saving to: ‘Books_rating.csv.zip’


2024-05-06 13:22:02 (51.1 MB/s) - ‘Books_rating.csv.zip’ saved [1079521208/1079521208]

Archive:  Books_rating.csv.zip
  inflating: Books_rating.csv        


In [None]:
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
from pyspark import SparkConf

# Base configuration: only the application name is set here.
conf = SparkConf().setAppName("idk")

# Build the session with the UI port and executor memory settings applied.
# NOTE(review): only executor-side memory is configured; confirm whether the
# driver heap also needs to be raised for this environment.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .config("spark.ui.port", "4050")
    .config("spark.executor.memoryOverhead", "60g")
    .config("spark.executor.memory", "40g")
    .getOrCreate()
)

# Read the extracted CSV, treating the first row as the header and letting
# Spark infer the column types, then print the resulting schema.
df = spark.read.csv("Books_rating.csv", header=True, inferSchema=True)
df.printSchema()
#to reduce the size of dataset:
#df=df.limit(50000)
#df.show()
#print((df.count(), len(df.columns)))
import csv
import time
from pyspark.sql.types import *
# Reference timestamp for the elapsed-time prints further down.
st = time.time()

# Column names in schema order: the first item of each entry in df.dtypes.
columns = [entry[0] for entry in df.dtypes]
print(columns)

# Write a custom function to convert the data type of DataFrame columns
def convertColumn(df, names, newType):
    """Cast the listed columns of ``df`` to ``newType``.

    Args:
        df: DataFrame-like object supporting ``df[name]`` and ``withColumn``.
        names: Iterable of column names to cast, processed in order.
        newType: Target type handed to each column's ``cast``.

    Returns:
        The DataFrame produced by the successive ``withColumn`` calls, or
        ``df`` itself when ``names`` is empty.
    """
    converted = df
    for column_name in names:
        # Replace the column under its own name with the cast version.
        cast_column = converted[column_name].cast(newType)
        converted = converted.withColumn(column_name, cast_column)
    return converted


# Convert every `df` column to `FloatType()`.
df = convertColumn(df, columns, FloatType())

# fillna returns a new DataFrame instead of changing `df` in place, so the
# result must be assigned back; the bare call previously here had no effect.
df = df.fillna(value=0)
df.printSchema()
# Seconds elapsed since `st` was taken.
print(time.time()-st)

from traitlets.traitlets import Float
# Replace null entries with 0 and rebind `df` to the filled result.
df=df.fillna(value=0)
# Display a preview of the resulting DataFrame.
df.show()
import numpy as np
from pyspark.sql import functions as F
def convertColumnNeg(df, names):
    """Replace negative values with 0 in the listed columns of ``df``.

    Args:
        df: DataFrame whose columns are rewritten.
        names: Iterable of column names to process, in order.

    Returns:
        The DataFrame produced by the successive ``withColumn`` calls, or
        ``df`` itself when ``names`` is empty.
    """
    result = df
    for column_name in names:
        # Values below zero become 0; everything else keeps the column value.
        is_negative = result[column_name] < 0
        clamped = F.when(is_negative, 0).otherwise(F.col(column_name))
        result = result.withColumn(column_name, clamped)
    return result

# Zero out negative entries in every column listed in `columns`.
df = convertColumnNeg(df, columns)

# Substitute 0 for positive and negative infinity.
df = df.replace(to_replace=[np.inf, -np.inf], value=0)

# Seconds elapsed since `st` was taken.
print(time.time() - st)


from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
print(columns)

# Feature inputs are all columns except the two selected as score/label
# below. A filtered copy is built on purpose: binding `incols` to `columns`
# and calling .remove() on it would also strip those names from `columns`.
excluded = ("review/score", "review/helpfulness")
incols = [name for name in columns if name not in excluded]

# Pack the feature columns into a single "features" vector column.
assembler = VectorAssembler(inputCols=incols, outputCol="features")
df = assembler.transform(df)
df.printSchema()

# Keep only what the models consume: the feature vector, the review score
# (renamed "score") and the helpfulness value (renamed "label").
final_data = df.select(
    "features",
    F.col("review/score").alias("score"),
    F.col("review/helpfulness").alias("label"),
)
final_data.printSchema()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2024/05/06 13:51:13 NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

root
 |-- Id: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- User_id: string (nullable = true)
 |-- profileName: string (nullable = true)
 |-- review/helpfulness: string (nullable = true)
 |-- review/score: string (nullable = true)
 |-- review/time: string (nullable = true)
 |-- review/summary: string (nullable = true)
 |-- review/text: string (nullable = true)

['Id', 'Title', 'Price', 'User_id', 'profileName', 'review/helpfulness', 'review/score', 'review/time', 'review/summary', 'review/text']
root
 |-- Id: float (nullable = true)
 |-- Title: float (nullable = true)
 |-- Price: float (nullable = true)
 |-- User_id: float (nullable = true)
 |-- profileName: float (nullable = true)
 |-- review/helpfulness: float (nullable = true)
 |-- review/score: float (nullable = true)
 |-- review/time: float (nullable = true)
 |-- review/summary: float (nullable = true)
 |-- review/text: float (nullable = true)

0.2289867401123047
+---------

In [None]:
# List every (key, value) pair of the active Spark configuration as the cell result.
spark.sparkContext.getConf().getAll()


[('spark.executor.extraClassPath',
  '/home/datascience/spark_conf_dir/common-jars/*'),
 ('spark.driver.extraClassPath',
  '/home/datascience/spark_conf_dir/common-jars/*'),
 ('spark.sql.hive.metastore.version', '3.1.2'),
 ('spark.sql.warehouse.dir', 'file:/home/datascience/spark-warehouse'),
 ('spark.app.startTime', '1715002670734'),
 ('spark.hadoop.oci.dcat.metastore.create.bucket.per.db', 'false'),
 ('spark.hadoop.hive.stats.autogather', 'false'),
 ('spark.app.name', 'idk'),
 ('spark.hadoop.hive.exec.dynamic.partition.mode', 'dynamic'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.sql.hive.metastore.jars',
  '/home/datascience/spark_conf_dir/datacatalog-metastore-client-jars/integration/datacatalog-metastore-hive-v3-integration-1.0.0-2306052252.jar:/home/datascience/spark_conf_dir/datacatalog-metastore-client-jars/datacatalog-metastore-client-1.0.0-2306052252.jar:/home/datascience/spark_conf_dir/datacatalog-metastore-client-jars/datacatalog-metastore-commons-1.0.0-230605

In [None]:

from pyspark.ml.classification import LinearSVC

# Linear SVC estimator: 10 iterations, regularisation parameter 0.1.
# NOTE(review): "label" is the float-cast review/helpfulness column; confirm
# its values are valid class labels for this estimator.
lsvc = LinearSVC(
    maxIter=10,
    regParam=0.1,
)

# Train on the assembled data.
lsvcModel = lsvc.fit(final_data)

# Report the fitted coefficients and intercept.
print(f"Coefficients: {lsvcModel.coefficients!s}")
print(f"Intercept: {lsvcModel.intercept!s}")

In [None]:
from pyspark.ml.classification import LogisticRegression

# NOTE(review): a recorded run of this cell failed with
# java.lang.OutOfMemoryError while allocating an array inside
# MultiClassSummarizer.histogram. Presumably the "label" column holds very
# large values; verify the label range before fitting.

# Elastic-net regularised logistic regression with default family.
lr = LogisticRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

# Train on the assembled data.
lrModel = lr.fit(final_data)

# Report the fitted coefficients and intercept.
print(f"Coefficients: {lrModel.coefficients!s}")
print(f"Intercept: {lrModel.intercept!s}")

# Same hyper-parameters, but with the multinomial family requested explicitly.
mlr = LogisticRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
    family="multinomial",
)

# Train the multinomial variant.
mlrModel = mlr.fit(final_data)

# Report the coefficient matrix and intercept vector of the multinomial model.
print(f"Multinomial coefficients: {mlrModel.coefficientMatrix!s}")
print(f"Multinomial intercepts: {mlrModel.interceptVector!s}")

                                                                                

Py4JJavaError: An error occurred while calling o281.fit.
: java.lang.OutOfMemoryError: Java heap space
	at scala.reflect.ManifestFactory$DoubleManifest.newArray(Manifest.scala:194)
	at scala.reflect.ManifestFactory$DoubleManifest.newArray(Manifest.scala:191)
	at scala.Array$.ofDim(Array.scala:305)
	at org.apache.spark.ml.stat.MultiClassSummarizer.histogram(MultiClassSummarizer.scala:89)
	at org.apache.spark.ml.classification.LogisticRegression.$anonfun$train$1(LogisticRegression.scala:514)
	at org.apache.spark.ml.classification.LogisticRegression$$Lambda$2934/1250704889.apply(Unknown Source)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at org.apache.spark.ml.util.Instrumentation$$$Lambda$2935/2013917463.apply(Unknown Source)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:495)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:286)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:151)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:115)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)
