## part i: sparkML

In [None]:
# import my libraries + functions
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.pipeline import PipelineModel
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# build session, give my app a name
spark = SparkSession.builder.appName("ML Penguins").getOrCreate()

In [None]:
# get palmer penguins data
!wget https://raw.githubusercontent.com/mcnakhaee/palmerpenguins/master/palmerpenguins/data/penguins-raw.csv

--2024-03-20 18:10:59--  https://raw.githubusercontent.com/mcnakhaee/palmerpenguins/master/palmerpenguins/data/penguins-raw.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 53098 (52K) [text/plain]
Saving to: ‘penguins-raw.csv’


2024-03-20 18:11:00 (6.23 MB/s) - ‘penguins-raw.csv’ saved [53098/53098]



In [None]:
# check that csv made it to local file system
!ls

azure	   ganglia		       metastore_db
conf	   hadoop_accessed_config.lst  penguins-raw.csv
eventlogs  logs			       preload_class.lst


In [None]:
# upload copy of data to DISTRIBUTED file system - let's make partitions!
dbutils.fs.cp('file:/databricks/driver/penguins-raw.csv', 'dbfs:/FileStore/tables')

Out[31]: True

In [None]:
# read into spark dataframe
penguins = spark.read.csv('dbfs:/FileStore/tables/penguins-raw.csv', header=True, inferSchema=True)

In [None]:
# show me my first 5 rows
penguins.show(5, truncate=True)

+---------+-------------+--------------------+------+---------+------------------+-------------+-----------------+----------+------------------+-----------------+-------------------+-------------+------+-----------------+-----------------+--------------------+
|studyName|Sample Number|             Species|Region|   Island|             Stage|Individual ID|Clutch Completion|  Date Egg|Culmen Length (mm)|Culmen Depth (mm)|Flipper Length (mm)|Body Mass (g)|   Sex|Delta 15 N (o/oo)|Delta 13 C (o/oo)|            Comments|
+---------+-------------+--------------------+------+---------+------------------+-------------+-----------------+----------+------------------+-----------------+-------------------+-------------+------+-----------------+-----------------+--------------------+
|  PAL0708|            1|Adelie Penguin (P...|Anvers|Torgersen|Adult, 1 Egg Stage|         N1A1|              Yes|2007-11-11|              39.1|             18.7|                181|         3750|  MALE|              

In [None]:
#check on the schema
penguins.printSchema()

root
 |-- studyName: string (nullable = true)
 |-- Sample Number: integer (nullable = true)
 |-- Species: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Island: string (nullable = true)
 |-- Stage: string (nullable = true)
 |-- Individual ID: string (nullable = true)
 |-- Clutch Completion: string (nullable = true)
 |-- Date Egg: date (nullable = true)
 |-- Culmen Length (mm): string (nullable = true)
 |-- Culmen Depth (mm): string (nullable = true)
 |-- Flipper Length (mm): string (nullable = true)
 |-- Body Mass (g): string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Delta 15 N (o/oo): string (nullable = true)
 |-- Delta 13 C (o/oo): string (nullable = true)
 |-- Comments: string (nullable = true)



In [None]:
# change some columns names, legibility
penguins = penguins.withColumnRenamed("Culmen Length (mm)", "Bill Length (mm)")
penguins = penguins.withColumnRenamed("Culmen Depth (mm)", "Bill Depth (mm)")

In [None]:

# hmm ... i need computation, not strings
# change a list of columns to number data type (double, with decimal point values)

columns_to_number = ["Bill Length (mm)", "Bill Depth (mm)", "Flipper Length (mm)", "Body Mass (g)"]

# loop through list to cast as double
for column in columns_to_number:
    penguins = penguins.withColumn(column, penguins[column].cast("double"))

# check on schema
penguins.printSchema()


root
 |-- studyName: string (nullable = true)
 |-- Sample Number: integer (nullable = true)
 |-- Species: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Island: string (nullable = true)
 |-- Stage: string (nullable = true)
 |-- Individual ID: string (nullable = true)
 |-- Clutch Completion: string (nullable = true)
 |-- Date Egg: date (nullable = true)
 |-- Bill Length (mm): double (nullable = true)
 |-- Bill Depth (mm): double (nullable = true)
 |-- Flipper Length (mm): double (nullable = true)
 |-- Body Mass (g): double (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Delta 15 N (o/oo): string (nullable = true)
 |-- Delta 13 C (o/oo): string (nullable = true)
 |-- Comments: string (nullable = true)



In [None]:
# count my rows
penguins.count()

Out[11]: 344

In [None]:
# ML is not going to like any null, empty values - drop these
penguins = penguins.dropna(subset=columns_to_number, how='any')
# and see how many were dropped by comparing the counts
penguins.count()

Out[12]: 342

In [None]:
# first step in my pipeline: preprocess, Indexer
# ML doesn't understand words! needs -> numbers
# turn "Species" categories into distinct number values (indicies) with an indexer

indexer = StringIndexer(inputCol="Species", outputCol="index")
indexerModel = indexer.fit(penguins)
penguins_transformed = indexerModel.transform(penguins)

In [None]:
# check count of species
penguins_transformed.select("Species").groupBy("Species").count().orderBy("count", ascending=False).show()

+--------------------+-----+
|             Species|count|
+--------------------+-----+
|Adelie Penguin (P...|  151|
|Gentoo penguin (P...|  123|
|Chinstrap penguin...|   68|
+--------------------+-----+



In [None]:
# check count of "index" on transformed dataframe ... this looks the same!
penguins_transformed.select("index").groupBy("index").count().orderBy("count", ascending=False).show()

+-----+-----+
|index|count|
+-----+-----+
|  0.0|  151|
|  1.0|  123|
|  2.0|   68|
+-----+-----+



In [None]:
# next step in pipeline: vectorize
# put all relevant number columns into single vector in single column, Assembler

# in this example, I want data on Bill Length, Bill Depth, Flipper Length, and Body Mass to give me
# predictions about Species category

assembler = VectorAssembler(inputCols=columns_to_number, outputCol="features")
penguins_transformed = assembler.transform(penguins_transformed)

In [None]:
# show me the vector column!
# it should be a kind of "summary" of all the other columns
penguins_transformed.select("Bill Length (mm)", "Bill Depth (mm)", "Flipper Length (mm)", "Body Mass (g)", "features").show(3, truncate=False)

+----------------+---------------+-------------------+-------------+------------------------+
|Bill Length (mm)|Bill Depth (mm)|Flipper Length (mm)|Body Mass (g)|features                |
+----------------+---------------+-------------------+-------------+------------------------+
|39.1            |18.7           |181.0              |3750.0       |[39.1,18.7,181.0,3750.0]|
|39.5            |17.4           |186.0              |3800.0       |[39.5,17.4,186.0,3800.0]|
|40.3            |18.0           |195.0              |3250.0       |[40.3,18.0,195.0,3250.0]|
+----------------+---------------+-------------------+-------------+------------------------+
only showing top 3 rows



In [None]:
# next step in our pipeline: Classifier, LogisticRegression
# PySpark automatically makes this multinomial logistic regression (multiple categories), if we need

classifier = LogisticRegression(featuresCol="features", labelCol="index")

LOGISTIC REGRESSION generally (not a plot of this example!) <br>
looking for the lines/regions to make our classification from features <br>
<img src="https://i.stack.imgur.com/XxkK3.png">

In [None]:
# string all 3 stages (Indexer, Assembler, Classifier) into a PIPELINE
# this is reusable!

pipeline = Pipeline(stages=[indexer, assembler, classifier])

In [None]:
# split our dataset randomly into 70% training, 30% testing

training, testing = penguins.randomSplit([0.7, 0.3], seed=999)

In [None]:
# fit the model to the training data
model = pipeline.fit(training)

In [None]:
# run the model on the testing data to get predictions of species
predictions = model.transform(testing)
predictions.select("prediction", "index", "Species").show(truncate=True)

+----------+-----+--------------------+
|prediction|index|             Species|
+----------+-----+--------------------+
|       0.0|  0.0|Adelie Penguin (P...|
|       2.0|  2.0|Chinstrap penguin...|
|       1.0|  1.0|Gentoo penguin (P...|
|       1.0|  1.0|Gentoo penguin (P...|
|       1.0|  1.0|Gentoo penguin (P...|
|       2.0|  2.0|Chinstrap penguin...|
|       1.0|  1.0|Gentoo penguin (P...|
|       1.0|  1.0|Gentoo penguin (P...|
|       1.0|  1.0|Gentoo penguin (P...|
|       0.0|  2.0|Chinstrap penguin...|
|       1.0|  1.0|Gentoo penguin (P...|
|       2.0|  2.0|Chinstrap penguin...|
|       1.0|  1.0|Gentoo penguin (P...|
|       0.0|  0.0|Adelie Penguin (P...|
|       2.0|  2.0|Chinstrap penguin...|
|       2.0|  2.0|Chinstrap penguin...|
|       0.0|  0.0|Adelie Penguin (P...|
|       2.0|  2.0|Chinstrap penguin...|
|       1.0|  1.0|Gentoo penguin (P...|
|       2.0|  2.0|Chinstrap penguin...|
+----------+-----+--------------------+
only showing top 20 rows



In [None]:
# basic evaluation of model: # of correct predictions / total # predictions
evaluator = MulticlassClassificationEvaluator(labelCol="index", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy =", accuracy)

Accuracy = 0.9814814814814815


In [None]:
# write + persist the model
model.write().overwrite().save("penguin-model")
# look for this in DBFS

In [None]:
# load the model
loaded_model = PipelineModel.load("dbfs:/penguin-model")

In [None]:
# use the model again
# (different testing data, fine-tuning to fit again with different training data, etc.)

predictions_part2 = loaded_model.transform(testing);
predictions_part2.select("features", "Species", "index", "prediction").show(15, truncate=False)

# remember features are in this order:
# "Bill Length (mm)", "Bill Depth (mm)", "Flipper Length (mm)", "Body Mass (g)"

+------------------------+-----------------------------------------+-----+----------+
|features                |Species                                  |index|prediction|
+------------------------+-----------------------------------------+-----+----------+
|[39.1,18.7,181.0,3750.0]|Adelie Penguin (Pygoscelis adeliae)      |0.0  |0.0       |
|[46.5,17.9,192.0,3500.0]|Chinstrap penguin (Pygoscelis antarctica)|2.0  |2.0       |
|[48.7,14.1,210.0,4450.0]|Gentoo penguin (Pygoscelis papua)        |1.0  |1.0       |
|[50.0,15.2,218.0,5700.0]|Gentoo penguin (Pygoscelis papua)        |1.0  |1.0       |
|[47.6,14.5,215.0,5400.0]|Gentoo penguin (Pygoscelis papua)        |1.0  |1.0       |
|[45.2,17.8,198.0,3950.0]|Chinstrap penguin (Pygoscelis antarctica)|2.0  |2.0       |
|[46.5,13.5,210.0,4550.0]|Gentoo penguin (Pygoscelis papua)        |1.0  |1.0       |
|[45.4,14.6,211.0,4800.0]|Gentoo penguin (Pygoscelis papua)        |1.0  |1.0       |
|[46.7,15.3,219.0,5200.0]|Gentoo penguin (Pygoscelis p

## part ii: transformers

In [None]:
# first step: CONNECT -> T4 GPU!
# then run below to install pyTorch

!pip install torch torchvision -U
# this might take a minute ...



In [None]:
from transformers import pipeline

In [None]:
# choose pipeline! we'll start with zero-shot classification

classifier = pipeline("zero-shot-classification")

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# define your input text and possible labels (classes)
input_text = "Astronomy is the study of stars and planets."
possible_labels = ["Science", "History", "Sports"]

In [None]:
# run the pipeline stages and get result
result = classifier(input_text, possible_labels)

In [None]:
# nicely formatted result
print("Input Text:", input_text)
print("Predicted Class:", result["labels"][0])
print("Confidence Score:", result["scores"][0])

Input Text: Astronomy is the study of stars and planets.
Predicted Class: Science
Confidence Score: 0.6215195059776306


In [None]:
# print raw lists of labels and scores outputs
print(result["labels"])
print(result["scores"])

['Science', 'History', 'Sports']
[0.6215195059776306, 0.19454991817474365, 0.18393057584762573]


In [None]:
# choose a different model for the same pipeline task
# models from hugging face: https://huggingface.co/models

classifier_m = pipeline("zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli")

config.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/870M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/395 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.65M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

In [None]:
result = classifier_m(input_text, possible_labels)

In [None]:
print(result["labels"])
print(result["scores"])

# same scores or different scores as previous model?

['Science', 'Sports', 'History']
[0.9721117615699768, 0.014024194329977036, 0.013864022679626942]


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

In [None]:
# looking under the hood a little bit ... the config of the model
config = AutoConfig.from_pretrained("MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli")
print(config)

DebertaV2Config {
  "_name_or_path": "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "entailment",
    "1": "neutral",
    "2": "contradiction"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "contradiction": 2,
    "entailment": 0,
    "neutral": 1
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "torch_dty

In [None]:
print("Hidden size:", config.hidden_size)
print("Number of attention heads:", config.num_attention_heads)
print("Vocabulary size:", config.vocab_size)

Hidden size: 1024
Number of attention heads: 16
Vocabulary size: 128100


In [None]:
# just going through Tokenizer stage in the pipeline
tokenizer = AutoTokenizer.from_pretrained("MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli")

In [None]:
tokens = tokenizer.tokenize("Astronomy is the study of stars and planets.")
print(tokens)

['▁Astronomy', '▁is', '▁the', '▁study', '▁of', '▁stars', '▁and', '▁planets', '.']


In [None]:
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[34782, 269, 262, 797, 265, 2906, 263, 12658, 260]


In [None]:
response = tokenizer("Astronomy is the study of stars and planets.")
print(response)

{'input_ids': [1, 34782, 269, 262, 797, 265, 2906, 263, 12658, 260, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
proton_result = classifier_m("Proton is a great cat, he just gets a little out of control.", ["pets", "stories", "physics", "cooking", "trash"])

In [None]:
print(proton_result)

{'sequence': 'Proton is a great cat, he just gets a little out of control.', 'labels': ['pets', 'stories', 'trash', 'physics', 'cooking'], 'scores': [0.9938209056854248, 0.005064066965132952, 0.0004319684230722487, 0.00036989827640354633, 0.0003131227276753634]}


In [None]:
for i in range(len(proton_result["scores"])):
    print(proton_result["labels"][i] + ": " + str(proton_result["scores"][i] * 100) + "%")

pets: 99.38209056854248%
stories: 0.5064066965132952%
trash: 0.04319684230722487%
physics: 0.03698982764035463%
cooking: 0.03131227276753634%


## part iii: experiment w/transformers pipelines

In [None]:
# repeat the cells above with the zero-shot classification pipeline with your own sequence (sentence of text) and list of categories



In [None]:
# now choose a different pipeline task!
# full list here (many are NLP, but they are not all text-based ...)
# https://huggingface.co/docs/transformers/en/main_classes/pipelines

# you may want to check out the tutorial-style documentation with examples
# on this page: https://huggingface.co/learn/nlp-course/chapter1/3

In [None]:
# run at first with the default model, but then you can choose a different model
# and compare results

# list of models here: https://huggingface.co/models
# (many are open sourced; if 1 that you choose asks you to add API key, look for 1 that does not)

# many models also show example code of how to work with pipeline abstraction

In [None]:
# check out another pipeline task + model, as time permits



In [None]:
# watch the video on Hugging Face "Spaces," here: https://huggingface.co/spaces/launch



In [None]:
# is there a "Space" (deployed transformer app) that was exciting to you?
# play with Spaces apps here : https://huggingface.co/spaces



In [None]:
# what is going on here in terms of ABSTRACTION? what is being "abstracted" away from us? pros/cons?



In [None]:
# how are your feelings about this tempered by the readings for this week?



citations:

some codes/methods inspired by
*   [Medium TheDataScience-ProF, "Zero-Shot Classification Using Transformers"](https://medium.com/@TheDataScience-ProF/zero-shot-classification-using-transformers-unlocking-the-power-of-ai-for-text-based-tasks-e5118398ef17)
*   [Getting Started With Hugging Face](https://www.youtube.com/watch?v=QEaBAZQCtwE)
[link text](https://)
