<a href="https://colab.research.google.com/github/nandini-mazumdar/learning-spark-again/blob/main/Spark_ML_AI_Dan_Sullivan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [None]:
import os

# Tell the Spark tooling where the JDK and the unpacked Spark distribution live.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
    }
)

In [None]:
# Keep this ordering: findspark.init() runs before the pyspark import below. pyspark is
# never pip-installed in this notebook, so the import presumably relies on findspark
# exposing the distribution that SPARK_HOME points at.
import findspark
findspark.init()

from pyspark.sql import SparkSession
# Build (or reuse) a session against a local master.
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # Property used to format output tables better
# Bare expression: display the session object as the cell output.
spark

# Normalization

### MinMaxScaler - attribute values 0 to 1

In [None]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors

In [None]:
# Three sample rows: an integer id plus a three-element dense feature vector.
feature_df = spark.createDataFrame(
    [
        (1, Vectors.dense([10.0, 10000.0, 1.0])),
        (2, Vectors.dense([20.0, 30000.0, 2.0])),
        (3, Vectors.dense([30.0, 40000.0, 3.0])),
    ],
    ["id", "features"],
)

In [None]:
# Peek at the first two rows to confirm the DataFrame was built as intended.
feature_df.take(2)

[Row(id=1, features=DenseVector([10.0, 10000.0, 1.0])),
 Row(id=2, features=DenseVector([20.0, 30000.0, 2.0]))]

In [None]:
# Configure the scaler: read vectors from "features", write rescaled vectors to "sFeatures".
feature_scaler = MinMaxScaler(
    inputCol="features",
    outputCol="sFeatures",
)

# fit() does not scale anything yet: it inspects feature_df and returns a fitted model.
smodel = feature_scaler.fit(feature_df)

# transform() is the step that produces the DataFrame with the scaled column added.
sFeatures_df = smodel.transform(feature_df)

In [None]:
# Row 1 holds the smallest value of every feature, so its scaled vector is all zeros,
# which is displayed as an empty SparseVector.
sFeatures_df.take(2)

[Row(id=1, features=DenseVector([10.0, 10000.0, 1.0]), sFeatures=SparseVector(3, {})),
 Row(id=2, features=DenseVector([20.0, 30000.0, 2.0]), sFeatures=DenseVector([0.5, 0.6667, 0.5]))]

In [None]:
# Original and scaled vectors side by side; show() cuts long cells off with "...".
sFeatures_df.select("features", "sFeatures").show()

+------------------+--------------------+
|          features|           sFeatures|
+------------------+--------------------+
|[10.0,10000.0,1.0]|           (3,[],[])|
|[20.0,30000.0,2.0]|[0.5,0.6666666666...|
|[30.0,40000.0,3.0]|       [1.0,1.0,1.0]|
+------------------+--------------------+



# Standardization

### StandardScaler - rescales each feature to mean = 0 and standard deviation = 1 (values are not bounded to -1..1, and the shape of the distribution is unchanged)

In [None]:
from pyspark.ml.feature import StandardScaler
from pyspark.ml.linalg import Vectors

In [None]:
# Input for the StandardScaler example: an integer id plus a three-element dense vector.
features_df = spark.createDataFrame(
    [
        (1, Vectors.dense([10.0, 10000.0, 1.0])),
        (2, Vectors.dense([20.0, 30000.0, 3.0])),
        (3, Vectors.dense([30.0, 40000.0, 3.0])),
    ],
    ["id", "features"],
)
# Display every row to verify the input.
features_df.take(3)

[Row(id=1, features=DenseVector([10.0, 10000.0, 1.0])),
 Row(id=2, features=DenseVector([20.0, 30000.0, 3.0])),
 Row(id=3, features=DenseVector([30.0, 40000.0, 3.0]))]

In [None]:
# Standardizer that reads "features" and writes "stand_features", with both the
# mean-centering (withMean) and the std-scaling (withStd) options switched on.
feature_stand_scaler = StandardScaler(
    inputCol="features",
    outputCol="stand_features",
    withStd=True,
    withMean=True,
)

In [None]:
# Fit the scaler on features_df; the returned model is what the next cell applies.
stand_model = feature_stand_scaler.fit(features_df)

In [None]:
# Apply the fitted model; the result carries the extra "stand_features" column.
st_features_df = stand_model.transform(features_df)

In [None]:
# Inspect all three rows; note that several standardized values fall outside [-1, 1].
st_features_df.take(3)

[Row(id=1, features=DenseVector([10.0, 10000.0, 1.0]), stand_features=DenseVector([-1.0, -1.0911, -1.1547])),
 Row(id=2, features=DenseVector([20.0, 30000.0, 3.0]), stand_features=DenseVector([0.0, 0.2182, 0.5774])),
 Row(id=3, features=DenseVector([30.0, 40000.0, 3.0]), stand_features=DenseVector([1.0, 0.8729, 0.5774]))]

In [None]:
# Tabular view of the same result; show() abbreviates the long vectors with "...".
st_features_df.show()

+---+------------------+--------------------+
| id|          features|      stand_features|
+---+------------------+--------------------+
|  1|[10.0,10000.0,1.0]|[-1.0,-1.09108945...|
|  2|[20.0,30000.0,3.0]|[0.0,0.2182178902...|
|  3|[30.0,40000.0,3.0]|[1.0,0.8728715609...|
+---+------------------+--------------------+



# Bucketize or Partition

### Absolute Binning of attributes

In [None]:
from pyspark.ml.feature import Bucketizer

In [None]:
# Bucket boundaries: five edges, open-ended at both extremes, giving four buckets.
splits = [
    float("-inf"),
    -10.0,
    0.0,
    10.0,
    float("inf"),
]

In [None]:
# Sample values to bucket, one single-element tuple per row.
# 103.4 belongs to the sample: the recorded binned_df.show() and bucketized_df.show()
# tables in this notebook both list it, so it has to be here for a fresh
# "Restart & Run All" to reproduce them.
unbinned_data = [(-800.0,), (-10.5,), (-1.7,), (0.0,), (8.2,), (90.1,), (103.4,)]

In [None]:
# Show the raw tuples before they are loaded into Spark.
unbinned_data

[(-800.0,), (-10.5,), (-1.7,), (0.0,), (8.2,), (90.1,)]

In [None]:
# Load the raw values into a one-column DataFrame whose column is named "features".
# Despite its name, binned_df holds the values *before* bucketing.
binned_df = spark.createDataFrame(unbinned_data, ["features"])
binned_df.show()

+--------+
|features|
+--------+
|  -800.0|
|   -10.5|
|    -1.7|
|     0.0|
|     8.2|
|    90.1|
|   103.4|
+--------+



In [None]:
# Re-display the bucket boundaries right before they are used.
splits

[-inf, -10.0, 0.0, 10.0, inf]

In [None]:
# Configure the bucketizer: it reads "features" and writes the bucket index to "bFeatures".
bucketizer = Bucketizer(
    splits=splits,
    inputCol="features",
    outputCol="bFeatures",
)

# Apply the transformation; the bucketizer is used directly, without a fit step.
bucketized_df = bucketizer.transform(binned_df)
bucketized_df.show()

+--------+---------+
|features|bFeatures|
+--------+---------+
|  -800.0|      0.0|
|   -10.5|      0.0|
|    -1.7|      1.0|
|     0.0|      2.0|
|     8.2|      2.0|
|    90.1|      3.0|
|   103.4|      3.0|
+--------+---------+



# Tokenize Text Data

In [None]:
from pyspark.ml.feature import Tokenizer

In [None]:
# Three short example sentences, each tagged with an integer id.
# The "sentance" spelling is kept on purpose: it is the column name later cells reference.
sentances_df = spark.createDataFrame(
    [
        (1, "This is an introduction to Spark MLlib"),
        (2, "MLlib includes libraries for classification and regression"),
        (3, "It also contains supporting tools for pipelines"),
    ],
    ["id", "sentance"],
)

sentances_df.take(3)

[Row(id=1, sentance='This is an introduction to Spark MLlib'),
 Row(id=2, sentance='MLlib includes libraries for classification and regression'),
 Row(id=3, sentance='It also contains supporting tools for pipelines')]

In [None]:
# Tokenizer that reads the "sentance" column and writes the token list to "words".
# As the outputs further down show, the tokens come out lower-cased ("This" -> "this").
sent_token = Tokenizer(inputCol="sentance", outputCol="words")

In [None]:
# Apply the tokenizer; it is used directly via transform(), without a fit step.
sent_tokenized_df = sent_token.transform(sentances_df)
# Bare expression: rendered as a table, presumably thanks to the
# spark.sql.repl.eagerEval.enabled setting made when the session was created.
sent_tokenized_df

id,sentance,words
1,This is an introd...,"[this, is, an, in..."
2,MLlib includes li...,"[mllib, includes,..."
3,It also contains ...,"[it, also, contai..."


In [None]:
# Same data via show(), which prints a plain-text table with long cells cut off by "...".
sent_tokenized_df.show()

+---+--------------------+--------------------+
| id|            sentance|               words|
+---+--------------------+--------------------+
|  1|This is an introd...|[this, is, an, in...|
|  2|MLlib includes li...|[mllib, includes,...|
|  3|It also contains ...|[it, also, contai...|
+---+--------------------+--------------------+



# Term Frequency - Inverse Document Frequency
### TF-IDF

In [None]:
from pyspark.ml.feature import HashingTF, IDF

In [None]:
# Reminder of the raw input: the first sentence as originally loaded.
sentances_df.take(1)

[Row(id=1, sentance='This is an introduction to Spark MLlib')]

In [None]:
# ...and its tokenized form; the "words" column is what HashingTF reads below.
sent_tokenized_df.take(1)

[Row(id=1, sentance='This is an introduction to Spark MLlib', words=['this', 'is', 'an', 'introduction', 'to', 'spark', 'mllib'])]

# Hashing TF

In [None]:
# Term-frequency hasher (despite the _df suffix this is a transformer, not a DataFrame).
# It reads "words" and writes a 20-slot vector to "raw_features"; with so few slots,
# different words can land in the same slot.
hashing_df = HashingTF(
    inputCol="words",
    outputCol="raw_features",
    numFeatures=20,
)

In [None]:
# Add the "raw_features" column of hashed term counts to the tokenized sentences.
sent_hashingTF = hashing_df.transform(sent_tokenized_df)

In [None]:
# Sentence 1 has seven distinct words but only six non-zero slots: two of its words
# share index 6, which therefore shows a count of 2.0.
sent_hashingTF.take(1)

[Row(id=1, sentance='This is an introduction to Spark MLlib', words=['this', 'is', 'an', 'introduction', 'to', 'spark', 'mllib'], raw_features=SparseVector(20, {6: 2.0, 8: 1.0, 9: 1.0, 10: 1.0, 13: 1.0, 15: 1.0}))]

In [None]:
# IDF estimator: reads the raw term counts from "raw_features" and writes the
# re-weighted vectors to "IDF_features".
idf = IDF(inputCol="raw_features", outputCol="IDF_features")

In [None]:
# Fit the IDF estimator on the hashed term-frequency vectors; the returned model is
# applied in the next cell.
idf_model = idf.fit(sent_hashingTF)

In [None]:
# Apply the fitted model; the result gains the "IDF_features" column.
tf_idf_df = idf_model.transform(sent_hashingTF)

In [None]:
# Compare raw_features with IDF_features for the first sentence.
tf_idf_df.take(1)

[Row(id=1, sentance='This is an introduction to Spark MLlib', words=['this', 'is', 'an', 'introduction', 'to', 'spark', 'mllib'], raw_features=SparseVector(20, {6: 2.0, 8: 1.0, 9: 1.0, 10: 1.0, 13: 1.0, 15: 1.0}), IDF_features=SparseVector(20, {6: 0.5754, 8: 0.2877, 9: 0.6931, 10: 0.6931, 13: 0.6931, 15: 0.2877}))]

# Refer to "Intro to Spark-Chap 5" to continue with the ML tasks
https://colab.research.google.com/drive/1zI9fV-i2besK1O2qfJ0Z24W6OSN5URW-?usp=sharing