## INSTALLATION

In [None]:
# install the OpenJDK 11 JDK on a Debian or Ubuntu-based system in a quiet and non-interactive manner
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
# download a file from a website
# !wget -nc -q https://dlcdn.apache.org/spark/spark-3.2.4/spark-3.2.4-bin-hadoop2.tgz
!wget -nc -q https://dlcdn.apache.org/spark/spark-3.2.4/spark-3.2.4-bin-hadoop2.7.tgz
# extract the contents of a file
!tar xf spark-3.2.4-bin-hadoop2.7.tgz

In [None]:
 # API for interacting with the Spark
!pip install pyspark==3.2.4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Set Environment Variables


In [None]:
# List the unpacked Spark distribution to confirm the archive was extracted under /content.
!ls /content/spark-3.2.4-bin-hadoop2.7/

bin   data	jars	    LICENSE   NOTICE  R		 RELEASE  yarn
conf  examples	kubernetes  licenses  python  README.md  sbin


In [None]:
import os
import sys

# Tell PySpark where the JVM and the unpacked Spark distribution live.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.2.4-bin-hadoop2.7",
})
# Make Spark's bundled Python sources and its py4j bridge importable.
sys.path.extend([
    "/content/spark-3.2.4-bin-hadoop2.7/python",
    "/content/spark-3.2.4-bin-hadoop2.7/python/lib/py4j-0.10.9.5-src.zip",
])

In [None]:
# import dependencies
import pyspark.sql.types as T
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.ml.feature import Binarizer, Bucketizer, OneHotEncoder, VectorAssembler, StringIndexer, MinMaxScalerModel, \
StandardScaler, Imputer, Tokenizer,StopWordsRemover, MinMaxScaler, PolynomialExpansion
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, ClusteringEvaluator, RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.mllib.evaluation import RankingMetrics
from random import randint
from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np; import pandas as pd
from pyspark.sql.functions import concat, lit



In [None]:
# Start (or reuse) a local Spark session that uses every available core.
# NOTE(review): the driver heap is set to 80g — confirm the runtime actually has that much RAM.
spark = (
    SparkSession.builder
    .master('local[*]')
    .config("spark.driver.memory", "80g")
    .appName("PROJECT")
    .getOrCreate()
)
# Handle to the session's underlying SparkContext.
sc = spark.sparkContext

In [None]:
# Evaluate the session object as the cell's last expression so the notebook displays it.
spark

## DATA PREPARATION

In [None]:
# Mount Google Drive at /content/drive so the project files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Drive folder holding the input files; it is concatenated directly with file names,
# so the trailing '/' is required.
path = "/content/drive/MyDrive/Project Bigdata/DATA/"

In [None]:
# path = '/Project Bigdata/DATA/'

In [None]:
# Load the article-id -> category-label table for the selected items.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(path + "select_item.csv")
)

# Prefix every id with '0' so it becomes the 10-character string shown in the output
# (presumably schema inference reads the id as a number and loses the leading zero).
df = df.withColumn('article_id', concat(lit('0'), df['article_id']))
df.show()


+----------+------+
|article_id|labels|
+----------+------+
|0751399004|Blazer|
|0560559001|Blazer|
|0768285002|Blazer|
|0783144005|Blazer|
|0719378001|Blazer|
|0598636001|Blazer|
|0783245003|Blazer|
|0837368001|Blazer|
|0768433001|Blazer|
|0636455009|Blazer|
|0611745008|Blazer|
|0501288005|Blazer|
|0711031001|Blazer|
|0568601008|Blazer|
|0718908001|Blazer|
|0630542002|Blazer|
|0593009001|Blazer|
|0749815001|Blazer|
|0746292003|Blazer|
|0892327002|Blazer|
+----------+------+
only showing top 20 rows



In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Encode the string category in `labels` as a numeric `label` column.
label_stringIdx = StringIndexer(inputCol="labels", outputCol="label")
pipeline = Pipeline(stages=[label_stringIdx])

# StringIndexer refuses to run when its output column already exists, so drop any
# `label` column left by a previous run of this cell. drop() is a no-op on the first
# run, which keeps the cell safe to re-execute.
df_unindexed = df.drop("label")
pipelineFit = pipeline.fit(df_unindexed)
df = pipelineFit.transform(df_unindexed)
df.show(5)

+----------+------+-----+
|article_id|labels|label|
+----------+------+-----+
|0751399004|Blazer|  0.0|
|0560559001|Blazer|  0.0|
|0768285002|Blazer|  0.0|
|0783144005|Blazer|  0.0|
|0719378001|Blazer|  0.0|
+----------+------+-----+
only showing top 5 rows



In [None]:
# Sanity check: number of rows per encoded class.
(
    df
    .groupBy('label')
    .count()
    .show()
)

+-----+-----+
|label|count|
+-----+-----+
|  8.0| 1000|
|  0.0| 1000|
|  7.0| 1000|
|  1.0| 1000|
|  4.0| 1000|
| 11.0| 1000|
| 14.0| 1000|
|  3.0| 1000|
|  2.0| 1000|
| 17.0| 1000|
| 10.0| 1000|
| 13.0| 1000|
|  6.0| 1000|
|  5.0| 1000|
| 15.0| 1000|
|  9.0| 1000|
| 16.0| 1000|
| 12.0| 1000|
+-----+-----+



In [None]:
# Available pre-extracted image feature files and the feature count of each:
# image_array18000_86.npy = 2048 features
# image_array18000_128.npy = 8192 features
# image_array18000_224.npy = 51200 features

# The same [::2] slice (every second entry) is applied to both arrays so rows and names stay aligned.
# NOTE(review): the feature file is read from 'ProjectBigData', a different Drive folder than `path` — confirm this is intended.
features = np.load('/content/drive/MyDrive/ProjectBigData/image_array18000_224.npy')[::2]
names = np.load(path + 'image_name.npy')[::2]

In [None]:
# Display both shapes to confirm there is one name per feature row.
features.shape,names.shape

((9000, 51200), (9000,))

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType
from pyspark.ml.linalg import VectorUDT
from pyspark.ml.linalg import Vectors


# Convert the loaded arrays into values Spark can store: plain ints and dense ML vectors.
names = [int(raw_name) for raw_name in names]
features = [Vectors.dense(row) for row in features]

schema = StructType([
    StructField("names", IntegerType()),
    StructField("features", VectorUDT(), True),
])

# One (id, feature-vector) row per image name.
data = [(name, features[idx]) for idx, name in enumerate(names)]
df2 = spark.createDataFrame(data, schema)
# Prefix the integer id with '0' so it has the same string form as article_id in the label table.
df2 = df2.withColumn('names', concat(lit('0'), df2['names']))
df2.show()

+----------+--------------------+
|     names|            features|
+----------+--------------------+
|0751399004|[0.0,0.6520429849...|
|0768285002|[0.0,0.0499669909...|
|0719378001|[0.0,0.2787274122...|
|0783245003|[0.48316261172294...|
|0768433001|[0.0,0.6859197616...|
|0611745008|[0.01672206819057...|
|0711031001|[0.0,0.7223180532...|
|0718908001|[0.34424966573715...|
|0593009001|[0.0,0.0,0.0,0.0,...|
|0746292003|[0.0,0.0,0.0,0.09...|
|0504413001|[0.0,0.0,0.0,0.0,...|
|0497369001|[0.0,0.0,0.113766...|
|0747152002|[0.0,1.4898970127...|
|0813410001|[0.37753048539161...|
|0615176001|[0.0,0.1304044425...|
|0728156022|[0.71566152572631...|
|0724699001|[0.0,0.1426048278...|
|0588251002|[0.07893636822700...|
|0931981001|[0.0,1.4464067220...|
|0740307001|[0.0,2.0978810787...|
+----------+--------------------+
only showing top 20 rows



In [None]:
# Print the version of the `python` executable on the runtime's PATH.
!python -V

In [None]:
# Attach each image's feature vector to its label row by matching the id columns,
# keeping only ids present in both tables.
df3 = (
    df.join(df2, on=df["article_id"] == df2["names"], how="inner")
    .select('article_id', 'labels', 'label', 'features')
)
df3.show(5)

+----------+------+-----+--------------------+
|article_id|labels|label|            features|
+----------+------+-----+--------------------+
|0751399004|Blazer|  0.0|[0.0,0.6520429849...|
|0768285002|Blazer|  0.0|[0.0,0.0499669909...|
|0719378001|Blazer|  0.0|[0.0,0.2787274122...|
|0783245003|Blazer|  0.0|[0.48316261172294...|
|0768433001|Blazer|  0.0|[0.0,0.6859197616...|
+----------+------+-----+--------------------+
only showing top 5 rows



In [None]:
# df3.write.parquet(path + "/dataframe.parquet")
# df3.write.option("header", "true").option("delimiter", "|").option("encoding", "UTF-8").format("csv").save(path + "df2048.csv")


## TRAIN

In [None]:
# Keep only the two columns the estimators consume.
# (Append .sample(0.01) here for a quick run on a small subset.)
model_df = df3.select('features', 'label')
model_df.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.0,0.6520429849...|  0.0|
|[0.0,0.0499669909...|  0.0|
|[0.0,0.2787274122...|  0.0|
|[0.48316261172294...|  0.0|
|[0.0,0.6859197616...|  0.0|
+--------------------+-----+
only showing top 5 rows



In [None]:
# 70/30 train/test split with a fixed seed.
train, test = model_df.randomSplit(weights=[0.7, 0.3], seed=42)

# Uncomment to check that the classes stay balanced in each split:
# train.groupBy('label').count().show()
# test.groupBy('label').count().show()

## LOGISTIC REGRESSION

In [None]:
# model=LogisticRegression(featuresCol='features',labelCol='label').fit(train)

In [None]:
# Baseline logistic regression with elastic-net regularisation, fitted on the training split.
model = LogisticRegression(
    featuresCol='features',
    labelCol="label",
    maxIter=10,
    regParam=0.05,
    elasticNetParam=0.3,
)
p_model = model.fit(train)

In [None]:
# Score the held-out split with the baseline model; the result carries the
# rawPrediction, probability and prediction columns next to features and label.
df_test_LR = p_model.transform(test)
df_test_LR.show(5)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[0.0,0.0,0.0,0.0,...|  0.0|[5.29844103172469...|[0.72604514099544...|       0.0|
|[0.0,0.0,0.0,0.0,...|  0.0|[3.48746410438500...|[0.30623947866569...|       0.0|
|[0.0,0.0,0.0,0.0,...|  0.0|[2.15112915072513...|[0.06335698131001...|       6.0|
|[0.0,0.0,0.0,0.0,...|  1.0|[1.60277427780682...|[0.04667803743692...|      10.0|
|[0.0,0.0,0.0,0.0,...|  0.0|[4.62725043670029...|[0.60462307083381...|       0.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



In [None]:
# lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
# model = lr.fit(train)

In [None]:
# Report the standard multiclass metrics for the baseline logistic regression.
evaluator_LR = MulticlassClassificationEvaluator(predictionCol="prediction")
print("LOGISTIC REGRESSION:")
for heading, metric in (
    ("accuracy: ", "accuracy"),
    ("precision: ", "weightedPrecision"),
    ("recall: ", "weightedRecall"),
    ("f1: ", "f1"),
):
    # The metric is overridden per call, so one evaluator serves all four.
    print(heading, evaluator_LR.evaluate(df_test_LR, {evaluator_LR.metricName: metric}))

LOGISTIC REGRESSION:
accuracy:  0.6868274582560296
precision:  0.6877715344630188
recall:  0.6868274582560295
f1:  0.6784804754597942


### Hyper-Parameter Tuning

In [None]:


# Base estimator for the search; regParam and elasticNetParam are supplied by the grid.
lr = LogisticRegression(maxIter=100)

# TrainValidationSplit fits one model per combination in this grid (4 x 4 = 16 fits)
# and keeps the one the evaluator scores best. Values are written as floats because
# both parameters are float-typed.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.1, 0.05, 0.01, 0.001])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 0.75, 1.0])
    .build()
)

# The estimator being tuned is the logistic regression defined above.
# A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
tvs = TrainValidationSplit(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator_LR,
    # 80% of the training data is used for fitting, 20% for validation.
    trainRatio=0.8,
    # Fixed seed so the internal train/validation split, and therefore the chosen
    # parameters, are reproducible across runs.
    seed=42,
)

# Run the search; `model` becomes the fitted TrainValidationSplitModel wrapping the best candidate.
model = tvs.fit(train)

# Preview predictions of the best model on the held-out test split.
model.transform(test)\
    .select("features", "label", "prediction")\
    .show()

In [None]:
# Keep the tuned model's predictions on the held-out split for the metric report.
df_test_tune_LR = model.transform(test)


In [None]:
# Same four metrics as the baseline, computed on the tuned model's predictions.
print("TUNED LOGISTIC REGRESSION:")
for heading, metric in (
    ("accuracy: ", "accuracy"),
    ("precision: ", "weightedPrecision"),
    ("recall: ", "weightedRecall"),
    ("f1: ", "f1"),
):
    print(heading, evaluator_LR.evaluate(df_test_tune_LR, {evaluator_LR.metricName: metric}))

## RANDOM FOREST

In [None]:
from pyspark.ml.classification import RandomForestClassifier
# Random-forest baseline: 100 shallow trees (depth 4) with 4 bins per feature.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=100,
    maxDepth=4,
    maxBins=4,
)
# Fit on the training split, then score the held-out split.
rfModel = rf.fit(train)
df_test_RF = rfModel.transform(test)
# Show ten rows predicted as class 0, sorted by the probability column in descending order.
(
    df_test_RF
    .filter(df_test_RF['prediction'] == 0)
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30)
)

In [None]:
# Report the standard multiclass metrics for the baseline random forest.
evaluator_RF = MulticlassClassificationEvaluator(predictionCol="prediction")
print("RANDOM FOREST:")
for heading, metric in (
    ("accuracy: ", "accuracy"),
    ("precision: ", "weightedPrecision"),
    ("recall: ", "weightedRecall"),
    ("f1: ", "f1"),
):
    # The metric is overridden per call, so one evaluator serves all four.
    print(heading, evaluator_RF.evaluate(df_test_RF, {evaluator_RF.metricName: metric}))

### Hyper-Parameter Tuning

In [None]:


# Base estimator for the search; it relies on the default `features` / `label` column names.
rf = RandomForestClassifier()

# Candidate values, written as explicit step-based ranges.
# np.linspace(start, stop, num=k) returns k evenly spaced samples — `num` is not a step —
# so linspace(100, 500, num=100) would mean 100 different forest sizes, linspace(2, 10, num=2)
# only depths 2 and 10, and linspace(1, 3, num=1) only the value 1.
# NOTE(review): steps of 100 / 2 / 1 are the apparent intent (5 x 5 x 3 = 75 fits) — confirm the desired grid.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, list(range(100, 501, 100)))      # 100, 200, ..., 500
    .addGrid(rf.maxDepth, list(range(2, 11, 2)))           # 2, 4, ..., 10
    .addGrid(rf.minInstancesPerNode, list(range(1, 4)))    # 1, 2, 3
    .build()
)

# The estimator being tuned is the random forest defined above.
# A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
rf_tvs = TrainValidationSplit(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator_RF,
    # 80% of the training data is used for fitting, 20% for validation.
    trainRatio=0.8,
    # Fixed seed so the internal train/validation split is reproducible across runs.
    seed=42,
)

# Run the search; rf_tune_model wraps the best-scoring forest.
rf_tune_model = rf_tvs.fit(train)

# Preview predictions of the best model on the held-out test split.
rf_tune_model.transform(test)\
    .select("features", "label", "prediction")\
    .show()

In [None]:
# Score the held-out split with the tuned forest and report the same four metrics.
df_test_tune_RF = rf_tune_model.transform(test)
print("TUNED RANDOM FOREST:")
for heading, metric in (
    ("accuracy: ", "accuracy"),
    ("precision: ", "weightedPrecision"),
    ("recall: ", "weightedRecall"),
    ("f1: ", "f1"),
):
    print(heading, evaluator_RF.evaluate(df_test_tune_RF, {evaluator_RF.metricName: metric}))

## EVALUATION

In [None]:
# Headline accuracy of the baseline logistic regression on the held-out split.
evaluator_LR = MulticlassClassificationEvaluator(predictionCol="prediction")
print("LOGISTIC REGRESSION:")
lr_accuracy = evaluator_LR.evaluate(df_test_LR, {evaluator_LR.metricName: "accuracy"})
print("accuracy: ", lr_accuracy)

In [None]:
# Headline accuracy of the baseline random forest on the held-out split.
evaluator_RF = MulticlassClassificationEvaluator(predictionCol="prediction")
print("RANDOM FOREST:")
rf_accuracy = evaluator_RF.evaluate(df_test_RF, {evaluator_RF.metricName: "accuracy"})
print("accuracy: ", rf_accuracy)

## SAVE MODEL

In [None]:
# Persist the baseline logistic regression to Drive. A plain save() raises if the target
# directory already exists, so overwrite explicitly to keep the notebook re-runnable.
p_model.write().overwrite().save("/content/drive/MyDrive/logistic_regression_model")

In [None]:
# Persist the baseline random forest to Drive. A plain save() raises if the target
# directory already exists, so overwrite explicitly to keep the notebook re-runnable.
rfModel.write().overwrite().save("/content/drive/MyDrive/random_forest_model")