## INSTALLATION

In [1]:
# install the OpenJDK 11 JDK on a Debian or Ubuntu-based system in a quiet and non-interactive manner
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
# download a file from a website
# !wget -nc -q https://dlcdn.apache.org/spark/spark-3.2.4/spark-3.2.4-bin-hadoop2.tgz
!wget -nc -q https://dlcdn.apache.org/spark/spark-3.2.4/spark-3.2.4-bin-hadoop2.7.tgz
# extract the contents of a file
!tar xf spark-3.2.4-bin-hadoop2.7.tgz

In [2]:
 # API for interacting with the Spark
!pip install pyspark==3.2.4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark==3.2.4
  Downloading pyspark-3.2.4.tar.gz (281.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.5/281.5 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.4-py2.py3-none-any.whl size=282040940 sha256=7d7d8154b2401aadc56fd14ee171d2f1c408af4bf34f4cb8959d62390357ce7d
  Stored in directory: /root/.cache/pip/wheels/e7/e3/c8/c358dac750f2b6a4b03328d10e05a5c69501664bd6504b6c3e
Successfully built pyspark
Installing collected packages: py4j

## Set Environment Variables


In [3]:
# explore files and directories of any locations in colab
!ls /content/spark-3.2.4-bin-hadoop2.7/

bin   data	jars	    LICENSE   NOTICE  R		 RELEASE  yarn
conf  examples	kubernetes  licenses  python  README.md  sbin


In [4]:
import os, sys
# set the environment variable to the locations where Spark and Java are installed.
os.environ["JAVA_HOME"] =  "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.4-bin-hadoop2.7"
# add PySpark libraries to the system path:
sys.path.append("/content/spark-3.2.4-bin-hadoop2.7/python")
sys.path.append("/content/spark-3.2.4-bin-hadoop2.7/python/lib/py4j-0.10.9.5-src.zip")

In [5]:
# import dependencies
import pyspark.sql.types as T
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.ml.feature import Binarizer, Bucketizer, OneHotEncoder, VectorAssembler, StringIndexer, MinMaxScalerModel, \
StandardScaler, Imputer, Tokenizer,StopWordsRemover, MinMaxScaler, PolynomialExpansion
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, ClusteringEvaluator, RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.mllib.evaluation import RankingMetrics
from random import randint
from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np; import pandas as pd
from pyspark.sql.functions import concat, lit



In [6]:
# spark = SparkSession.builder.appName("PROJECT").getOrCreate()
spark = SparkSession.builder \
    .master('local[*]') \
    .config("spark.driver.memory", "80g") \
    .appName("PROJECT") \
    .getOrCreate()
sc = spark.sparkContext 

In [7]:
spark

## DATA PREPARATION

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
path = "/content/drive/MyDrive/Project Bigdata/DATA/"

In [10]:
# path = '/Project Bigdata/DATA/'

In [11]:
# online uers of retail sport merchandise company
df = spark.read.csv(path +
    "select_item.csv", 
    inferSchema=True, 
    header=True
)

df = df.withColumn('article_id', concat(lit('0'), df['article_id']))
# df.limit(10).show(truncate=False)
df.show()


+----------+------+
|article_id|labels|
+----------+------+
|0751399004|Blazer|
|0560559001|Blazer|
|0768285002|Blazer|
|0783144005|Blazer|
|0719378001|Blazer|
|0598636001|Blazer|
|0783245003|Blazer|
|0837368001|Blazer|
|0768433001|Blazer|
|0636455009|Blazer|
|0611745008|Blazer|
|0501288005|Blazer|
|0711031001|Blazer|
|0568601008|Blazer|
|0718908001|Blazer|
|0630542002|Blazer|
|0593009001|Blazer|
|0749815001|Blazer|
|0746292003|Blazer|
|0892327002|Blazer|
+----------+------+
only showing top 20 rows



In [12]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

label_stringIdx = StringIndexer(inputCol = "labels", outputCol = "label")
pipeline = Pipeline(stages=[label_stringIdx])
# Fit the pipeline to training documents.
# pipelineFit = pipeline.fit(df3)
# df4 = df3
pipelineFit = pipeline.fit(df)
df = pipelineFit.transform(df)
df.show(5)

+----------+------+-----+
|article_id|labels|label|
+----------+------+-----+
|0751399004|Blazer|  0.0|
|0560559001|Blazer|  0.0|
|0768285002|Blazer|  0.0|
|0783144005|Blazer|  0.0|
|0719378001|Blazer|  0.0|
+----------+------+-----+
only showing top 5 rows



In [13]:
df.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  8.0| 1000|
|  0.0| 1000|
|  7.0| 1000|
|  1.0| 1000|
|  4.0| 1000|
| 11.0| 1000|
| 14.0| 1000|
|  3.0| 1000|
|  2.0| 1000|
| 17.0| 1000|
| 10.0| 1000|
| 13.0| 1000|
|  6.0| 1000|
|  5.0| 1000|
| 15.0| 1000|
|  9.0| 1000|
| 16.0| 1000|
| 12.0| 1000|
+-----+-----+



In [15]:
# image_array18000_86.npy = 2048 features
# image_array18000_128.npy = 8192 features
# image_array18000_224.npy = 51200 features

features = np.load(path + 'image_array18000_86.npy')[::100]
names = np.load(path + 'image_name.npy')[::100]

In [16]:
features.shape,names.shape

((180, 2048), (180,))

In [17]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType
from pyspark.ml.linalg import VectorUDT
from pyspark.ml.linalg import Vectors


names = [int(x) for x in names]
features = [Vectors.dense(features[i]) for i in range(len(features))]
schema = StructType([StructField("names", IntegerType()),StructField("features", VectorUDT(), True)])
 
data = [(names[i], features[i]) for i in range(len(names))]
df2 = spark.createDataFrame(data, schema)
df2 = df2.withColumn('names', concat(lit('0'), df2['names']))
df2.show()

+----------+--------------------+
|     names|            features|
+----------+--------------------+
|0751399004|[0.0,2.4363582134...|
|0547213004|[0.52874302864074...|
|0713995001|[3.34792375564575...|
|0783354006|[1.76661813259124...|
|0554704009|[0.0,0.0503984689...|
|0721270020|[3.37367630004882...|
|0708138026|[2.09837436676025...|
|0831450003|[2.11622977256774...|
|0589771003|[0.85632061958312...|
|0570232001|[3.13099694252014...|
|0688713001|[0.0,0.5182241201...|
|0680374003|[0.68154668807983...|
|0586917001|[3.56808948516845...|
|0583448002|[0.0,0.0,1.468342...|
|0665652003|[0.0,0.0,1.243653...|
|0788647004|[0.0,1.4536646604...|
|0566370003|[0.0,0.0,0.0,3.54...|
|0804681001|[0.53999042510986...|
|0653248003|[1.27588188648223...|
|0683572002|[0.0,0.0,0.0,0.0,...|
+----------+--------------------+
only showing top 20 rows



In [18]:
!python -V

Python 3.10.11


In [19]:
df3 = df.join(df2,df.article_id ==  df2.names,"inner")
df3 = df3.select(['article_id','labels','label','features'])
df3.show(5)

+----------+------+-----+--------------------+
|article_id|labels|label|            features|
+----------+------+-----+--------------------+
|0751399004|Blazer|  0.0|[0.0,2.4363582134...|
|0547213004|Blazer|  0.0|[0.52874302864074...|
|0713995001|Blazer|  0.0|[3.34792375564575...|
|0783354006|Blazer|  0.0|[1.76661813259124...|
|0554704009|Blazer|  0.0|[0.0,0.0503984689...|
+----------+------+-----+--------------------+
only showing top 5 rows



In [20]:
# df3.write.parquet(path + "/dataframe.parquet")
# df3.write.option("header", "true").option("delimiter", "|").option("encoding", "UTF-8").format("csv").save(path + "df2048.csv")


## TRAIN

In [21]:
model_df=df3.select(['features','label'])
# model_df = model_df.sample(0.01)
model_df.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.0,2.4363582134...|  0.0|
|[0.52874302864074...|  0.0|
|[3.34792375564575...|  0.0|
|[1.76661813259124...|  0.0|
|[0.0,0.0503984689...|  0.0|
+--------------------+-----+
only showing top 5 rows



In [22]:
train, test = model_df.randomSplit([0.7,0.3], 42)
# check whether the target class is balanced

# train.groupBy('label').count().show()
# test.groupBy('label').count().show()

## LOGISTIC REGRESSION

In [23]:
# model=LogisticRegression(featuresCol='features',labelCol='label').fit(train)

In [24]:
model = LogisticRegression(maxIter=10, regParam=0.05, elasticNetParam=0.3,featuresCol='features', labelCol="label")
p_model = model.fit(train)

In [25]:
df_test_LR = p_model.transform(test)
df_test_LR.show(5)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[0.0,0.0,0.0,0.33...|  6.0|[2.52669848289016...|[0.11851679461678...|       1.0|
|[0.0,0.0,0.0,1.96...|  5.0|[0.31194161738398...|[0.03996540758723...|       2.0|
|[0.0,0.0,0.0,3.54...|  1.0|[0.80997752628194...|[0.04682297613299...|       8.0|
|[0.0,0.0,0.111198...|  7.0|[0.14503239585676...|[0.03460197689516...|      15.0|
|[0.0,0.0,1.468342...|  1.0|[-0.1207713162773...|[0.03227861773708...|      10.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



In [26]:
# lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
# model = lr.fit(train)

In [27]:
evaluator_LR = MulticlassClassificationEvaluator(predictionCol="prediction")
print("LOGISTIC REGRESSION:")
print("accuracy: " , evaluator_LR.evaluate(df_test_LR, {evaluator_LR.metricName: "accuracy"}))
print("precision: " ,evaluator_LR.evaluate(df_test_LR, {evaluator_LR.metricName: "weightedPrecision"}))
print("recall: " ,evaluator_LR.evaluate(df_test_LR, {evaluator_LR.metricName: "weightedRecall"}))
print("f1: " ,evaluator_LR.evaluate(df_test_LR, {evaluator_LR.metricName: "f1"}))

LOGISTIC REGRESSION:
accuracy:  0.11290322580645161
precision:  0.19066820276497695
recall:  0.11290322580645161
f1:  0.12550388679420937


## RANDOM FOREST

In [None]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 4)
# Train model with Training Data
rfModel = rf.fit(train)
df_test_RF = rfModel.transform(test)
df_test_RF.filter(df_test_RF['prediction'] == 0).orderBy("probability", ascending=False).show(n = 10, truncate = 30)
#     .select("Descript","Category","probability","label","prediction") \
#     .orderBy("probability", ascending=False) \
#     .show(n = 10, truncate = 30)

+------------------------------+-----+------------------------------+------------------------------+----------+
|                      features|label|                 rawPrediction|                   probability|prediction|
+------------------------------+-----+------------------------------+------------------------------+----------+
|[0.0,0.0,0.0,0.930753946304...|  0.0|[38.7368923941231,3.3009181...|[0.38736892394123096,0.0330...|       0.0|
|[0.0402957946062088,1.28079...|  0.0|[37.64903825794322,3.456849...|[0.3764903825794322,0.03456...|       0.0|
|[0.0,0.0,0.0,0.0,0.18394246...|  0.0|[35.95637107783837,2.950651...|[0.3595637107783837,0.02950...|       0.0|
|[0.0,0.0,0.0,0.055857688188...|  0.0|[35.484450075686546,2.98907...|[0.3548445007568656,0.02989...|       0.0|
|[0.9166262149810791,0.02272...|  0.0|[34.484181039745366,3.58566...|[0.3448418103974537,0.03585...|       0.0|
|[0.0,1.4411989450454712,0.4...|  0.0|[33.82406094878889,3.243962...|[0.3382406094878889,0.03243...|    

In [None]:
evaluator_RF = MulticlassClassificationEvaluator(predictionCol="prediction")
print("RANDOM FOREST:")
print("accuracy: " ,evaluator_RF.evaluate(df_test_RF, {evaluator_RF.metricName: "accuracy"}))
# print("sensitivity: " , evaluator_RF.evaluate(df_test_RF, {evaluator_RF.metricName: "truePositiveRateByLabel"}))

RANDOM FOREST:
accuracy:  0.522077922077922


## EVALUATION

In [None]:
evaluator_LR = MulticlassClassificationEvaluator(predictionCol="prediction")
print("LOGISTIC REGRESSION:")
print("accuracy: " , evaluator_LR.evaluate(df_test_LR, {evaluator_LR.metricName: "accuracy"}))
# print("sensitivity: " , evaluator_LR.evaluate(df_test_LR, {evaluator_LR.metricName: "truePositiveRateByLabel"}))

LOGISTIC REGRESSION:
accuracy:  0.6868274582560296


In [None]:
evaluator_RF = MulticlassClassificationEvaluator(predictionCol="prediction")
print("RANDOM FOREST:")
print("accuracy: " ,evaluator_RF.evaluate(df_test_RF, {evaluator_RF.metricName: "accuracy"}))
# print("sensitivity: " , evaluator_RF.evaluate(df_test_RF, {evaluator_RF.metricName: "truePositiveRateByLabel"}))

RANDOM FOREST:
accuracy:  0.522077922077922


Save model

In [None]:
p_model.save("/content/drive/MyDrive/logistic_regression_model")

In [None]:
rfModel.save("/content/drive/MyDrive/random_forest_model")