In [50]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://mirrors.sonic.net/apache/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar xzf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q findspark


import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"


import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [41]:
# Read the dog-food dataset; the first row holds the column names and the
# column types are inferred from the data.
df = spark.read.csv(
    '/content/drive/MyDrive/dog_food.csv',
    header=True,
    inferSchema=True,
)

In [42]:
# Quick look at the column names and the types inferred from the CSV.
df.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [43]:
# Summary statistics per column. In the recorded output every column reports
# count = 490, i.e. there are no missing values.
df.describe().show()

+-------+------------------+------------------+------------------+------------------+-------------------+
|summary|                 A|                 B|                 C|                 D|            Spoiled|
+-------+------------------+------------------+------------------+------------------+-------------------+
|  count|               490|               490|               490|               490|                490|
|   mean|  5.53469387755102| 5.504081632653061| 9.126530612244897| 5.579591836734694| 0.2857142857142857|
| stddev|2.9515204234399057|2.8537966089662063|2.0555451971054275|2.8548369309982857|0.45221563164613465|
|    min|                 1|                 1|               5.0|                 1|                0.0|
|    max|                10|                10|              14.0|                10|                1.0|
+-------+------------------+------------------+------------------+------------------+-------------------+



In [44]:
# Column names: the four ingredient columns (A-D) plus the 'Spoiled' label.
df.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [53]:
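#We do not need to split the data into train and test sets: the goal is not to predict on new data but to find out which ingredient (A, B, C or D) is causing the spoilage, so the models are fit on the whole dataset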

In [52]:
from pyspark.ml.feature import VectorAssembler #creating the features

In [21]:
# Pack the four ingredient columns into the single vector column ('features')
# that the Spark ML classifiers consume.
assembler = VectorAssembler(outputCol='features', inputCols=['A', 'B', 'C', 'D'])

In [22]:
# Build the modelling frame: assembled feature vector plus the label column.
newdf = (
    assembler
    .transform(df)
    .select(['features', 'Spoiled'])
)

In [23]:
# Confirm the modelling frame holds only the feature vector and the label.
newdf.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Spoiled: double (nullable = true)



In [24]:
# Class balance check: number of spoiled samples (label 1).
is_spoiled = newdf.Spoiled == 1
newdf.where(is_spoiled).count()

140

In [55]:
# Class balance check: number of non-spoiled samples (label 0).
is_not_spoiled = newdf.Spoiled == 0
newdf.where(is_not_spoiled).count()

350

In [27]:
from pyspark.ml.classification import (RandomForestClassifier, GBTClassifier,
                                       DecisionTreeClassifier)

In [28]:
# Three tree-based classifiers, all predicting the 'Spoiled' label from the
# default features column ('features', the vector built by the assembler).
# They are used here only for their feature importances, not for prediction.
# An explicit seed keeps the fitted trees -- and therefore the importances --
# repeatable; without it the results may differ between kernel sessions.
dtc = DecisionTreeClassifier(labelCol='Spoiled', seed=42)
rfc = RandomForestClassifier(labelCol='Spoiled', numTrees=100, seed=42)
gbt = GBTClassifier(labelCol='Spoiled', seed=42)

In [29]:
# Fit every classifier on the full dataset, in the same order as before
# (decision tree, random forest, gradient-boosted trees).
dtc_model, rfc_model, gbt_model = [
    estimator.fit(newdf) for estimator in (dtc, rfc, gbt)
]

In [56]:
#in all three models, C (index 2 of the feature vector) plays by far the most important role in the spoilage ↓

In [30]:
# Decision-tree feature importances. Index i refers to the i-th assembler
# input column: 0 = A, 1 = B, 2 = C, 3 = D.
dtc_model.featureImportances

SparseVector(4, {1: 0.0019, 2: 0.9832, 3: 0.0149})

In [31]:
# Random-forest feature importances. Index i refers to the i-th assembler
# input column: 0 = A, 1 = B, 2 = C, 3 = D.
rfc_model.featureImportances

SparseVector(4, {0: 0.019, 1: 0.0213, 2: 0.9381, 3: 0.0216})

In [32]:
# Gradient-boosted-trees feature importances. Index i refers to the i-th
# assembler input column: 0 = A, 1 = B, 2 = C, 3 = D.
gbt_model.featureImportances

SparseVector(4, {0: 0.0296, 1: 0.0383, 2: 0.8286, 3: 0.1034})

In [34]:
# Spoiled samples only, restricted to chemical C and the label.
spoiled_df = (
    df
    .where(df.Spoiled == 1)
    .select(['C', 'Spoiled'])
)

In [35]:
# Non-spoiled samples only, restricted to chemical C and the label.
nonspoiled_df = (
    df
    .where(df.Spoiled == 0)
    .select(['C', 'Spoiled'])
)

In [36]:
# Distribution of chemical C among the spoiled samples.
spoiled_df.describe().show()

+-------+------------------+-------+
|summary|                 C|Spoiled|
+-------+------------------+-------+
|  count|               140|    140|
|   mean|11.914285714285715|    1.0|
| stddev|0.9706907300060253|    0.0|
|    min|               9.0|    1.0|
|    max|              14.0|    1.0|
+-------+------------------+-------+



In [37]:
# Distribution of chemical C among the non-spoiled samples, for comparison.
nonspoiled_df.describe().show()

+-------+-----------------+-------+
|summary|                C|Spoiled|
+-------+-----------------+-------+
|  count|              350|    350|
|   mean| 8.01142857142857|    0.0|
| stddev|1.086455140730764|    0.0|
|    min|              5.0|    0.0|
|    max|             11.0|    0.0|
+-------+-----------------+-------+



In [None]:
#the mean amount of C in the spoiled samples is almost 4 units higher than in the non-spoiled ones (11.91 vs 8.01)

In [None]:
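#TO CONCLUDE -> chemical C is the one causing the spoilage: we should reduce it and keep its amount below 9, which is the lowest value at which spoiled cases have been observed (spoiled samples range from 9 to 14, non-spoiled ones from 5 to 11)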