In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=9689dc2f8f67fb933273beb6b3f8315147157ad5e11576a54e58452794399302
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [2]:
from pyspark.sql import SparkSession

In [3]:
# Start a Spark session named "DogFood", or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("DogFood")
    .getOrCreate()
)

In [6]:
# Load the dog-food CSV; the first row holds column names and column types are inferred.
data = spark.read.csv(
    "/content/sample_data/dog_food.csv",
    header=True,
    inferSchema=True,
)

In [7]:
# Preview the first 20 rows (the default row count, stated explicitly).
data.show(n=20)

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
| 10|  3|13.0|  9|    1.0|
|  8|  5|14.0|  5|    1.0|
|  5|  8|12.0|  8|    1.0|
|  6|  5|12.0|  9|    1.0|
|  3|  3|12.0|  1|    1.0|
|  9|  8|11.0|  3|    1.0|
|  1| 10|12.0|  3|    1.0|
|  1|  5|13.0| 10|    1.0|
|  2| 10|12.0|  6|    1.0|
|  1| 10|11.0|  4|    1.0|
|  5|  3|12.0|  2|    1.0|
|  4|  9|11.0|  8|    1.0|
|  5|  1|11.0|  1|    1.0|
|  4|  9|12.0| 10|    1.0|
|  5|  8|10.0|  9|    1.0|
+---+---+----+---+-------+
only showing top 20 rows



In [8]:
# Fetch the first row as a one-element list of Row objects.
data.take(1)

[Row(A=4, B=2, C=12.0, D=3, Spoiled=1.0)]

In [9]:
from pyspark.ml.feature import VectorAssembler

In [10]:
# List the column names so we can pick the feature columns for the assembler.
data.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [11]:
# Combine the four chemical columns into a single vector column called 'features'.
assembler = VectorAssembler(
    inputCols=['A', 'B', 'C', 'D'],
    outputCol='features',
)

In [14]:
# Apply the assembler: returns the original columns plus the new 'features' vector column.
output = assembler.transform(data)

In [15]:
# Check that the 'features' vector column was added next to the original columns.
output.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)
 |-- features: vector (nullable = true)



In [16]:
from pyspark.ml.classification import RandomForestClassifier

In [17]:
# Random forests are stochastic (bootstrap sampling and random feature subsets),
# so fix the seed to keep the fitted model and its feature importances
# reproducible across kernel restarts.
rfc = RandomForestClassifier(
    labelCol='Spoiled',
    featuresCol='features',
    seed=42,
)

In [23]:
# Keep only the two columns the classifier needs: the feature vector and the label.
final_data = output.select(['features', 'Spoiled'])

In [27]:
# Confirm the training frame holds just the feature vector and the label.
final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Spoiled: double (nullable = true)



In [28]:
# Fit the random forest on the full dataset; the goal here is to inspect
# feature importances, not to evaluate predictions, so there is no train/test split.
rfc_model = rfc.fit(final_data)

In [29]:
# Importance score per feature; vector index i corresponds to the i-th
# assembler input column (0 -> A, 1 -> B, 2 -> C, 3 -> D).
rfc_model.featureImportances

SparseVector(4, {0: 0.0198, 1: 0.0193, 2: 0.9399, 3: 0.021})

In [30]:
# Spoiled batches only (Spoiled == 1), reduced to chemical C and the label.
spoiled_data = (
    data
    .where(data['Spoiled'] == 1)
    .select('C', 'Spoiled')
)

In [31]:
# Summary statistics (count, mean, stddev, min, max) of chemical C for spoiled batches.
spoiled_data.describe().show()

+-------+------------------+-------+
|summary|                 C|Spoiled|
+-------+------------------+-------+
|  count|               140|    140|
|   mean|11.914285714285715|    1.0|
| stddev|0.9706907300060253|    0.0|
|    min|               9.0|    1.0|
|    max|              14.0|    1.0|
+-------+------------------+-------+



In [34]:
# Non-spoiled batches only (Spoiled == 0), reduced to chemical C and the label.
non_spoiled_data = (
    data
    .where(data['Spoiled'] == 0)
    .select('C', 'Spoiled')
)

In [35]:
# Summary statistics (count, mean, stddev, min, max) of chemical C for non-spoiled batches.
non_spoiled_data.describe().show()

+-------+-----------------+-------+
|summary|                C|Spoiled|
+-------+-----------------+-------+
|  count|              350|    350|
|   mean| 8.01142857142857|    0.0|
| stddev|1.086455140730764|    0.0|
|    min|              5.0|    0.0|
|    max|             11.0|    0.0|
+-------+-----------------+-------+



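- Spoiled batches have a mean chemical C level of about 11.9 (observed range 9–14).
- Non-spoiled batches have a mean chemical C level of about 8.0 (observed range 5–11).
- Chemical C also accounts for roughly 94% of the random forest's feature importance, so it looks like we need to reduce chemical C to <= 9! Note that the two groups overlap between 9 and 11, so this threshold is a rough guide rather than a clean cut-off.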