To load the dataset through the Kaggle API, run the steps below.

In [None]:
! pip install -q kaggle

In [None]:
from google.colab import files

# Prompt for the kaggle.json credentials file.
# The result is assigned on purpose: a bare `files.upload()` as the last
# expression of the cell would echo the returned {filename: bytes} mapping
# (i.e. the API key itself) into the saved notebook output.
uploaded = files.upload()

In [None]:
! mkdir ~/.kaggle

! cp kaggle.json ~/.kaggle/

In [None]:
 # Make the credentials file readable and writable by its owner only.
 ! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the gpiosenka/100-bird-species dataset as a zip archive with the Kaggle CLI.
! kaggle datasets download -d gpiosenka/100-bird-species

Downloading 100-bird-species.zip to /content
100% 1.89G/1.89G [01:29<00:00, 24.2MB/s]
100% 1.89G/1.89G [01:29<00:00, 22.7MB/s]


In [None]:
!unzip 100-bird-species.zip

In [None]:
# `data` is first created by the later cell that reads the images with Spark,
# so on a fresh kernel (Restart & Run All) an unguarded call here stops the run
# with a NameError. Print the schema only when the DataFrame already exists.
if "data" in globals():
    data.printSchema()
else:
    print("`data` is not defined yet; run the cell that loads the images with Spark first.")


root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)
 |-- label: string (nullable = true)



In [None]:
#############################

Loading the Dataset from Google Drive

In [None]:
pip install pyspark
#installing Pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from pyspark.sql.functions import input_file_name, udf, col
# StringType lives in pyspark.sql.types; pyspark.sql.functions only exposes it
# incidentally, so importing it from there is not reliable across versions.
from pyspark.sql.types import StringType
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, PCA
from pyspark.ml.linalg import VectorUDT, Vectors
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# --- Spark initialisation ---------------------------------------------------
# Build the session through getOrCreate() with the configuration attached,
# instead of constructing SparkContext(conf=conf) directly: a second
# SparkContext() call in the same kernel raises, which made this cell fail
# whenever it was re-run. `sc` is still provided for code that expects it.
conf = SparkConf().setAppName("birdy").set("spark.driver.memory", "8g")
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext


# --- Load the images --------------------------------------------------------
# Directory whose sub-directories (one per class) contain the images.
# NOTE(review): assumes Google Drive is already mounted at /content/drive;
# no mount call is visible in this notebook - confirm.
path = "/content/drive/MyDrive/train"

# The label of an image is the name of the directory that contains it,
# i.e. the second-to-last component of the file path.
get_label = udf(lambda x: x.split('/')[-2], StringType())

# Read every image below `path` (recursively), attach its label and keep only
# the two columns used downstream.
data = (
    spark.read.format("image")
    .option("recursiveFileLookup", "true")
    .load(path)
    .withColumn("label", get_label(input_file_name()))
    .select("image", "label")
)

# Show the structure of the resulting DataFrame
data.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)
 |-- label: string (nullable = true)



In [None]:
# Preview the first 20 rows (long cell values are truncated).
data.show(n=20, truncate=True)

+--------------------+--------------------+
|               image|               label|
+--------------------+--------------------+
|{file:///content/...|ABYSSINIAN_GROUND...|
|{file:///content/...|ABYSSINIAN_GROUND...|
|{file:///content/...|AFRICAN_CROWNED_C...|
|{file:///content/...|       ABBOTTS_BOOBY|
|{file:///content/...|AFRICAN_CROWNED_C...|
|{file:///content/...|AFRICAN_CROWNED_C...|
|{file:///content/...|ABYSSINIAN_GROUND...|
|{file:///content/...|AFRICAN_CROWNED_C...|
|{file:///content/...|ABYSSINIAN_GROUND...|
|{file:///content/...|AFRICAN_CROWNED_C...|
|{file:///content/...|ABYSSINIAN_GROUND...|
|{file:///content/...|       ABBOTTS_BOOBY|
|{file:///content/...|AFRICAN_CROWNED_C...|
|{file:///content/...|AFRICAN_CROWNED_C...|
|{file:///content/...|       ABBOTTS_BOOBY|
|{file:///content/...|ABYSSINIAN_GROUND...|
|{file:///content/...|ABYSSINIAN_GROUND...|
|{file:///content/...|ABYSSINIAN_GROUND...|
|{file:///content/...|AFRICAN_CROWNED_C...|
|{file:///content/...|AFRICAN_CR

In [None]:
# Convert the string label column to a numeric index.
# Any `label_index` column left over from a previous run of this cell is
# dropped first (drop() ignores a missing column); otherwise transform() fails
# on re-run because its output column already exists.
unindexed = data.drop("label_index")
labelIndexer = StringIndexer(inputCol="label", outputCol="label_index").fit(unindexed)
data = labelIndexer.transform(unindexed)

In [None]:
# Second name for the indexed DataFrame (no copy is made).
data1 = data

In [None]:
# Keep the image struct and the numeric label index only.
kept_columns = ["image", "label_index"]
data2 = data1.select(*kept_columns)

In [None]:
# Rename label_index to label
data2 = data2.withColumnRenamed(existing="label_index", new="label")

In [None]:
# Preview the first 20 rows (long cell values are truncated).
data2.show(n=20, truncate=True)

+--------------------+-----+
|               image|label|
+--------------------+-----+
|{file:///content/...|  1.0|
|{file:///content/...|  1.0|
|{file:///content/...|  4.0|
|{file:///content/...|  0.0|
|{file:///content/...|  4.0|
|{file:///content/...|  4.0|
|{file:///content/...|  1.0|
|{file:///content/...|  4.0|
|{file:///content/...|  1.0|
|{file:///content/...|  4.0|
|{file:///content/...|  1.0|
|{file:///content/...|  0.0|
|{file:///content/...|  4.0|
|{file:///content/...|  4.0|
|{file:///content/...|  0.0|
|{file:///content/...|  1.0|
|{file:///content/...|  1.0|
|{file:///content/...|  1.0|
|{file:///content/...|  4.0|
|{file:///content/...|  4.0|
+--------------------+-----+
only showing top 20 rows



In [None]:
# Flatten the image struct: one top-level column per struct field, followed by the label.
image_fields = ["origin", "data", "height", "width", "mode", "nChannels"]
data3 = data2.select(*[f"image.{field}" for field in image_fields], "label")

In [None]:
# Preview the first 20 rows (long cell values are truncated).
data3.show(n=20, truncate=True)

+--------------------+--------------------+------+-----+----+---------+-----+
|              origin|                data|height|width|mode|nChannels|label|
+--------------------+--------------------+------+-----+----+---------+-----+
|file:///content/d...|[80 B9 BB 6A A1 A...|   224|  224|  16|        3|  1.0|
|file:///content/d...|[1E C6 B5 16 B8 A...|   224|  224|  16|        3|  1.0|
|file:///content/d...|[72 A8 9B 39 6B 5...|   224|  224|  16|        3|  4.0|
|file:///content/d...|[55 78 8C 3D 5B 6...|   224|  224|  16|        3|  0.0|
|file:///content/d...|[6B BC 8D 65 BB 8...|   224|  224|  16|        3|  4.0|
|file:///content/d...|[4B 8A 7A 19 5B 4...|   224|  224|  16|        3|  4.0|
|file:///content/d...|[3E 5A 5A 00 18 1...|   224|  224|  16|        3|  1.0|
|file:///content/d...|[30 B0 93 52 C9 B...|   224|  224|  16|        3|  4.0|
|file:///content/d...|[64 A3 AB 72 B0 B...|   224|  224|  16|        3|  1.0|
|file:///content/d...|[21 44 29 1D 44 2...|   224|  224|  16|   

In [None]:
def _bytes_to_dense(raw):
    """Turn the raw bytes of an image into a dense ML vector, one entry per byte."""
    return Vectors.dense(list(raw))


# UDF wrapper that converts the binary "data" column into a vector
binary_to_vector_udf = udf(_bytes_to_dense, VectorUDT())

# Add the vectors as a new "features" column
data6 = data3.withColumn("features", binary_to_vector_udf("data"))

In [None]:
# Preview the first 20 rows (long cell values are truncated).
data6.show(n=20, truncate=True)

+--------------------+--------------------+------+-----+----+---------+-----+--------------------+
|              origin|                data|height|width|mode|nChannels|label|            features|
+--------------------+--------------------+------+-----+----+---------+-----+--------------------+
|file:///content/d...|[80 B9 BB 6A A1 A...|   224|  224|  16|        3|  1.0|[128.0,185.0,187....|
|file:///content/d...|[1E C6 B5 16 B8 A...|   224|  224|  16|        3|  1.0|[30.0,198.0,181.0...|
|file:///content/d...|[72 A8 9B 39 6B 5...|   224|  224|  16|        3|  4.0|[114.0,168.0,155....|
|file:///content/d...|[55 78 8C 3D 5B 6...|   224|  224|  16|        3|  0.0|[85.0,120.0,140.0...|
|file:///content/d...|[6B BC 8D 65 BB 8...|   224|  224|  16|        3|  4.0|[107.0,188.0,141....|
|file:///content/d...|[4B 8A 7A 19 5B 4...|   224|  224|  16|        3|  4.0|[75.0,138.0,122.0...|
|file:///content/d...|[3E 5A 5A 00 18 1...|   224|  224|  16|        3|  1.0|[62.0,90.0,90.0,0...|
|file:///c

In [None]:
# Training needs only the feature vectors and their labels.
model_columns = ["features", "label"]
data7 = data6.select(*model_columns)

In [None]:
# 80/20 train/test split; the fixed seed keeps the split reproducible.
trainingData, testData = data7.randomSplit(weights=[0.8, 0.2], seed=42)

In [None]:
# Logistic regression on the "features"/"label" columns (the estimator's
# default column names, spelled out here), limited to 20 iterations.
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=20)

# Fit the model on the training split
model = lr.fit(trainingData)

In [None]:
# Apply the fitted model to the held-out test split.
predictions = model.transform(dataset=testData)

In [None]:
# Preview the first 20 rows (long cell values are truncated).
predictions.show(n=20, truncate=True)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[30.0,198.0,181.0...|  1.0|[-5.6046076044735...|[3.99438448725595...|       1.0|
|[47.0,96.0,94.0,1...|  1.0|[-3.4988137171916...|[7.33351468638078...|       1.0|
|[58.0,191.0,134.0...|  3.0|[-3.8939770817482...|[1.48794753564539...|       3.0|
|[85.0,120.0,140.0...|  0.0|[5.98623830748267...|[0.98318515424168...|       0.0|
|[108.0,192.0,163....|  1.0|[-3.5879717304167...|[6.64705276936777...|       1.0|
|[121.0,121.0,135....|  4.0|[1.02928862746903...|[0.16640416930962...|       4.0|
|[182.0,153.0,146....|  1.0|[-0.3442425176319...|[0.01795590517293...|       1.0|
|[0.0,32.0,37.0,2....|  0.0|[4.11652349272566...|[0.97149653472456...|       0.0|
|[39.0,52.0,38.0,3...|  4.0|[-1.4729411768151...|[4.29414691599960...|       4.0|
|[43.0,78.0,51.0

In [None]:
# Accuracy of the predictions: compare the "prediction" column with "label".
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy = {}".format(accuracy))

Accuracy = 0.9299363057324841


In [None]:
# Random Forest on the same data: standardise -> PCA (50 components) -> classify.
# NOTE(review): the scaler and the PCA are fitted on all of data7 before the
# train/test split, so test rows influence the preprocessing. Consider fitting
# both on the training split only.
scaler = StandardScaler(inputCol="features", outputCol="scaled_feature_vector")
scaled_df = scaler.fit(data7).transform(data7)

# Perform PCA on the scaled feature vector
pca = PCA(k=50, inputCol="scaled_feature_vector", outputCol="pca_features")
pca_model = pca.fit(scaled_df)
pca_df = pca_model.transform(scaled_df)

# Split the dataset into training and test sets.
# Seeded like the logistic-regression split so the result is reproducible.
(training_data, test_data) = pca_df.randomSplit([0.8, 0.2], seed=42)

# Define the classification model (seeded for reproducible trees)
classifier = RandomForestClassifier(
    labelCol="label",
    featuresCol="pca_features",
    numTrees=10,
    seed=42,
)

# Fit the model on the training data
model = classifier.fit(training_data)

# Make predictions on the test data
predictions = model.transform(test_data)

# Evaluate the model. The label column of this DataFrame is "label";
# there is no "labelIndex" column, so evaluating against it fails.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = {}".format(accuracy))