In [1]:
import numpy as np
import pandas as pd


In [3]:
crop_data = pd.read_csv('crop_recommendation.csv')
crop_data.head()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


In [6]:
crop_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            2200 non-null   int64  
 1   P            2200 non-null   int64  
 2   K            2200 non-null   int64  
 3   temperature  2200 non-null   float64
 4   humidity     2200 non-null   float64
 5   ph           2200 non-null   float64
 6   rainfall     2200 non-null   float64
 7   label        2200 non-null   object 
dtypes: float64(4), int64(3), object(1)
memory usage: 137.6+ KB


In [5]:
crop_data.isnull().sum()

N              0
P              0
K              0
temperature    0
humidity       0
ph             0
rainfall       0
label          0
dtype: int64

In [7]:
crop_datax = crop_data.drop('label', axis = 1)
y = crop_data['label']

In [8]:
y


0         rice
1         rice
2         rice
3         rice
4         rice
         ...  
2195    coffee
2196    coffee
2197    coffee
2198    coffee
2199    coffee
Name: label, Length: 2200, dtype: object

In [52]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [53]:
# Encoding the 'label' column to integers
label_mapping = {label: index for index, label in enumerate(crop_data['label'].unique())}
inverse_label_mapping = {index: label for label, index in label_mapping.items()}


In [54]:
# Convert labels to integer indices
y = crop_data['label'].map(label_mapping).values

In [55]:
# Features (X) - Assuming all columns except 'label' are features
X = crop_data.drop('label', axis=1).values


In [56]:
# Function to perform bootstrap sampling
def bootstrap_sample(X, y):
    indices = np.random.choice(len(X), size=len(X), replace=True)
    return X[indices], y[indices]



In [57]:
# Function to split data for decision tree
def split_data(X, y, feature_index, threshold):
    left_mask = X[:, feature_index] < threshold
    right_mask = ~left_mask
    return X[left_mask], X[right_mask], y[left_mask], y[right_mask]


In [58]:
# Decision Tree Node definition
class TreeNode:
    def __init__(self, predicted_class=None, feature_index=None, threshold=None, left=None, right=None):
        self.predicted_class = predicted_class
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right



In [59]:
# Function to calculate Gini impurity
def calculate_gini(y):
    if len(y) == 0:
        return 0
    y = y.astype(int)
    counts = np.bincount(y)
    probabilities = counts / len(y)
    gini = 1 - np.sum(probabilities ** 2)
    return gini

In [60]:
# Decision Tree construction
def build_tree(X, y, max_depth):
    if max_depth <= 0 or len(np.unique(y)) == 1:
        return TreeNode(predicted_class=np.bincount(y).argmax())

    num_samples, num_features = X.shape
    best_gini = float('inf')
    best_criteria = None

    for feature_index in range(num_features):
        for threshold in np.unique(X[:, feature_index]):
            X_left, X_right, y_left, y_right = split_data(X, y, feature_index, threshold)
            if len(y_left) > 0 and len(y_right) > 0:
                gini = (len(y_left) / len(y)) * calculate_gini(y_left) + (len(y_right) / len(y)) * calculate_gini(y_right)
                if gini < best_gini:
                    best_gini = gini
                    best_criteria = (feature_index, threshold)
                    best_sets = (X_left, X_right, y_left, y_right)

    if best_criteria is None:
        return TreeNode(predicted_class=np.bincount(y).argmax())

    X_left, X_right, y_left, y_right = best_sets
    left_tree = build_tree(X_left, y_left, max_depth - 1)
    right_tree = build_tree(X_right, y_right, max_depth - 1)
    return TreeNode(predicted_class=None, feature_index=best_criteria[0], threshold=best_criteria[1], left=left_tree, right=right_tree)


In [61]:
# Random Forest implementation
class RandomForest:
    def __init__(self, n_trees=100, max_depth=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.trees = []

    def fit(self, X, y):
        for _ in range(self.n_trees):
            X_sample, y_sample = bootstrap_sample(X, y)
            tree = build_tree(X_sample, y_sample, max_depth=self.max_depth)
            self.trees.append(tree)

    def predict(self, X):
        predictions = np.zeros((len(X), len(self.trees)))
        for i, tree in enumerate(self.trees):
            predictions[:, i] = [self._predict_tree(x, tree) for x in X]
        return np.round(np.mean(predictions, axis=1)).astype(int)

    def _predict_tree(self, x, tree):
        if tree.predicted_class is not None:
            return tree.predicted_class
        else:
            feature_index = tree.feature_index
            if feature_index >= len(x):
                raise IndexError(f"Tree tried to split on feature index {feature_index}, which is out of bounds.")
            
            if x[feature_index] < tree.threshold:
                return self._predict_tree(x, tree.left)
            else:
                return self._predict_tree(x, tree.right)

In [62]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [63]:
# Initialize the Random Forest with specified number of trees and max depth
random_forest = RandomForest(n_trees=10, max_depth=3)


In [64]:
# Fit the model on the training data
random_forest.fit(X_train, y_train)

In [65]:
# Predict the labels for the test set
y_pred = random_forest.predict(X_test)

In [66]:
# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.15


In [67]:
# Example usage
random_forest = RandomForest(n_trees=10, max_depth=3)
random_forest.fit(X, y)

In [68]:
# Example prediction (replace with actual features)
new_data_point = np.array([[90, 40, 60, 20, 80, 7.0, 150]]) 
arr = [[90,42,43,20.879744,82.002744,6.502985,202.935536]] # Ensure this has the same number of features
predicted_class_index = random_forest.predict(new_data_point)

In [69]:
# Convert the predicted class index back to the original label
predicted_label = inverse_label_mapping[predicted_class_index[0]]
print("Prediction:", predicted_label )

Prediction: grapes
