In [69]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics


In [52]:
# Read the soil-measurement CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path -- the cell only
# runs where this exact file location exists.
csv_path = r'C:\Users\omdre\OneDrive\Desktop\DATA SCIENCE\workspace\soil_measures.csv'
df = pd.read_csv(csv_path)


In [53]:
# Preview the first five rows to confirm the columns loaded as expected.
df.head(n=5)



Unnamed: 0,N,P,K,ph,crop
0,90,42,43,6.502985,rice
1,85,58,41,7.038096,rice
2,60,55,44,7.840207,rice
3,74,35,40,6.980401,rice
4,78,42,42,7.628473,rice


In [54]:
# Count the missing entries in every column.
df.isnull().sum()

N       0
P       0
K       0
ph      0
crop    0
dtype: int64

In [55]:
# List the distinct crop labels, i.e. the classes of the target column.
df["crop"].unique()

array(['rice', 'maize', 'chickpea', 'kidneybeans', 'pigeonpeas',
       'mothbeans', 'mungbean', 'blackgram', 'lentil', 'pomegranate',
       'banana', 'mango', 'grapes', 'watermelon', 'muskmelon', 'apple',
       'orange', 'papaya', 'coconut', 'cotton', 'jute', 'coffee'],
      dtype=object)

In [56]:
"""
"N": Nitrogen content ratio in the soil
"P": Phosphorous content ratio in the soil
"K": Potassium content ratio in the soil
"pH" value of the soil
"crop": categorical values that contain various crops (target variable). """
 
# Names of the predictor columns; each one is evaluated on its own later.
features = [
    "N",
    "P",
    "K",
    "ph",
]

In [57]:
# Create an empty dictionary that will map each feature name to its score.
feature_performance = dict()


In [58]:
# Separate the label column from the predictor columns.
y = df["crop"]                  # target: the crop label
X = df.drop(columns=[y.name])   # features: every column except the target

In [59]:
# Sanity-check the dimensions of the feature matrix and the target vector.
print(f"{X.shape} {y.shape}")

(2200, 4) (2200,)


In [60]:
# Show the container types of the feature matrix and the target.
print(*map(type, (X, y)))

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>


In [63]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [64]:
# Fit one logistic-regression model per feature and score it with the weighted
# F1 metric, so the features can be compared by predictive power.
for feature in features:
    # `multi_class` is deliberately not passed: it is deprecated since
    # scikit-learn 1.5, and the lbfgs solver already fits a multinomial model
    # for multiclass targets by default, so the fitted model is unchanged.
    log_reg = LogisticRegression(solver="lbfgs", max_iter=5000)
    # Double brackets select the single column as a 2-D DataFrame.
    log_reg.fit(X_train[[feature]], y_train)
    y_pred = log_reg.predict(X_test[[feature]])
    # Weighted F1: per-class F1 averaged by each class's support in y_test.
    f1 = metrics.f1_score(y_test, y_pred, average="weighted")
    # Record the feature -> score pair.
    feature_performance[feature] = f1
    print(f"F1 score for {feature}:{f1}")



F1 score for N:0.10677892163802272
F1 score for P:0.12167221934598027
F1 score for K:0.19730431018300978
F1 score for ph:0.06787631271947597


In [73]:
# Evaluate each feature with metrics.balanced_accuracy_score().
# The scores go into their own dictionary so the F1 scores already stored in
# `feature_performance` are not silently overwritten.
balanced_accuracy_performance = {}
for feature in features:
    # `multi_class` is deliberately not passed: it is deprecated since
    # scikit-learn 1.5, and the lbfgs solver already fits a multinomial model
    # for multiclass targets by default, so the fitted model is unchanged.
    log_reg = LogisticRegression(solver="lbfgs", max_iter=5000)
    # Double brackets select the single column as a 2-D DataFrame.
    log_reg.fit(X_train[[feature]], y_train)
    y_pred = log_reg.predict(X_test[[feature]])
    # Local name differs from the metric function to avoid confusing the two;
    # sample_weight=None and adjusted=False are the defaults, so they are omitted.
    balanced_accuracy = metrics.balanced_accuracy_score(y_test, y_pred)
    balanced_accuracy_performance[feature] = balanced_accuracy
    print(f"Balanced_accuracy_score for {feature}: {balanced_accuracy}")
    

    
    

Balanced_accuracy_score for N: 0.16211462305023777
Balanced_accuracy_score for P: 0.2008150229552229
Balanced_accuracy_score for K: 0.29982703636853836
Balanced_accuracy_score for ph: 0.12377246254905522


In [75]:
# "K" (potassium) produced the best score under both metrics:
# weighted F1 of about 0.20 and balanced accuracy of about 0.30.