In [48]:
import logging
from src.utils.config import load_config
from src.data.load_data import load_raw_data
from src.data.preprocess import preprocess_data
from src.features.build_features import create_features
from src.models.train_model import train_model
from src.models.evaluate_model import evaluate_model
from src.models.predict import predict_fruit
import joblib

# Load the training configuration; the data path, feature list and output
# paths used by the later cells are all read from this mapping.
config = load_config('configs/training_config.yaml')

# Send INFO-and-above records to the log file named in the config.
# force=True (Python 3.8+) removes handlers already attached to the root
# logger before installing the file handler. Without it, basicConfig is a
# silent no-op whenever the root logger is already configured, which is the
# case on every re-run of this cell in a live kernel, so a changed log path,
# level or format would never take effect.
logging.basicConfig(
    level=logging.INFO,
    filename=config['paths']['log_path'],
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

In [49]:
# Mark the start of the pipeline in the log, then read the raw dataset from
# the location given in the configuration.
logging.info("Starting the data pipeline.")
logging.info("Loading raw data.")
raw_data_path = config['data']['raw_data_path']
df = load_raw_data(raw_data_path)

In [50]:
# Preview the first rows of the raw frame returned by load_raw_data.
df.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [51]:
# Log the stage, then pass the raw frame through the project's preprocessing step.
# NOTE(review): the preview in the next cell shows mass/width/height/color_score
# on a centred, roughly unit-scale range, so this step presumably standardizes
# them. If that scaling is fit on the full dataset before the train/test split,
# test-set statistics leak into training — confirm where the split happens
# relative to this call.
logging.info("Preprocessing Data")
df_processed = preprocess_data(df)

In [52]:
# Preview the first rows of the preprocessed frame.
df_processed.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,0.529442,1.59869,-0.291397,-2.793595
1,1,apple,granny_smith,0.309462,1.104854,-0.661922,-2.268684
2,1,apple,granny_smith,0.236136,0.3641,-0.365502,-2.137456
3,2,mandarin,mandarin,-1.413709,-1.117409,-2.218131,0.4871
4,2,mandarin,mandarin,-1.450372,-1.364327,-2.292236,0.355872


In [53]:
# Log the stage, build the feature frame from the preprocessed data, and
# preview its first rows (the cell's last expression is what gets displayed).
# In the captured output the result carries area, density and aspect_ratio
# columns in addition to the preprocessed ones.
logging.info("Feature Engineering")
df_features = create_features(df_processed)
df_features.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score,area,density,aspect_ratio
0,1,apple,granny_smith,0.529442,1.59869,-0.291397,-2.793595,-0.465853,-1.1365,-0.182272
1,1,apple,granny_smith,0.309462,1.104854,-0.661922,-2.268684,-0.731328,-0.423152,-0.599104
2,1,apple,granny_smith,0.236136,0.3641,-0.365502,-2.137456,-0.133079,-1.774404,-1.003851
3,2,mandarin,mandarin,-1.413709,-1.117409,-2.218131,0.4871,2.47856,-0.570375,1.985066
4,2,mandarin,mandarin,-1.450372,-1.364327,-2.292236,0.355872,3.12736,-0.463769,1.680122


In [54]:
# Separate model inputs from the target. Despite the "Data Split" log label,
# this cell only selects columns; no train/test partition is made here.
logging.info("Data Split")
numerical_columns = config['features']['numerical']
target_column = 'fruit_label'
X = df_features[numerical_columns]
y = df_features[target_column]

In [55]:
# Log the stage and train. train_model returns two values: the model and a
# second object that later cells index with 'X_test' and 'y_test'.
# NOTE(review): that second value is bound to `_`, which IPython also uses for
# the most recent cell output and which conventionally marks a discarded value.
# Consider a descriptive name, renaming every later use in the same change.
logging.info("Model Training")
model, _ = train_model(X, y, config)

In [56]:
# Persist the trained model to the path named in the configuration. The dump
# call stays as the last expression, so its return value is still displayed.
logging.info("Saving the Model")
model_save_path = config['paths']['model_save_path']
joblib.dump(model, model_save_path)

['models/fruit_classifier.joblib']

In [57]:
# Score the model on the held-out split produced during training. The split is
# read from `_` (bound by the training cell), and the configured performance
# path is passed to evaluate_model as save_path.
logging.info("Evaluation the Model")
X_test, y_test = _['X_test'], _['y_test']
metrics = evaluate_model(
    model,
    X_test,
    y_test,
    save_path=config['paths']['performance_path'],
)

In [58]:
# Display the metrics object returned by evaluate_model.
metrics

{'classification_report': {'1': {'precision': 1.0,
   'recall': 0.75,
   'f1-score': 0.8571428571428571,
   'support': 4.0},
  '2': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 1.0},
  '3': {'precision': 0.8,
   'recall': 1.0,
   'f1-score': 0.8888888888888888,
   'support': 4.0},
  '4': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 3.0},
  'accuracy': 0.9166666666666666,
  'macro avg': {'precision': 0.95,
   'recall': 0.9375,
   'f1-score': 0.9365079365079365,
   'support': 12.0},
  'weighted avg': {'precision': 0.9333333333333332,
   'recall': 0.9166666666666666,
   'f1-score': 0.9153439153439153,
   'support': 12.0}},
 'confusion_matrix': [[3, 0, 1, 0], [0, 1, 0, 0], [0, 0, 4, 0], [0, 0, 0, 3]]}

In [59]:
# Run the prediction helper on the held-out features (read from `_`, bound by
# the training cell) and display what it returns.
X_test = _['X_test']
result = predict_fruit(model, X_test)
result

['orange',
 'apple',
 'orange',
 'apple',
 'mandarin',
 'lemon',
 'lemon',
 'orange',
 'lemon',
 'orange',
 'orange',
 'apple']