In [1]:
import pandas as pd
import numpy as np

# Set the random seed for reproducibility
np.random.seed(42)

# Define the number of samples
num_samples = 1000

# Generate random data for features
age = np.random.randint(1, 90, num_samples)  # Age between 1 and 90
fever = np.random.choice([0, 1], num_samples)  # Fever presence (0 = no, 1 = yes)
chills = np.random.choice([0, 1], num_samples)  # Chills presence (0 = no, 1 = yes)
fatigue = np.random.choice([0, 1], num_samples)  # Fatigue presence (0 = no, 1 = yes)
travel_history = np.random.choice([0, 1], num_samples)  # Travel history to endemic area (0 = no, 1 = yes)
mosquito_presence = np.random.choice([0, 1], num_samples)  # Mosquito presence in the area (0 = no, 1 = yes)
temperature = np.random.uniform(20, 40, num_samples)  # Temperature between 20°C and 40°C

# Generate the outcome with a 50:50 probability (0 or 1 for infection)
outcome = np.random.choice([0, 1], num_samples)

# Create the dataframe
data = pd.DataFrame({
    'age': age,
    'fever': fever,
    'chills': chills,
    'fatigue': fatigue,
    'travel_history': travel_history,
    'mosquito_presence': mosquito_presence,
    'temperature': temperature,
    'outcome': outcome
})

# Display the first few rows of the dataset
data.head()


Unnamed: 0,age,fever,chills,fatigue,travel_history,mosquito_presence,temperature,outcome
0,52,1,0,1,1,1,28.294717,1
1,15,0,0,1,0,0,35.077301,1
2,72,0,0,0,1,0,37.429368,1
3,61,1,0,0,1,0,33.57957,0
4,21,0,1,0,1,1,28.237295,1


In [2]:
# Save the dataset to a CSV file
file_path = './malaria_dataset.csv'
data.to_csv(file_path, index=False)

file_path


'./malaria_dataset.csv'

In [3]:
data.head(8)

Unnamed: 0,age,fever,chills,fatigue,travel_history,mosquito_presence,temperature,outcome
0,52,1,0,1,1,1,28.294717,1
1,15,0,0,1,0,0,35.077301,1
2,72,0,0,0,1,0,37.429368,1
3,61,1,0,0,1,0,33.57957,0
4,21,0,1,0,1,1,28.237295,1
5,83,0,0,1,0,1,35.071893,0
6,87,0,0,0,0,0,24.073343,0
7,75,0,1,1,1,1,26.551043,1


In [5]:
X = data.drop("outcome", axis=1)

In [6]:
X.head()

Unnamed: 0,age,fever,chills,fatigue,travel_history,mosquito_presence,temperature
0,52,1,0,1,1,1,28.294717
1,15,0,0,1,0,0,35.077301
2,72,0,0,0,1,0,37.429368
3,61,1,0,0,1,0,33.57957
4,21,0,1,0,1,1,28.237295


In [7]:
y = data["outcome"]

In [8]:
y.head(6)

0    1
1    1
2    1
3    0
4    1
5    0
Name: outcome, dtype: int32

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [11]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 7), (200, 7), (800,), (200,))

In [12]:
#since we are working on a classificatiopn problem, we will work with a random forest classifier
from sklearn.ensemble import RandomForestClassifier

In [13]:
#enstantiate the model
clf = RandomForestClassifier()

In [14]:
#view the current parameters
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [15]:
clf.fit (X=X_train, y=y_train)

In [16]:
# in order to make prediction on the label data, it has to be on the same shape.
X_train.head()

Unnamed: 0,age,fever,chills,fatigue,travel_history,mosquito_presence,temperature
382,51,1,1,1,1,1,25.052921
36,62,1,1,0,1,0,37.483414
673,46,0,1,0,0,0,34.25692
565,76,1,1,0,1,0,20.63557
164,28,1,0,1,0,0,38.671422


In [17]:
#use the model to make predicition on the test data set
y_preds = clf.predict(X=X_test)

In [18]:
#evaluate the model on the train dataset
train_acc = clf.score (X=X_train, y=y_train)
print(f"The model's accuracy on the train dataset is : {train_acc*100}%")

The model's accuracy on the train dataset is : 100.0%


In [19]:
#evaluate the model on the test dataset
test_acc = clf.score (X=X_test, y=y_test)
print(f"The model's accuracy on the train dataset is : {test_acc*100:.2f}%")

The model's accuracy on the train dataset is : 46.00%


In [20]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [21]:
#create a classifictaion report
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.47      0.50      0.49       102
           1       0.45      0.42      0.43        98

    accuracy                           0.46       200
   macro avg       0.46      0.46      0.46       200
weighted avg       0.46      0.46      0.46       200



In [22]:
#create confusin matrix
conf_mat = confusion_matrix(y_preds, y_test )
conf_mat

array([[51, 57],
       [51, 41]], dtype=int64)

In [23]:
# create accuracy score
accuracy_score (y_preds, y_test)

0.46

In [24]:
#note that RandomForestClassifier has 100 estimators by default, try another number of estimators
#try different numbers of estimators (no cross validation)
np.random.seed(42)

for i in range(100,200,10):
  print(f"trying model with {i} estimators...")
  model = RandomForestClassifier (n_estimators = i). fit(X_train, y_train)
  print (f"model accuracy on test set: {model.score(X_test, y_test)*100:.2f}%")
  print ("")

trying model with 100 estimators...
model accuracy on test set: 45.50%

trying model with 110 estimators...
model accuracy on test set: 46.00%

trying model with 120 estimators...
model accuracy on test set: 46.00%

trying model with 130 estimators...
model accuracy on test set: 45.50%

trying model with 140 estimators...
model accuracy on test set: 46.00%

trying model with 150 estimators...
model accuracy on test set: 48.50%

trying model with 160 estimators...
model accuracy on test set: 47.00%

trying model with 170 estimators...
model accuracy on test set: 46.50%

trying model with 180 estimators...
model accuracy on test set: 43.50%

trying model with 190 estimators...
model accuracy on test set: 44.00%



In [25]:
from sklearn.model_selection import cross_val_score

In [26]:
# with cross validation
np.random.seed(42)

for i in range(100,200,10):
  print(f"trying model with {i} estimators...")
  model = RandomForestClassifier (n_estimators = i). fit(X_train, y_train)
  print (f"model accuracy on test set: {model.score(X_test, y_test)*100:.2f}%")

  #measure the model score on a single train split
  modal_score = model.score(X_test, y_test)
  print (f"model accuracy on single test set split: {model.score(X_test, y_test)*100:.2f}%")


trying model with 100 estimators...
model accuracy on test set: 45.50%
model accuracy on single test set split: 45.50%
trying model with 110 estimators...
model accuracy on test set: 46.00%
model accuracy on single test set split: 46.00%
trying model with 120 estimators...
model accuracy on test set: 46.00%
model accuracy on single test set split: 46.00%
trying model with 130 estimators...
model accuracy on test set: 45.50%
model accuracy on single test set split: 45.50%
trying model with 140 estimators...
model accuracy on test set: 46.00%
model accuracy on single test set split: 46.00%
trying model with 150 estimators...
model accuracy on test set: 48.50%
model accuracy on single test set split: 48.50%
trying model with 160 estimators...
model accuracy on test set: 47.00%
model accuracy on single test set split: 47.00%
trying model with 170 estimators...
model accuracy on test set: 46.50%
model accuracy on single test set split: 46.50%
trying model with 180 estimators...
model accura

In [27]:
#measure the mean cross validation score across 5 different train test and split
cross_val_mean = np.mean(cross_val_score(model, X,y, cv=5))
print(f"5-fold cross-valaidation score:{cross_val_mean*100:.2f}%")
print("")


5-fold cross-valaidation score:45.90%



In [28]:
#another way to do it with GridsearchCv
np.random.seed(42)
from sklearn.model_selection import GridSearchCV

In [29]:
#define the parameters to search over a dicitionary
# (these can any of your traget model's hyperparameter)
param_grid = {'n_estimators': [ i for i in range (100,200,10)]}

In [30]:
# set up the grid search
grid = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5, verbose=1)

In [31]:
#fit the grid search to the data
grid.fit(X,y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [32]:
#find the best parameter
print (f"The best Parameter values are: {grid.best_params_}")
print (f"with a score of : {grid.best_score_*100:.2f}%")

The best Parameter values are: {'n_estimators': 100}
with a score of : 47.60%


In [33]:
# we can extract  the best model with the "best_estimator_" attriubte
#set the model to be the best estimator
clf = grid.best_estimator_
clf

In [34]:
# And now we have got the best cross-validation model,
# we can fitr and score it on our original single train/test spilt of the data
# fit the best model
clf = clf.fit(X_train, y_train)

In [35]:
# find the best model score on our single test spilt
#(note: this may be lower than the cross-validation score, it's only on one splilt on the data)

print(f"Best model score on single split of the data: {clf.score(X_test, y_test)*100:.2f}%")

Best model score on single split of the data: 47.50%


In [36]:
# joblib is used for heavy dataset, while pickle is used for light data
import pickle

In [37]:
# save an existing model to file
pickle.dump(model, open("random_forest_model_1.pkl", "wb"))


In [38]:
#load a save pickle model and evaluate
loaded_pickle_model = pickle.load(open("random_forest_model_1.pkl", "rb"))
print(f"loaded pickle model prediction score: {loaded_pickle_model.score(X_test, y_test)*100:.2f}%")

loaded pickle model prediction score: 44.00%


In [39]:
data.head(2)

Unnamed: 0,age,fever,chills,fatigue,travel_history,mosquito_presence,temperature,outcome
0,52,1,0,1,1,1,28.294717,1
1,15,0,0,1,0,0,35.077301,1


In [40]:
# load a new dataset
new_data = pd.DataFrame({
    'age':[63,37,41,25],
    'fever':[1,1,0,0],
    'chills':[1,1,1,0],
    'fatigue':[1,0,0,0],
    'travel_history':[223,250,204,200],
    'mosquito_presence':[1,0,0,0],
    'temperature':[0,1,0,0],
})

In [41]:
new_data.shape

(4, 7)

In [42]:
# make prediction on the new dataset
predictions = loaded_pickle_model.predict(new_data)
predictions

array([1, 1, 0, 0])