# Modeling

### import libraries 

In [1]:
# Import the pandas library and alias it as pd
import pandas as pd

# Import the train_test_split module from scikit-learn to split the data into training and testing sets
from sklearn.model_selection import train_test_split,GridSearchCV

# Import the RandomForestClassifier model from scikit-learn
from sklearn.ensemble import RandomForestClassifier

# Import necessary modules for evaluating the model's performance
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import pickle

### read data

In [2]:
# Read the CSV file into a DataFrame using pandas
data = pd.read_csv(r"C:\Users\moham\OneDrive\Documents\my track\Internships\Cellula\Modeling\data.csv")

In [3]:
data.select_dtypes('object').columns

Index([], dtype='object')

### spliting data to ( X_train , X_test , Y_train , Y_test )

In [4]:
# Extract features (X) by dropping the 'output' column
X = data.drop('output', axis=1)

# Extract target variable (Y) from the 'output' column
Y = data['output']

In [5]:
# Split the data into training and testing sets using train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)

### The model


In [6]:
# Create a RandomForestClassifier model
model = RandomForestClassifier()

In [7]:
# Define hyperparameter grid for Grid Search with fewer values
param_grid_rf = {
    'n_estimators': [50, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Set up Grid Search for Random Forest
grid_search_rf = GridSearchCV(estimator=model, param_grid=param_grid_rf, scoring='accuracy', cv=5)

# Fit Grid Search to training data
grid_search_rf.fit(X_train, Y_train)

# Get the best model from Grid Search
best_model = grid_search_rf.best_estimator_

In [8]:
pickle.dump(best_model,open("model.pkl","wb"))

### Model Evaluation 

In [9]:
# Make predictions on the training set using the trained model
X_train_prediction = best_model.predict(X_train)

# Calculate and print the accuracy on the training data
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print(f'Training Data Accuracy: {training_data_accuracy:.2%}')

Training Data Accuracy: 99.37%


In [10]:
# Make predictions on the test set using the trained model
X_test_prediction = best_model.predict(X_test)

# Calculate accuracy on the test data and print it
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print(f'Test Data Accuracy: {test_data_accuracy:.2%}\n\n')

# Print the classification report for the test data
print('Classification Report:\n', classification_report(X_test_prediction, Y_test), '\n')

# Print the confusion matrix for the test data
print('Confusion Matrix:\n', confusion_matrix(X_test_prediction, Y_test), '\n')

Test Data Accuracy: 89.39%


Classification Report:
               precision    recall  f1-score   support

         0.0       0.77      0.85      0.81      1502
         1.0       0.95      0.91      0.93      4183

    accuracy                           0.89      5685
   macro avg       0.86      0.88      0.87      5685
weighted avg       0.90      0.89      0.90      5685
 

Confusion Matrix:
 [[1282  220]
 [ 383 3800]] 

