# Modeling

### import libraries 

In [1]:
# Import pandas with alias pd
import pandas as pd

# Import train_test_split for data splitting
from sklearn.model_selection import train_test_split, GridSearchCV

# Import Logistic Regression model
from sklearn.linear_model import LogisticRegression

# Import evaluation metrics for model performance
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


### read data

In [2]:
# Read the CSV file into a DataFrame using pandas
data = pd.read_csv(r"C:\Users\moham\OneDrive\Documents\my track\Internships\Cellula\Modeling\data.csv")


In [3]:
data.select_dtypes('object').columns

Index([], dtype='object')

### spliting data to ( X_train , X_test , Y_train , Y_test )

In [4]:
# Extract features (X) by dropping the 'output' column
X = data.drop('output', axis=1)

# Extract target variable (Y) from the 'output' column
Y = data['output']


In [5]:
X_train , X_test , Y_train , Y_test=train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=42)

### The model


In [6]:
# Create a Logistic Regression model
model = LogisticRegression()

In [7]:
# Train Logistic Regression with Grid Search

# Define hyperparameter grid
param_grid = {'C': [0.1, 1, 10, 100], 'penalty': ['l1', 'l2'], 'solver': ['liblinear']}

# Set up Grid Search for Logistic Regression
grid_search_lr = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=5)

# Fit Grid Search to training data
grid_search_lr.fit(X_train, Y_train)

best_model = grid_search_lr.best_estimator_



### Model Evaluation 

In [8]:
# Make predictions on the training set using the trained model
X_train_prediction = best_model.predict(X_train)

# Calculate and print the accuracy on the training data
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print(f'Training Data Accuracy: {training_data_accuracy:.2%}')

Training Data Accuracy: 79.65%


In [9]:
# Make predictions using the best model
Y_test_pred = best_model.predict(X_test)

# Evaluate the model performance on the test set
test_accuracy = accuracy_score(Y_test_pred, Y_test)
print(f'Test Data Accuracy: {test_accuracy:.2%}')

# Print the classification report
print('Classification Report:\n', classification_report(Y_test_pred, Y_test))

# Print the confusion matrix
print('Confusion Matrix:\n', confusion_matrix(Y_test_pred, Y_test))

Test Data Accuracy: 78.84%
Classification Report:
               precision    recall  f1-score   support

         0.0       0.50      0.69      0.58      1208
         1.0       0.91      0.81      0.86      4477

    accuracy                           0.79      5685
   macro avg       0.70      0.75      0.72      5685
weighted avg       0.82      0.79      0.80      5685

Confusion Matrix:
 [[ 835  373]
 [ 830 3647]]
