In [None]:
import pandas as pd

In [None]:
# Read heart.csv once and keep only the feature columns plus the target column.
df = (
    pd.read_csv('/content/heart.csv')[
        ['Age', 'RestingBP', 'Cholesterol', 'FastingBS',
         'MaxHR', 'Oldpeak', 'HeartDisease']
    ]
    # Rename and cast on a fresh frame: this avoids assigning into a column
    # selection (SettingWithCopyWarning) and avoids in-place mutation.
    .rename(columns={'HeartDisease': 'label'})
    .astype({'label': 'str'})
)
# Two contiguous, non-overlapping slices. The second one starts at position 450
# (not 451) so that row 450 is not silently dropped from both halves.
df1 = df[:450]
df2 = df[450:900]
# Persist the first slice together with its index; it is read back later as the
# "new" dataset via index_col=0.
df1.to_csv('new_data.csv')
df1.columns

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the second slice for evaluation; the fixed seed keeps the split repeatable.
target = df2['label']
features = df2.drop(columns='label')
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=0,
)
# Alias for the training features of the initial model.
data_old = X_train

# Assessing the need for retraining



In [None]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

def monitor_model_performance(model, X_test, y_test):
  """Return the accuracy of ``model`` on a labelled evaluation set.

  :param model: Fitted estimator exposing ``predict``.
  :param X_test: Feature rows to score.
  :param y_test: True labels for ``X_test``.
  :return: ``accuracy_score(y_test, model.predict(X_test))``.
  """
  return accuracy_score(y_test, model.predict(X_test))
# Minimum acceptable accuracy; anything lower is reported as drift below.
threshold_accuracy = 0.85
# Example: a RandomForest classifier stands in for the pre-trained model
# (this is just a placeholder for loading your actual model). The fixed
# random_state makes the fit, and therefore the reported accuracy and the
# drift verdict, reproducible across re-runs.
pretrained_model = RandomForestClassifier(random_state=0)
pretrained_model.fit(X_train, y_train)

current_accuracy = monitor_model_performance(pretrained_model, X_test, y_test)
print(f"Pre trained Model Accuracy: {current_accuracy}")
if current_accuracy < threshold_accuracy:
  print("Model drift detected. Accuracy has fallen below the threshold.")
else:
  print("Model is performing within acceptable limits.")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
# Load the data set aside for retraining; the first CSV column holds the saved index.
new_dataset = pd.read_csv('new_data.csv', index_col=0).astype({'label': 'str'})
# 'label' is the target variable; every other column is a feature.
target = new_dataset['label']
features = new_dataset.drop(columns='label')
# First carve off 20% of the rows as the test set ...
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
    features, target, test_size=0.2, random_state=0
)
# ... then 25% of the remaining 80% (0.25 x 0.8 = 0.2 of the total) as the validation set.
X_train_new, X_val_new, y_train_new, y_val_new = train_test_split(
    X_train_new, y_train_new, test_size=0.25, random_state=1
)


In [None]:
# Training / validation data used when comparing candidate models
# (these are plain aliases of the new splits, not sub-samples).
X_train_sample, y_train_sample = X_train_new, y_train_new
X_val_sample, y_val_sample = X_val_new, y_val_new
# Initial (reference) training data versus the new training data, for the drift check.
X_initial, y_initial = X_train, y_train
X_new, y_new = X_train_new, y_train_new

In [None]:
import numpy as np
from scipy.stats import ks_2samp
def detect_drift(model, data_old, data_new):
  """Compare a model's predicted probabilities on two datasets with a two-sample KS test.

  For each dataset, column 1 of ``model.predict_proba(...)`` is taken (this
  assumes binary classification), and the two resulting score samples are
  passed to ``scipy.stats.ks_2samp``.

  :param model: Trained classification model exposing ``predict_proba``.
  :param data_old: Older dataset (feature rows accepted by the model).
  :param data_new: Newer dataset (feature rows accepted by the model).
  :return: Tuple ``(KS statistic, p-value)``.
  """
  # Score the older data first, then the newer data, keeping only column 1.
  score_samples = [
      model.predict_proba(batch)[:, 1]
      for batch in (data_old, data_new)
  ]
  # Kolmogorov-Smirnov test on the two score distributions.
  statistic, pvalue = ks_2samp(*score_samples)
  return statistic, pvalue
# Run the drift check: the pre-trained model scores the initial training
# features and the new training features, and the two score samples are compared.
initial_data, new_data = X_initial, X_new
ks_stat, p_value = detect_drift(pretrained_model, initial_data, new_data)
print(f"KS Statistic: {ks_stat}, P-value: {p_value}")

# Retraining the model

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
# Example algorithms. Both estimators are randomised, so a fixed random_state
# is set to make the comparison below reproducible across re-runs.
model_rf = RandomForestClassifier(random_state=0)
model_gb = GradientBoostingClassifier(random_state=0)
# Fit both candidates on the new training data
model_rf.fit(X_train_sample, y_train_sample)
model_gb.fit(X_train_sample, y_train_sample)
# Evaluate both on the same validation data and compare
accuracy_rf = accuracy_score(y_val_sample, model_rf.predict(X_val_sample))
accuracy_gb = accuracy_score(y_val_sample, model_gb.predict(X_val_sample))
print(f"Random Forest Accuracy: {accuracy_rf}, Gradient Boosting Accuracy: {accuracy_gb}")

In [None]:
from sklearn.model_selection import GridSearchCV
# Candidate hyperparameter values to search over
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, 30],
}
# 3-fold cross-validated search scored by accuracy, using all available cores
grid_search = GridSearchCV(
    estimator=model_rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
# Run the search on the new training data
grid_search.fit(X_train_new, y_train_new)
# Report the winning combination
print("Best Parameters:", grid_search.best_params_)

In [None]:
import joblib
# Train the model with the parameters the grid search actually selected,
# instead of hard-coded values that may disagree with grid_search.best_params_.
# The fixed random_state keeps the retrained model reproducible.
optimal_model = RandomForestClassifier(**grid_search.best_params_, random_state=0)
optimal_model.fit(X_train_new, y_train_new)
# Save the retrained model
joblib.dump(optimal_model, "retrained_model.joblib")
# Evaluate the retrained model on the held-out test split of the new data
retrained_predictions = optimal_model.predict(X_test_new)
print(f"Retrained Model Accuracy: {accuracy_score(y_test_new, retrained_predictions)}")

# Active model selection and deployment

In [None]:
from sklearn.metrics import accuracy_score
# Score the retrained model and the baseline (pre-trained) model on the same
# held-out test split of the new data.
retrained_model_accuracy = accuracy_score(y_test_new, optimal_model.predict(X_test_new))
baseline_model_accuracy = accuracy_score(y_test_new, pretrained_model.predict(X_test_new))

# Define performance threshold: a 5% relative improvement over the baseline.
# Accuracy cannot exceed 1.0, so the threshold is capped there; without the cap
# any baseline above ~0.952 would produce a threshold no model could ever meet.
performance_threshold = min(baseline_model_accuracy * 1.05, 1.0)
# Check if the retrained model meets the threshold
if retrained_model_accuracy >= performance_threshold:
  print("Retrained model meets the performance threshold.")
else:
  print("Retrained model does not meet the performance threshold.")

In [None]:
# Five example users (ids 1-5), all with the plain 'user' role.
user_ids = list(range(1, 6))
users = pd.DataFrame({
    'id': user_ids,
    'role': ['user'] * len(user_ids),
})

In [None]:
def assign_model(user_id):
  """Assign a user to the retrained or the baseline model (roughly 50/50).

  The arm is derived from a hash of ``user_id`` rather than from a fresh random
  draw, so the same user always receives the same model on every call and on
  every re-run.

  :param user_id: Identifier of the user; only its ``str()`` form is used.
  :return: ``'retrained_model'`` or ``'baseline_model'``.
  """
  import hashlib  # local import: keeps this cell runnable on its own

  digest = hashlib.sha256(str(user_id).encode('utf-8')).digest()
  # Map the first 8 bytes of the digest onto [0, 1) and split at the midpoint.
  bucket = int.from_bytes(digest[:8], 'big') / 2 ** 64
  return 'retrained_model' if bucket < 0.5 else 'baseline_model'

In [None]:
# Assign a model to every user and keep each result keyed by user id, so the
# assignments are not overwritten (and lost) on the next loop iteration.
model_assignments = {}
for index, row in users.iterrows():
  model_assigned = assign_model(row.id)
  model_assignments[row.id] = model_assigned
model_assignments

In [None]:
import random
import numpy as np

def old_model_predict(input_data):
    """Stand-in for the old model: ignores ``input_data`` and returns a fixed marker string."""
    placeholder_prediction = "old_model_prediction"
    return placeholder_prediction

def new_model_predict(input_data):
    """Stand-in for the new model: ignores ``input_data`` and returns a fixed marker string."""
    placeholder_prediction = "new_model_prediction"
    return placeholder_prediction

def gradual_rollout(input_data, rollout_percentage):
    """
    Route one request to the new or the old model at random.

    A single ``random.random()`` draw is compared against ``rollout_percentage``;
    a draw below it sends the request to the new model, otherwise to the old one.
    :param input_data: The input data for prediction.
    :param rollout_percentage: Share of traffic to direct to the new model,
        expressed as a fraction on the same scale as ``random.random()``
        (e.g. 0.10 for 10%).
    :return: The chosen model's prediction.
    """
    use_new_model = random.random() < rollout_percentage
    predictor = new_model_predict if use_new_model else old_model_predict
    return predictor(input_data)

# Example usage
rollout_percentage = 0.10  # Start with 10% of the traffic to the new model
input_data = np.random.rand(20)  # Example input data

# Seed the RNG that gradual_rollout draws from so the simulated traffic split
# is identical on every re-run of the notebook.
random.seed(0)

# Simulating requests: collect the routed predictions instead of printing
# 100 separate lines.
predictions = []
for _ in range(100):
    prediction = gradual_rollout(input_data, rollout_percentage)
    predictions.append(prediction)

    # Based on monitoring, gradually increase the rollout_percentage
    # This increment can be based on time or performance metrics
    # e.g., rollout_percentage += 0.05

# Summarise how many of the simulated requests each model served.
pd.Series(predictions).value_counts()