<a href="https://colab.research.google.com/github/mohanasamanya/MachineLearning/blob/main/Untitled10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# --- Stage 1: missing-value filter + baseline logistic regression ------------
# Load the raw table and report, per column, the share of missing cells
# expressed in percent (0-100).
diabetes_dataset = pd.read_csv('/content/diabetes.csv')
missing_values_percentage = diabetes_dataset.isnull().mean() * 100
print(f'Missing Values Percentage (Diabetes):\n{missing_values_percentage}')

# The series above is on a 0-100 scale, so the cut-off has to be as well.
# The earlier cut-off of 0.3 was a fraction compared against percentages and
# would have discarded every column with as little as 0.3 % missing values.
MAX_MISSING_PERCENT = 30
cleaned_diabetes_data = diabetes_dataset.loc[:, missing_values_percentage < MAX_MISSING_PERCENT]
# NOTE(review): columns that pass the filter may still contain NaNs; nothing in
# this cell imputes or drops them before the model is fitted.

# Separate predictors from the 'Outcome' target column.
X_features = cleaned_diabetes_data.drop(columns=['Outcome'])
y_target = cleaned_diabetes_data['Outcome']

# Fixed 70/30 split; the same test_size/random_state pair is reused by the
# later stages of this cell so their accuracies are measured on the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_target, test_size=0.3, random_state=42
)

# Baseline: logistic regression on every column that survived the filter.
diabetes_model = LogisticRegression(max_iter=200)
diabetes_model.fit(X_train, y_train)
y_pred = diabetes_model.predict(X_test)
model_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy after Missing Value Filter (Diabetes): {model_accuracy:.4f}')

# --- Stage 2: high-correlation filter ----------------------------------------
# Full pairwise correlation table (kept under its original name, target included)
# and the masked view of entries above 0.8, for inspection.
correlation_matrix = cleaned_diabetes_data.corr()
highly_correlated_features = correlation_matrix[correlation_matrix > 0.8]

import numpy as np

CORRELATION_THRESHOLD = 0.8

# Redundancy is judged between predictors only. Leaving the target in the
# matrix would either drop a predictor *because* it tracks 'Outcome', or put
# 'Outcome' itself into the removal list and make X_features.drop() raise.
# Absolute values are used so strongly negative pairs count as redundant too.
feature_correlation = correlation_matrix.drop(
    index='Outcome', columns='Outcome', errors='ignore'
).abs()

# Walk the upper triangle once: each unordered pair is visited exactly one
# time and the diagonal (self-correlation) is never considered.
feature_names = list(feature_correlation.columns)
correlated_pairs = [
    (first, second)
    for i, first in enumerate(feature_names)
    for second in feature_names[i + 1:]
    if feature_correlation.loc[first, second] > CORRELATION_THRESHOLD
]

# Only the second (later) feature of each pair is removed, for simplicity.
# dict.fromkeys de-duplicates while preserving the order of discovery.
features_to_remove = list(dict.fromkeys(second for _, second in correlated_pairs))
X_features_filtered = X_features.drop(columns=features_to_remove)

# Same split parameters as the baseline so the accuracies are comparable.
X_train_filtered, X_test_filtered, y_train_filtered, y_test_filtered = train_test_split(
    X_features_filtered, y_target, test_size=0.3, random_state=42
)
diabetes_model_filtered = LogisticRegression(max_iter=200)
diabetes_model_filtered.fit(X_train_filtered, y_train_filtered)
y_pred_filtered = diabetes_model_filtered.predict(X_test_filtered)
model_accuracy_filtered = accuracy_score(y_test_filtered, y_pred_filtered)
print(f'Accuracy after Correlation Filter (Diabetes): {model_accuracy_filtered:.4f}')



from sklearn.feature_selection import VarianceThreshold

# --- Stage 3: low-variance filter --------------------------------------------
# NOTE(review): this stage starts again from X_features (not from the
# correlation-filtered frame) and overwrites the *_filtered names produced by
# the previous stage - confirm that the stages are meant to be independent.

# Drop every predictor whose variance does not exceed 0.1.
selector = VarianceThreshold(threshold=0.1)
X_features_filtered = selector.fit_transform(X_features)

# Recover the labels of the surviving columns from their positions.
selected_feature_indices = selector.get_support(indices=True)
selected_features = X_features.columns[selected_feature_indices]

# fit_transform returned a bare array; re-wrap it with the original row index
# and the surviving column labels.
X_features_filtered = pd.DataFrame(
    X_features_filtered,
    index=X_features.index,
    columns=selected_features,
)

# Same split parameters as the earlier stages.
(
    X_train_filtered,
    X_test_filtered,
    y_train_filtered,
    y_test_filtered,
) = train_test_split(X_features_filtered, y_target, test_size=0.3, random_state=42)

# fit() returns the estimator itself, so construction and fitting can be chained.
diabetes_model_filtered = LogisticRegression(max_iter=200).fit(
    X_train_filtered, y_train_filtered
)

y_pred_filtered = diabetes_model_filtered.predict(X_test_filtered)
model_accuracy_filtered = accuracy_score(y_test_filtered, y_pred_filtered)
print(f'Accuracy after Low Variance Filter (Diabetes): {model_accuracy_filtered:.4f}')

Missing Values Percentage (Diabetes):
Pregnancies                 0.0
Glucose                     0.0
BloodPressure               0.0
SkinThickness               0.0
Insulin                     0.0
BMI                         0.0
DiabetesPedigreeFunction    0.0
Age                         0.0
Outcome                     0.0
dtype: float64
Accuracy after Missing Value Filter (Diabetes): 0.7359
Accuracy after Correlation Filter (Diabetes): 0.7359
Accuracy after Low Variance Filter (Diabetes): 0.7359


In [2]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Forward sequential feature selection wrapped around a logistic regression,
# scored by 5-fold cross-validated accuracy on the training split.
model = LogisticRegression(max_iter=200)

# NOTE(review): no `tol` is passed; per the scikit-learn documentation 'auto'
# then keeps half of the input features, so the "Optimal number" printed below
# is a fixed fraction rather than a tuned value - confirm this is intended.
sfs_options = {
    'n_features_to_select': 'auto',
    'direction': 'forward',
    'scoring': 'accuracy',
    'cv': 5,
}
selector = SequentialFeatureSelector(model, **sfs_options)
selector.fit(X_train, y_train)

# Boolean support mask -> labels of the chosen columns.
chosen_mask = selector.get_support()
selected_features = X_train.columns[chosen_mask]

# Project both splits onto the chosen columns, then refit the same estimator
# on the reduced training data.
X_train_selected, X_test_selected = (
    selector.transform(X_train),
    selector.transform(X_test),
)
model.fit(X_train_selected, y_train)

# Hold-out accuracy on the reduced feature set.
y_pred = model.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)

print(f"Optimal number of features: {len(selected_features)}")
print(f"Selected features: {selected_features}")
print(f"Accuracy with selected features: {accuracy:.4f}")

Optimal number of features: 4
Selected features: Index(['Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age'], dtype='object')
Accuracy with selected features: 0.7359


In [3]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Backward sequential feature selection wrapped around a decision tree,
# scored by 5-fold cross-validated accuracy on the training split.
#
# The tree is seeded: an unseeded DecisionTreeClassifier can pick different
# splits from run to run, which made both the selected feature set and the
# reported accuracy non-reproducible. 42 matches the seed used for the
# train/test split and the random forest elsewhere in this notebook.
model = DecisionTreeClassifier(random_state=42)

# NOTE(review): no `tol` is passed; per the scikit-learn documentation 'auto'
# then keeps half of the input features, so the "Optimal number" printed below
# is a fixed fraction rather than a tuned value - confirm this is intended.
selector = SequentialFeatureSelector(
    model,
    n_features_to_select='auto',
    direction='backward',
    scoring='accuracy',
    cv=5
)

selector.fit(X_train, y_train)

# Labels of the columns that survived elimination.
selected_features = X_train.columns[selector.get_support()]

# Project both splits onto the surviving columns and refit the tree on them.
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)
model.fit(X_train_selected, y_train)

# Hold-out accuracy on the reduced feature set.
y_pred = model.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)

print(f"Optimal number of features: {len(selected_features)}")
print(f"Selected features: {selected_features}")
print(f"Accuracy with selected features: {accuracy:.4f}")

Optimal number of features: 4
Selected features: Index(['Pregnancies', 'Glucose', 'BMI', 'Age'], dtype='object')
Accuracy with selected features: 0.6710


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Rank the predictors with a seeded random forest, keep the five highest-ranked
# ones, and evaluate a logistic regression trained on just those columns.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
feature_importances = rf_model.feature_importances_

# One row per predictor, ordered from most to least important.
feature_importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# The first five rows of the sorted table are the features to keep.
top_5_features = feature_importance_df['Feature'].head(5).tolist()
X_train_reduced = X_train.loc[:, top_5_features]
X_test_reduced = X_test.loc[:, top_5_features]

# LogisticRegression comes from the import in the first cell of the notebook.
reduced_model = LogisticRegression(max_iter=200).fit(X_train_reduced, y_train)

y_pred_reduced = reduced_model.predict(X_test_reduced)
accuracy_reduced = accuracy_score(y_test, y_pred_reduced)

print(f"Top 5 features: {top_5_features}")
print(f"Accuracy with reduced features: {accuracy_reduced:.4f}")

Top 5 features: ['Glucose', 'BMI', 'Age', 'DiabetesPedigreeFunction', 'BloodPressure']
Accuracy with reduced features: 0.7273
