In [51]:
#1)
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import ast # Import ast to safely evaluate strings as Python literals

# Load your dataset (assuming it's in a DataFrame format)
# Replace 'your_dataframe.csv' with the path to your dataset
df = pd.read_csv('/content/final_encoded_disease_csv.csv') # Removed extra space here

# Assuming your dataset is already loaded into a DataFrame called df
# Define features (X) and target (y)
X = df.drop('HadAsthmaIndexed', axis=1)  # Drop the target column from the features
y = df['HadAsthmaIndexed']  # Target column

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Identify columns with non-numeric values
non_numeric_columns = X_train.select_dtypes(exclude=['number']).columns

# Convert non-numeric columns to numeric (replace with appropriate conversion logic)
for col in non_numeric_columns:
    # Example conversion (modify based on your data):
    # If the column contains lists or tuples as strings, you'll need to extract
    # the numeric values and potentially create new features for each element.
    # Here's an improved example assuming the string represents a list of tuples:
    try:
        # Attempt to extract numeric values from the string representation
        X_train[col] = X_train[col].apply(lambda x: [float(t[2][0]) for t in ast.literal_eval(x)] if isinstance(ast.literal_eval(x), list) else x) # Use ast.literal_eval for safe evaluation
        X_test[col] = X_test[col].apply(lambda x: [float(t[2][0]) for t in ast.literal_eval(x)] if isinstance(ast.literal_eval(x), list) else x) # Apply the same conversion to the test set
    except:
        # Handle cases where conversion might fail
        X_train[col] = X_train[col].apply(lambda x: None if not isinstance(x, (int, float)) else x)
        X_test[col] = X_test[col].apply(lambda x: None if not isinstance(x, (int, float)) else x) # Handle exceptions in the test set as well

# Handle any remaining missing values (replace with your preferred imputation strategy)
# Impute missing values using the mean of each column (for numeric columns only)
numeric_columns = X_train.select_dtypes(include=['number']).columns
X_train[numeric_columns] = X_train[numeric_columns].fillna(X_train[numeric_columns].mean())
X_test[numeric_columns] = X_test[numeric_columns].fillna(X_test[numeric_columns].mean())

# Explicitly convert all columns to numeric, coercing errors to NaN
# This step ensures all data is numeric, replacing any remaining non-numeric values with NaN
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')

# Fill any newly created NaNs with 0 (or another appropriate strategy)
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

# Initialize the Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

cv_scores = cross_val_score(rf, X_train, y_train, cv=5)  # 5-fold cross-validation
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Average Cross-Validation Score: {cv_scores.mean():.2f}")

# Train the model
rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Cross-Validation Scores: [0.85498788 0.85471851 0.85528285 0.85558216 0.85498354]
Average Cross-Validation Score: 0.86
Accuracy: 0.86
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.99      0.92     61053
           1       0.58      0.06      0.11     10542

    accuracy                           0.86     71595
   macro avg       0.72      0.53      0.52     71595
weighted avg       0.82      0.86      0.80     71595



In [55]:
#2)
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb

# Load your dataset (assuming it's in a DataFrame format)
# Replace 'your_dataframe.csv' with the path to your dataset
df = pd.read_csv('/content/final_encoded_disease_csv.csv') # Removed extra space here

# Assuming your dataset is already loaded into a DataFrame called df
# Define features (X) and target (y)
X = df.drop('HadAsthmaIndexed', axis=1)  # Drop the target column from the features
y = df['HadAsthmaIndexed']  # Target column

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Explicitly convert object columns to categorical if they represent categorical data
# Modify the column names in the list as needed
for col in ['SmokerStatusVec', 'AgeCategoryVec']:
    if X_train[col].dtype == 'object':
        X_train[col] = X_train[col].astype('category')
    if X_test[col].dtype == 'object':
        X_test[col] = X_test[col].astype('category')

# Initialize the XGBoost classifier
# Enable categorical features handling
xgb_model = xgb.XGBClassifier(use_label_encoder=False,
                              eval_metric='logloss',
                              random_state=42,
                              enable_categorical=True)  # Enable categorical features

# Use xgb_model instead of rf for cross-validation
cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=5)  # 5-fold cross-validation
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Average Cross-Validation Score: {cv_scores.mean():.2f}")

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Cross-Validation Scores: [0.85474844 0.85591572 0.85306794 0.85264891 0.85399581]
Average Cross-Validation Score: 0.85


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.85
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.99      0.92     61053
           1       0.53      0.10      0.17     10542

    accuracy                           0.85     71595
   macro avg       0.70      0.54      0.54     71595
weighted avg       0.81      0.85      0.81     71595

Confusion Matrix:
[[60145   908]
 [ 9508  1034]]


In [56]:
#3)
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import ast # Import ast to safely evaluate strings as Python literals

# Load your dataset (assuming it's in a DataFrame format)
# Replace 'your_dataframe.csv' with the path to your dataset
df = pd.read_csv('/content/final_encoded_disease_csv.csv') # Removed extra space before this line

# Assuming your dataset is already loaded into a DataFrame called df
# Define features (X) and target (y)
X = df.drop('HadAsthmaIndexed', axis=1)  # Drop the target column from the features
y = df['HadAsthmaIndexed']  # Target column

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Identify columns with non-numeric values in the training set
non_numeric_columns = X_train.select_dtypes(exclude=['number']).columns

# Convert non-numeric columns to numeric
for col in non_numeric_columns:
    try:
        # Attempt to extract numeric values from the string representation
        X_train[col] = X_train[col].apply(lambda x: [float(t[2][0]) for t in ast.literal_eval(x)] if isinstance(ast.literal_eval(x), list) else x)
        X_test[col] = X_test[col].apply(lambda x: [float(t[2][0]) for t in ast.literal_eval(x)] if isinstance(ast.literal_eval(x), list) else x) # Apply same transformation to test set
    except:
        # Handle cases where conversion might fail
        X_train[col] = X_train[col].apply(lambda x: None if not isinstance(x, (int, float)) else x)
        X_test[col] = X_test[col].apply(lambda x: None if not isinstance(x, (int, float)) else x) # Handle exceptions in test set

# Handle any remaining missing values (replace with your preferred imputation strategy)
# Impute missing values using the mean of each column (for numeric columns only)
numeric_columns = X_train.select_dtypes(include=['number']).columns
X_train[numeric_columns] = X_train[numeric_columns].fillna(X_train[numeric_columns].mean())
X_test[numeric_columns] = X_test[numeric_columns].fillna(X_test[numeric_columns].mean())

# Explicitly convert all columns to numeric, coercing errors to NaN
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')

# Fill any newly created NaNs with 0 (or another appropriate strategy)
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

# Check if any non-numeric values remain after conversion and imputation
non_numeric_after_conversion = X_train.select_dtypes(exclude=['number']).columns
if non_numeric_after_conversion.size > 0:
    print("Warning: The following columns are still non-numeric after conversion:")
    print(non_numeric_after_conversion)
    # Handle these columns appropriately (e.g., drop them, one-hot encode them)

# Initialize the Logistic Regression model
log_reg = LogisticRegression(random_state=42, max_iter=1000)

cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=5)  # 5-fold cross-validation
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Average Cross-Validation Score: {cv_scores.mean():.2f}")

# Train the model
log_reg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = log_reg.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
print("Classification Report")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Cross-Validation Scores: [0.85498788 0.8551076  0.85438491 0.85429512 0.85507333]
Average Cross-Validation Score: 0.85
Accuracy: 0.85
Classification Report


In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import ast  # Import ast to safely evaluate strings as Python literals

# Load your dataset (assuming it's in a DataFrame format)
df = pd.read_csv('/content/final_encoded_disease_csv.csv')

# Define features (X) and target (y)
X = df.drop('HadAsthmaIndexed', axis=1)  # Drop the target column from the features
y = df['HadAsthmaIndexed']  # Target column

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Identify columns with non-numeric values
non_numeric_columns = X_train.select_dtypes(exclude=['number']).columns

# Convert non-numeric columns to numeric
for col in non_numeric_columns:
    try:
        X_train[col] = X_train[col].apply(lambda x: [float(t[2][0]) for t in ast.literal_eval(x)] if isinstance(ast.literal_eval(x), list) else x)
        X_test[col] = X_test[col].apply(lambda x: [float(t[2][0]) for t in ast.literal_eval(x)] if isinstance(ast.literal_eval(x), list) else x)
    except:
        X_train[col] = X_train[col].apply(lambda x: None if not isinstance(x, (int, float)) else x)
        X_test[col] = X_test[col].apply(lambda x: None if not isinstance(x, (int, float)) else x)

# Handle missing values
numeric_columns = X_train.select_dtypes(include=['number']).columns
X_train[numeric_columns] = X_train[numeric_columns].fillna(X_train[numeric_columns].mean())
X_test[numeric_columns] = X_test[numeric_columns].fillna(X_test[numeric_columns].mean())

# Explicitly convert all columns to numeric
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')

# Fill any newly created NaNs with 0
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

# Initialize the Support Vector Classifier
svc = SVC(kernel='linear', random_state=42)

# Perform cross-validation
cv_scores = cross_val_score(svc, X_train, y_train, cv=5)
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Average Cross-Validation Score: {cv_scores.mean():.2f}")

# Train the model
svc.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svc.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


FileNotFoundError: [Errno 2] No such file or directory: '/content/final_encoded_disease_csv.csv'