In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
def laplace_mechanism_numeric(data, sensitivity, epsilon):
    """
    Adds Laplace noise to the numerical data to achieve differential privacy.

    Parameters:
        data (float or array-like): The numerical data to be privatized.
        sensitivity (float): The sensitivity of the function used to calculate the output.
        epsilon (float): The privacy parameter controlling the amount of noise to be added.

    Returns:
        float or array-like: The privatized data.
    """
    scale = sensitivity / epsilon
    noise = np.random.laplace(scale=scale, size=len(data))
    privatized_data = data + noise
    return privatized_data

def laplace_mechanism_categorical(data, epsilon):
    """
    Adds Laplace noise to the categorical data to achieve differential privacy.

    Parameters:
        data (array-like): The categorical data to be privatized.
        epsilon (float): The privacy parameter controlling the amount of noise to be added.

    Returns:
        array-like: The privatized data.
    """
    scale = 1 / epsilon
    noise = np.random.laplace(scale=scale, size=len(data))
    noise = np.round(noise).astype(int)
    privatized_data = [data_index + noise_value for data_index, noise_value in zip(range(len(data)), noise)]
    return privatized_data

In [4]:
# Read data from CSV file
file_path = '/content/drive/MyDrive/Assignment2Dataset.csv'  # Replace with the actual path to your CSV file
df = pd.read_csv(file_path)

In [5]:
# Preprocessing: Handling null values
columns_with_nan = df.columns[df.isnull().any()]
for col in columns_with_nan:
    if df[col].dtype in [np.int64, np.float64]:
        df[col].fillna(df[col].mean(), inplace=True)
    else:
        df[col].fillna(df[col].mode()[0], inplace=True)

df.head()

Unnamed: 0,Name,Country,SSN,DOB,Income,Sex,Marital Status,Education,Loan,House Status,Blood Type,Blood Pressure,Heart Rate,Oxygen Level,Medical Procedure,Smoking,Alcohol Consumption,Allergies,Vaccinations,Tumor Condition
0,John Smith,USA,123-45-6789,7/15/1985,50000,Male,Married,Bachelor's Degree,Yes,Own,O+,120/80,72.0,98%,Appendectomy,No,No,Pollen,Yes,Normal
1,Emily Johnson,Canada,987-65-4321,12/2/1990,60000,Female,Single,Master's Degree,No,Rent,A-,110/70,68.0,96%,Laser Eye Surgery,No,Yes,Shellfish,No,Normal
2,Michael Davis,UK,456-78-9123,3/20/1978,75000,Male,Divorced,High School Diploma,Yes,Own,B+,130/85,75.0,97%,Colonoscopy,No,Yes,Cats,Yes,Abnormal
3,Jessica Martinez,Australia,789-12-3456,9/10/1982,40000,Female,Married,Associate's Degree,No,Own,AB-,115/75,70.0,99%,Mammogram,No,No,Dust,No,Normal
4,David Thompson,USA,234-56-7890,6/25/1995,35000,Male,Single,Some College,No,Rent,O-,125/80,68.0,97%,Dental Cleaning,Yes,Yes,Peanuts,Yes,Normal


In [6]:
# Preprocessing: Removing outliers (considering only numerical columns)
numerical_columns = df.select_dtypes(include=[np.number]).columns
for column in numerical_columns:
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    df = df[~((df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR)))]

# Preprocessing: Normalization (considering only numerical columns)
scaler = MinMaxScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

In [7]:
# Define sensitivity and privacy parameter
sensitivity_numeric = 1.0  # Example sensitivity for numerical columns
epsilon_numeric = 0.5  # Example privacy parameter for numerical columns
epsilon_categorical = 0.5  # Example privacy parameter for categorical columns

# Apply Laplace mechanism to all columns except the target column
target_column = 'Tumor Condition'
for column in df.columns:
    if column != target_column:
        if df[column].dtype in [np.int64, np.float64]:
            df[column] = laplace_mechanism_numeric(df[column], sensitivity_numeric, epsilon_numeric)
        else:
            df[column] = laplace_mechanism_categorical(df[column], epsilon_categorical)

# Display the encrypted data
print("Encrypted Data:")
print(df)

Encrypted Data:
    Name  Country  SSN  DOB     Income  Sex  Marital Status  Education  Loan  \
0     -1       -1   -1   -2   0.615210    0               1          3     0   
1      0        0    5    0   1.148241    0               1         -2     3   
2      0        9    0    4  -4.299632    2               2          5     5   
3      4       -4    6    3  -3.203378    3               5          9     4   
4      3        4    3    4   0.922939    4              10          6     2   
5      4        4   16    3   1.084413    6               7         10     7   
6      6        6    5    6   0.583123    5               6          3     5   
7     -2        5   13    8   1.736955    7               6          6     1   
8      7        8   10    8   0.852315   10               8         10     8   
9      9       12   10   12   1.181333    9              15          9     9   
10     7       12   10    9   2.896383    5              10         12    19   
11    14       11   10  

In [11]:

# Split the data into features (X) and target variable (y)
X = df.drop(columns=[target_column])  # Exclude the target column
y = df[target_column]

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Gradient Boosting Classifier
gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)

# Evaluate the Gradient Boosting model
gb_y_pred = gb_model.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_y_pred)
print("Gradient Boosting Accuracy:", gb_accuracy)

Gradient Boosting Accuracy: 0.9166666666666666


In [9]:
# Train a Support Vector Machine (SVM) classifier
svm_model = SVC()
svm_model.fit(X_train, y_train)

# Evaluate the SVM model
svm_y_pred = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_y_pred)
print("SVM Accuracy:", svm_accuracy)



SVM Accuracy: 0.9166666666666666


In [12]:
# Save the encrypted dataset to a CSV file
encrypted_file_path = '/content/drive/MyDrive/Encrypted_Dataset.csv'  # Define the file path where you want to save the encrypted dataset
df.to_csv(encrypted_file_path, index=False)

# Confirm that the file has been saved
print("Encrypted dataset saved successfully at:", encrypted_file_path)


Encrypted dataset saved successfully at: /content/drive/MyDrive/Encrypted_Dataset.csv
