In [1]:
import pandas as pd

In [2]:
# Load the dataset. 'None' is a legitimate category label in several columns
# (Alcohol Consumption, Medical Conditions, Medications), so pandas' default
# NA markers are switched off to keep it as a string instead of NaN.
# Empty cells are still parsed as missing via na_values; without that, blanks
# would load as '' and every column would look fully populated in info().
osteoporosis_data = pd.read_csv('osteoporosis.csv', keep_default_na=False, na_values=[''])

In [3]:
# Summarise the dataset: column names, dtypes and non-null counts.
# info() writes the report itself and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" line to the output.
osteoporosis_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1958 entries, 0 to 1957
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Id                   1958 non-null   int64 
 1   Age                  1958 non-null   int64 
 2   Gender               1958 non-null   object
 3   Hormonal Changes     1958 non-null   object
 4   Family History       1958 non-null   object
 5   Race/Ethnicity       1958 non-null   object
 6   Body Weight          1958 non-null   object
 7   Calcium Intake       1958 non-null   object
 8   Vitamin D Intake     1958 non-null   object
 9   Physical Activity    1958 non-null   object
 10  Smoking              1958 non-null   object
 11  Alcohol Consumption  1958 non-null   object
 12  Medical Conditions   1958 non-null   object
 13  Medications          1958 non-null   object
 14  Prior Fractures      1958 non-null   object
 15  Osteoporosis         1958 non-null   int64 
dtypes: int

In [4]:
# Show the dataset: as the cell's last expression the frame is rendered by the
# notebook, abbreviated to its first and last rows with '...' in between.
osteoporosis_data

Unnamed: 0,Id,Age,Gender,Hormonal Changes,Family History,Race/Ethnicity,Body Weight,Calcium Intake,Vitamin D Intake,Physical Activity,Smoking,Alcohol Consumption,Medical Conditions,Medications,Prior Fractures,Osteoporosis
0,1734616,69,Female,Normal,Yes,Asian,Underweight,Low,Sufficient,Sedentary,Yes,Moderate,Rheumatoid Arthritis,Corticosteroids,Yes,1
1,1419098,32,Female,Normal,Yes,Asian,Underweight,Low,Sufficient,Sedentary,No,,,,Yes,1
2,1797916,89,Female,Postmenopausal,No,Caucasian,Normal,Adequate,Sufficient,Active,No,Moderate,Hyperthyroidism,Corticosteroids,No,1
3,1805337,78,Female,Normal,No,Caucasian,Underweight,Adequate,Insufficient,Sedentary,Yes,,Rheumatoid Arthritis,Corticosteroids,No,1
4,1351334,38,Male,Postmenopausal,Yes,African American,Normal,Low,Sufficient,Active,Yes,,Rheumatoid Arthritis,,Yes,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1953,1991635,19,Female,Normal,Yes,African American,Normal,Adequate,Sufficient,Sedentary,Yes,Moderate,Rheumatoid Arthritis,,Yes,0
1954,1528601,23,Female,Postmenopausal,Yes,Caucasian,Underweight,Low,Insufficient,Active,No,,,Corticosteroids,No,0
1955,1990957,34,Female,Postmenopausal,No,African American,Underweight,Low,Sufficient,Sedentary,No,,Hyperthyroidism,,No,0
1956,1779848,25,Male,Postmenopausal,No,African American,Normal,Low,Insufficient,Sedentary,Yes,,Rheumatoid Arthritis,Corticosteroids,Yes,0


In [5]:
# Flag 'Age' outliers with the 1.5 * IQR rule: any row whose age lies strictly
# below Q1 - 1.5*IQR or strictly above Q3 + 1.5*IQR is reported.
age = osteoporosis_data['Age']
Q1, Q3 = (age.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Row-wise boolean mask: True where the age falls outside the fences
is_outlier = age.lt(lower_bound) | age.gt(upper_bound)
outliers = osteoporosis_data[is_outlier]

print("Outliers in 'Age':", outliers, sep="\n")

Outliers in 'Age':
Empty DataFrame
Columns: [Id, Age, Gender, Hormonal Changes, Family History, Race/Ethnicity, Body Weight, Calcium Intake, Vitamin D Intake, Physical Activity, Smoking, Alcohol Consumption, Medical Conditions, Medications, Prior Fractures, Osteoporosis]
Index: []


In [6]:
# List the distinct labels of every object-typed (categorical) column so that
# typos or inconsistent spellings stand out.
is_object_column = osteoporosis_data.dtypes == 'object'
categorical_columns = osteoporosis_data.columns[is_object_column]

for name in categorical_columns:
    # One print per column: header line, the labels, then two blank lines
    print(f"Unique values in {name}:", osteoporosis_data[name].unique(), "\n", sep="\n")

Unique values in Gender:
['Female' 'Male']


Unique values in Hormonal Changes:
['Normal' 'Postmenopausal']


Unique values in Family History:
['Yes' 'No']


Unique values in Race/Ethnicity:
['Asian' 'Caucasian' 'African American']


Unique values in Body Weight:
['Underweight' 'Normal']


Unique values in Calcium Intake:
['Low' 'Adequate']


Unique values in Vitamin D Intake:
['Sufficient' 'Insufficient']


Unique values in Physical Activity:
['Sedentary' 'Active']


Unique values in Smoking:
['Yes' 'No']


Unique values in Alcohol Consumption:
['Moderate' 'None']


Unique values in Medical Conditions:
['Rheumatoid Arthritis' 'None' 'Hyperthyroidism']


Unique values in Medications:
['Corticosteroids' 'None']


Unique values in Prior Fractures:
['Yes' 'No']




In [7]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import pandas as pd
from IPython.display import display

# Integer codes for the two-level categorical columns.
# 'None' is a real category label here (the CSV is loaded with the default NA
# markers disabled), not a missing value.
binary_mappings = {
    'Hormonal Changes': {'Postmenopausal': 1, 'Normal': 0},
    'Family History': {'Yes': 1, 'No': 0},
    'Smoking': {'Yes': 1, 'No': 0},
    'Alcohol Consumption': {'Moderate': 1, 'None': 0},
    'Medications': {'Corticosteroids': 1, 'None': 0},
    'Prior Fractures': {'Yes': 1, 'No': 0},
    'Body Weight': {'Underweight': 0, 'Normal': 1},
    'Vitamin D Intake': {'Sufficient': 1, 'Insufficient': 0},
    'Calcium Intake': {'Adequate': 1, 'Low': 0},
    'Physical Activity': {'Active': 1, 'Sedentary': 0},
}

# Columns that are one-hot encoded instead of mapped to a single 0/1 code
one_hot_columns = ['Gender', 'Race/Ethnicity', 'Medical Conditions']

# Series.map turns any label missing from its mapping into NaN without
# complaint, so check every column up front and fail loudly. This also gives a
# clear error if the cell is re-run on the already-encoded frame.
for col, mapping in binary_mappings.items():
    unmapped = set(osteoporosis_data[col].unique()) - set(mapping)
    if unmapped:
        raise ValueError(
            f"Unexpected values in '{col}': {sorted(unmapped, key=repr)} "
            "(has this cell already been run on the data?)"
        )

# Work on a copy and rebind 'osteoporosis_data' only once every step has
# succeeded, so a failure part-way through cannot leave it half-encoded.
encoded_data = osteoporosis_data.copy()

# Apply the binary mappings
for col, mapping in binary_mappings.items():
    encoded_data[col] = encoded_data[col].map(mapping)

# One-hot encode the 'Gender', 'Race/Ethnicity' and 'Medical Conditions' columns
one_hot_encoder = OneHotEncoder()
one_hot_encoded = one_hot_encoder.fit_transform(encoded_data[one_hot_columns]).toarray()

# Convert the one-hot matrix back to a DataFrame with descriptive column names.
# The source index is reused so the column-wise concat below aligns row for row
# even if the frame does not carry a default RangeIndex.
one_hot_encoded_df = pd.DataFrame(
    one_hot_encoded,
    columns=one_hot_encoder.get_feature_names_out(one_hot_columns),
    index=encoded_data.index,
)

# Normalize the 'Age' column to [0, 1] using Min-Max scaling
min_max_scaler = MinMaxScaler()
encoded_data['Age'] = min_max_scaler.fit_transform(encoded_data[['Age']])

# Replace the original multi-level categorical columns with their one-hot columns
osteoporosis_data = pd.concat(
    [encoded_data.drop(columns=one_hot_columns), one_hot_encoded_df],
    axis=1,
)

# Every step above should yield fully populated columns
assert not osteoporosis_data.isna().any().any(), "encoding introduced missing values"

# Display the transformed DataFrame
display(osteoporosis_data)




Unnamed: 0,Id,Age,Hormonal Changes,Family History,Body Weight,Calcium Intake,Vitamin D Intake,Physical Activity,Smoking,Alcohol Consumption,...,Prior Fractures,Osteoporosis,Gender_Female,Gender_Male,Race/Ethnicity_African American,Race/Ethnicity_Asian,Race/Ethnicity_Caucasian,Medical Conditions_Hyperthyroidism,Medical Conditions_None,Medical Conditions_Rheumatoid Arthritis
0,1734616,0.708333,0,1,0,0,1,0,1,1,...,1,1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1419098,0.194444,0,1,0,0,1,0,0,0,...,1,1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,1797916,0.986111,1,0,1,1,1,1,0,1,...,0,1,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,1805337,0.833333,0,0,0,1,0,0,1,0,...,0,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,1351334,0.277778,1,1,1,0,1,1,1,0,...,1,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1953,1991635,0.013889,0,1,1,1,1,0,1,1,...,1,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1954,1528601,0.069444,1,1,0,0,0,1,0,0,...,0,0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1955,1990957,0.222222,1,0,0,0,1,0,0,0,...,0,0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1956,1779848,0.097222,1,0,1,0,0,0,1,0,...,1,0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0


In [8]:
# Optional export of the DataFrame to a new CSV file.
# Left disabled so that running the notebook does not write to disk; uncomment to persist.
# osteoporosis_data.to_csv('transformed_osteoporosis.csv', index=False)

# Display basic information about the dataset: column names, dtypes and
# non-null counts. info() writes the report itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
osteoporosis_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1958 entries, 0 to 1957
Data columns (total 21 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Id                                       1958 non-null   int64  
 1   Age                                      1958 non-null   float64
 2   Hormonal Changes                         1958 non-null   int64  
 3   Family History                           1958 non-null   int64  
 4   Body Weight                              1958 non-null   int64  
 5   Calcium Intake                           1958 non-null   int64  
 6   Vitamin D Intake                         1958 non-null   int64  
 7   Physical Activity                        1958 non-null   int64  
 8   Smoking                                  1958 non-null   int64  
 9   Alcohol Consumption                      1958 non-null   int64  
 10  Medications                              1958 no