# Importing Libraries


In [42]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Defining Functions


In [46]:
def load_data(file_path, sheet_name):
    """Load data from an Excel file and replace '?' with NaN."""
    data = pd.read_excel(file_path, sheet_name=sheet_name, na_values='?')
    return data

def check_missing_values(data):
    """Check for missing values in each attribute and print the number of missing values."""
    missing_values = data.isna().sum()
    return missing_values

def study_data_types(data):
    """Study data types and associated values for each attribute."""
    data_types = data.dtypes
    classified_types = {}
    for col, dtype in data_types.items():
        if pd.api.types.is_numeric_dtype(dtype):
            classified_types[col] = 'Numeric'
        else:
            classified_types[col] = 'Nominal'
    return classified_types

def identify_encoding_schemes(data):
    """Identify encoding schemes for categorical attributes."""
    categorical_cols = data.select_dtypes(include=['object']).columns
    nominal_cols = []
    ordinal_cols = []

    for col in categorical_cols:
        unique_vals = data[col].nunique()
        if unique_vals <= 5:  # Assuming ordinal columns have 10 or fewer unique values
            ordinal_cols.append(col)
        else:
            nominal_cols.append(col)
    return nominal_cols, ordinal_cols

def study_numeric_data_range(data):
    """Study the data range for numeric variables."""
    numeric_data = data.select_dtypes(include=[np.number])
    data_range = numeric_data.agg(['min', 'max'])
    return data_range

def detect_outliers(data):
    """Detect outliers in numeric data."""
    numeric_data = data.select_dtypes(include=[np.number])
    outliers = {}
    for col in numeric_data.columns:
        Q1 = numeric_data[col].quantile(0.25)
        Q3 = numeric_data[col].quantile(0.75)
        IQR = Q3 - Q1
        outliers[col] = ((numeric_data[col] < (Q1 - 1.5 * IQR)) | (numeric_data[col] > (Q3 + 1.5 * IQR))).sum()
    return outliers

def calculate_mean_variance(data):
    """Calculate mean and variance (or standard deviation) for numeric variables."""
    numeric_data = data.select_dtypes(include=[np.number])
    mean_values = numeric_data.mean()
    variance_values = numeric_data.var()
    return mean_values, variance_values

# Loading Database


In [44]:
file_path = '/content/Lab Session Data (1).xlsx'
sheet_name = 'thyroid0387_UCI'

# Load data
data = load_data(file_path, sheet_name)

# Processing Data and Printing Results



In [47]:
data_types = study_data_types(data)
print("Data types and associated values:")
for col, dtype in data_types.items():
    if dtype == 'Numeric':
        print(f"{col}: Categorical")
    else:
        print(f"{col}: Nominal")

nominal_cols, ordinal_cols = identify_encoding_schemes(data)
print("\nEncoding Schemes:")
print(f"Nominal columns: {nominal_cols}")
print(f"Ordinal columns: {ordinal_cols}")

# Study numeric data range
data_range = study_numeric_data_range(data)
print("\nData range for numeric variables:")
print(data_range)

# Check for missing values
missing_values = check_missing_values(data)
print("\nMissing values in each attribute:")
print(missing_values)

# Detect outliers
outliers = detect_outliers(data)
print("\nNumber of outliers detected in each numeric attribute:")
print(outliers)

# Calculate mean and variance (or standard deviation) for numeric variables
mean_values, variance_values = calculate_mean_variance(data)
print("\nMean values of numeric attributes:")
print(mean_values)
print("\nVariance values of numeric attributes:")
print(variance_values)

Data types and associated values:
Record ID: Categorical
age: Categorical
sex: Nominal
on thyroxine: Nominal
query on thyroxine: Nominal
on antithyroid medication: Nominal
sick: Nominal
pregnant: Nominal
thyroid surgery: Nominal
I131 treatment: Nominal
query hypothyroid: Nominal
query hyperthyroid: Nominal
lithium: Nominal
goitre: Nominal
tumor: Nominal
hypopituitary: Nominal
psych: Nominal
TSH measured: Nominal
TSH: Categorical
T3 measured: Nominal
T3: Categorical
TT4 measured: Nominal
TT4: Categorical
T4U measured: Nominal
T4U: Categorical
FTI measured: Nominal
FTI: Categorical
TBG measured: Nominal
TBG: Categorical
referral source: Nominal
Condition: Nominal

Encoding Schemes:
Nominal columns: ['referral source', 'Condition']
Ordinal columns: ['sex', 'on thyroxine', 'query on thyroxine', 'on antithyroid medication', 'sick', 'pregnant', 'thyroid surgery', 'I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH measured