In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statistics
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

def load_dataset(file_path, sheet_name=None):
    """Load dataset from Excel file."""
    return pd.read_excel(file_path, sheet_name=sheet_name)

def segregate_data(data):
    # Segregate the data into matrices A and C.
    A = np.matrix(data[["Candies (#)", "Mangoes (Kg)", "Milk Packets (#)"]])
    C = np.matrix(data[["Payment (Rs)"]])
    return A, C

def calculate_dimensions(A, C):
    # Print dimensions of matrices A and C.
    return np.shape(A), np.shape(C)


def calculate_rank(A):
    # Calculate and return the rank of matrix A.
    return np.linalg.matrix_rank(A)

def calculate_pseudo_inverse(A):
    # Calculate the pseudo-inverse of matrix A
    return np.linalg.pinv(A)

def calculate_model_vector(x_pseudo_inv, C):
    # Calculate the model vector X using pseudo-inverse.
    return np.dot(x_pseudo_inv, C)

def RichPoorClassifier(data):
    # Classify customers as 'RICH' or 'POOR'.
    classes = []
    for cost in data["Payment (Rs)"]:
        if cost > 200:
            classes.append("RICH")
        else:
            classes.append("POOR")
    return classes

def preprocess_stock_data(df):
    """Preprocess stock data by setting index and cleaning percentage column."""
    df["Chg%"] = df["Chg%"].astype(str).str.replace("%", "").astype(float)
    return df

def add_class_to_data(data, classes):
    # Add the 'Class' column to the DataFrame.
    data["Class"] = classes
    return data

def calculate_mean(df, column):
    """Calculate mean of a numerical column."""
    return statistics.mean(df[column].dropna())

def calculate_variance(df, column):
    """Calculate variance of a numerical column."""
    return statistics.variance(df[column].dropna())

def mean_price_on_wednesday(df):
    """Calculate mean price on Wednesdays."""
    price_on_wed = df[df["Day"] == "Wed"]["Price"]
    return statistics.mean(price_on_wed)

def mean_price_in_month(df, month):
    """Calculate mean price for a specific month."""
    price_month = df[df["Month"] == month]["Price"]
    return statistics.mean(price_month)

def probability_of_loss(df):
    """Calculate probability of making a loss."""
    neg_chg = df[df["Chg%"] < 0]
    return len(neg_chg) / len(df)

def probability_of_profit_on_wednesday(df):
    """Calculate probability of making a profit on a Wednesday."""
    Wednesdays = df[df["Day"] == "Wed"]
    if len(Wednesdays) == 0:
        return 0
    profitable_Wednesdays = Wednesdays[Wednesdays["Chg%"] > 0]
    return len(profitable_Wednesdays) / len(Wednesdays)

def plot_chg_vs_day(df):
    """Plot Change% vs Day of the week."""
    plt.figure(figsize=(8, 5))
    plt.scatter(df["Day"], df["Chg%"], color="blue", alpha=0.6)
    plt.title("Chg% vs Day of the Week")
    plt.xlabel("Day of the Week")
    plt.ylabel("Chg%")
    plt.show()

def identify_columns(data):
    """Identify categorical and numerical columns."""
    categorical_cols = [col for col in data.columns if data[col].dtype == 'object']
    numerical_cols = [col for col in data.columns if data[col].dtype != 'object']
    return categorical_cols, numerical_cols

def encode_categorical_data(data, categorical_cols):
    """Encode categorical columns using Label Encoding or One-Hot Encoding."""
    encoded_data = data.copy()
    label_encoders = {}
    one_hot_columns = []

    for col in categorical_cols:
        unique_values = data[col].unique()
        if len(unique_values) <= 5:
            le = LabelEncoder()
            encoded_data[col] = le.fit_transform(data[col])
            label_encoders[col] = le
        else:
            one_hot_columns.append(col)

    encoded_data = pd.get_dummies(encoded_data, columns=one_hot_columns)
    return encoded_data, label_encoders

def check_missing_values(data):
    """Check for missing values in the dataset."""
    return {col: data[col].isnull().sum() for col in data.columns if data[col].isnull().sum() > 0}

def normalize_data(data, numerical_cols):
    """Normalize numerical columns using Min-Max Scaling."""
    scaler = MinMaxScaler()
    data[numerical_cols] = scaler.fit_transform(data[numerical_cols])
    return data

#A1
df1 = load_dataset("/Lab Session02 Data.xlsx", "Purchase Data")
# Segregate data into A (features) and C (target variable)
A, C = segregate_data(data)

# Print dimensions of matrices A and C
dimA, dimC = calculate_dimensions(A, C)
print(f"Dimensionality  of A: {dimA}")
print(f"Dimensions of C: {dimC}")

# Calculate the rank of matrix A
rank = calculate_rank(A)
print(f"Rank of matrix A: {rank}")

#A2
# Calculate the pseudo-inverse of A
pseudo_inv = calculate_pseudo_inverse(A)
print(f"Pseudo-inverse of A: {pseudo_inv}")

# Calculate the model vector X using pseudo-inverse
X = calculate_model_vector(pseudo_inv, C)
print(f"Model vector X: {X}")

#A3
# Classify customers as 'RICH' or 'POOR'
classes = RichPoorClassifier(data)

# Add the classification to the data
data = add_class_to_data(data, classes)

# Display the final dataframe with class labels
print(data[["Customer", "Candies (#)", "Mangoes (Kg)", "Milk Packets (#)", "Payment (Rs)", "Class"]])

#A4
df2 = load_dataset("/Lab Session02 Data.xlsx", "IRCTC Stock Price")
# Statistical calculations
print("Mean of price:", calculate_mean(df2, 'Price'))
print("Variance of price:", calculate_variance(df2, 'Price'))
print("Mean price on Wednesdays:", mean_price_on_wednesday(df2))
print("Mean price in April:", mean_price_in_month(df2, "Apr"))
print("Probability of making a loss:", probability_of_loss(df2))
print("Probability of making a profit on Wednesday:", probability_of_profit_on_wednesday(df2))

# Plot
plot_chg_vs_day(df2)

#A5
df3 = load_dataset("/Lab Session02 Data.xlsx", "thyroid0387_UCI")
#Datatypes: Record ID- ,age-ordinal, sex- nominal,on thyroxine- nominal,query on thyroxine, on antithyroid medication - nominal, sick- nominal, pregnant- nominal, thyroid surgery-nominal,I131 treatment-nominal, query hypothyroid-nominal, TSH-Ordinal
categorical_cols, numerical_cols = identify_columns(data)
# Encode categorical data
encoded_data, label_encoders = encode_categorical_data(df3, categorical_cols)
print("Data after encoding:")
print(encoded_data.head())

# Check for missing values
missing_values = check_missing_values(encoded_data)
if missing_values:
    print("Missing values:")
    for col, count in missing_values.items():
        print(f"{col}: {count} missing values")
else:
    print("No missing values")

# Calculate ranges for numerical columns
ranges = calculate_ranges(data, numerical_cols)
print("Ranges for numerical columns:")
for col, col_range in ranges.items():
    print(f"{col}: {col_range}")

# Detect outliers using boxplots
outliers = detect_outliers(encoded_data, numerical_cols)
if outliers:
    print("Outliers:")
    for col, outlier_values in outliers.items():
        print(f"{col}: {outlier_values}")
else:
    print("No outliers detected.")

# Calculate mean and variance for numerical columns
stats = calculate_mean_variance(encoded_data, numerical_cols)
print("Mean and variance of numerical columns:")
for col, values in stats.items():
    print(f"{col}: Mean = {values['mean']}, Variance = {values['variance']}")

#A6
# Impute missing values
encoded_data = impute_missing_values(encoded_data)
print("Data after imputation:")
print(encoded_data.head())

#A7
# Normalize numerical columns
data = normalize_data(encoded_data, numerical_cols)
print("Data after normalization:")
print(data.head())

#A8
# Similarity Measure (Jaccard Coefficient and SMC)
# Selecting the first two observation vectors with binary attributes
binary_cols = [col for col in encoded_data.columns if encoded_data[col].nunique() == 2]
vec1 = encoded_data.iloc[0][binary_cols].values
vec2 = encoded_data.iloc[1][binary_cols].values

jc, smc = calculate_jaccard_smc(vec1, vec2)
print(f"Jaccard Coefficient (JC): {jc}")
print(f"Simple Matching Coefficient (SMC): {smc}")

#A9
# Cosine Similarity Measure
# Select the first two observation vectors with all attributes
vec1_all = encoded_data.iloc[0].values
vec2_all = encoded_data.iloc[1].values
cosine_sim = calculate_cosine_similarity(vector1_all, vector2_all)
print(f"Cosine Similarity: {cosine_sim}")

#A10
# Heatmap Plot
plot_similarity_heatmaps(encoded_data)


FileNotFoundError: [Errno 2] No such file or directory: '/Lab Session02 Data.xlsx'