<a href="https://colab.research.google.com/github/michaeledge27/CSCI290/blob/main/notebooks/partnerProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

In [2]:
url = "https://github.com/michaeledge27/CSCI290/raw/refs/heads/main/data/MBA.csv"
MBAdf = pd.read_csv(url)

In [3]:
colsInUse = ['gpa', 'major', 'gmat']
mbadf = MBAdf[colsInUse]
mbadf.head()

Unnamed: 0,gpa,major,gmat
0,3.3,Business,620.0
1,3.28,Humanities,680.0
2,3.3,Business,710.0
3,3.47,STEM,690.0
4,3.35,STEM,590.0


In [4]:
mbadf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6194 entries, 0 to 6193
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   gpa     6194 non-null   float64
 1   major   6194 non-null   object 
 2   gmat    6194 non-null   float64
dtypes: float64(2), object(1)
memory usage: 145.3+ KB


In [5]:
colsInUseNotTarget = ['gpa', 'major']

# Attribute Selection Method

In [6]:
import numpy as np
import math

In [7]:
# Calculate the entropy for a categorical feature
def categorical_entropy(df, target, feature):
    entropyList = []  # List to store entropy values
    uniqueVals = df[feature].unique()  # Get unique values for the feature
    overall = len(df)  # Total number of rows
    for val in uniqueVals:
        entropy = 0  # Initialize entropy for each unique value
        subset_size = len(df[df[feature] == val])  # Size of subset for the current value
        weight = subset_size / overall  # Weight of the subset
        props = df[df[feature] == val][target].value_counts(normalize=True)  # Get the proportion of target values
        for p in props:
          entropy -= weight * (p * math.log2(p))  # Calculate entropy for the current value
          entropyList.append(entropy)  # Append entropy value to the list
    return min(entropyList)  # Return the minimum entropy value

In [8]:
# Calculate the Gini index for a categorical feature
def categorical_gini(df, target, feature):
    giniList = []  # List to store Gini values
    uniqueVals = df[feature].unique()  # Get unique values for the feature
    overall = len(df)  # Total number of rows
    for val in uniqueVals:
        subset_size = len(df[df[feature] == val])  # Size of subset for the current value
        weight = subset_size / overall  # Weight of the subset
        props = df[df[feature] == val][target].value_counts(normalize=True)  # Get the proportion of target values
        gini = 1 - np.sum(np.square(props))  # Calculate Gini for the current value
        giniList.append(weight * gini)  # Append weighted Gini value to the list
    return min(giniList)  # Return the minimum Gini value

In [9]:
# Calculate the entropy for a quantitative feature
def quantitative_entropy(df, target, feature):
    entropyList = []  # List to store entropy values
    splitPoints = []  # List to store split points
    uniqueVals = np.sort(df[feature].unique())  # Sorted unique values for the feature
    overall = len(df)  # Total number of rows
    for val in uniqueVals:
        entropy = 0  # Initialize entropy for each unique value
        # Split the data into two subsets: <= val and > val
        left = df[df[feature] <= val][[feature, target]]
        right = df[df[feature] > val][[feature, target]]

        # Calculate the entropy for the left subset
        props_left = left[target].value_counts(normalize=True)
        weight_left = len(left) / overall
        for prop in props_left:
            if prop > 0:
                entropy -= weight_left * prop * math.log2(prop)

        # Calculate the entropy for the right subset
        props_right = right[target].value_counts(normalize=True)
        weight_right = len(right) / overall
        for prop in props_right:
            if prop > 0:
                entropy -= weight_right * prop * math.log2(prop)

        entropyList.append(entropy)  # Append entropy value to the list
        splitPoints.append(val)  # Append split point to the list

    min_entropy = np.min(entropyList)  # Get the minimum entropy value
    best_split = splitPoints[np.argmin(entropyList)]  # Find the corresponding split point
    return min_entropy, best_split


In [10]:
# Calculate the Gini index for a quantitative feature
def quantitative_gini(df, target, feature):
    giniList = []  # List to store Gini values
    splitPoints = []  # List to store split points
    uniqueVals = np.sort(df[feature].unique())  # Sorted unique values for the feature
    overall = len(df)  # Total number of rows
    for val in uniqueVals:
        gini = 0  # Initialize Gini index for each unique value
        # Split the data into two subsets: <= val and > val
        left = df[df[feature] <= val][[feature, target]]
        right = df[df[feature] > val][[feature, target]]

        # Calculate the Gini for the left subset
        props_left = left[target].value_counts(normalize=True)
        weight_left = len(left) / overall
        gini_left = 1 - np.sum(np.square(props_left))

        # Calculate the Gini for the right subset
        props_right = right[target].value_counts(normalize=True)
        weight_right = len(right) / overall
        gini_right = 1 - np.sum(np.square(props_right))

        # Combine Gini values and append to the list
        gini = weight_left * gini_left + weight_right * gini_right
        giniList.append(gini)
        splitPoints.append(val)

    min_gini = np.min(giniList)  # Get the minimum Gini value
    best_split = splitPoints[np.argmin(giniList)]  # Find the corresponding split point
    return min_gini, best_split

In [11]:
# Helper function to check if a feature is categorical
def isCategorical(df, feature):
    return df[feature].nunique() < 8  # A feature is considered categorical if it has fewer than 8 unique values


In [12]:
# Main attribute selection method function
def attribute_selection_method(df, target, measure):
    categoricalList = []  # Store results for categorical features
    categoricalFeatureList = []  # Store categorical feature names
    quantitativeList = []  # Store results for quantitative features
    quantitativeFeatureList = []  # Store quantitative feature names
    splitList = []  # Store best split points for quantitative features
    features = df.columns.drop(target)  # Get all features except the target

    for feature in features:
        if isCategorical(df, feature):  # If feature is categorical
            if measure == 'entropy':
                result = categorical_entropy(df, target, feature)
            elif measure == 'gini':
                result = categorical_gini(df, target, feature)
            categoricalList.append(result)
            categoricalFeatureList.append(feature)
        else:  # If feature is quantitative
            if measure == 'entropy':
                min_entropy, best_split = quantitative_entropy(df, target, feature)
                quantitativeList.append(min_entropy)
                splitList.append(best_split)
            elif measure == 'gini':
                min_gini, best_split = quantitative_gini(df, target, feature)
                quantitativeList.append(min_gini)
                splitList.append(best_split)
            quantitativeFeatureList.append(feature)

    # Determine the best feature and split point
    if categoricalList and min(categoricalList) <= min(quantitativeList):
        return categoricalFeatureList[np.argmin(categoricalList)]
    else:
        best_index = np.argmin(quantitativeList)
        return quantitativeFeatureList[best_index], splitList[best_index]

# Mean Squared Error

In [13]:
# Mean Squared Error for quantitative features for regression
def regression_mse_quantitative(df, target, feature):
    mseList = []
    splitPoints = []
    uniqueVals = np.sort(df[feature].unique())
    overall = len(df)

    for val in uniqueVals:
        # Split into left and right subsets
        left = df[df[feature] <= val][target]
        right = df[df[feature] > val][target]

        # Calculate MSE for left and right subsets
        if len(left) > 0:
            mse_left = np.mean((left - left.mean()) ** 2)
        else:
            mse_left = 0

        if len(right) > 0:
            mse_right = np.mean((right - right.mean()) ** 2)
        else:
            mse_right = 0

        # Weighted average of the MSEs
        weight_left = len(left) / overall
        weight_right = len(right) / overall
        mse = weight_left * mse_left + weight_right * mse_right

        mseList.append(mse)
        splitPoints.append(val)

    #return the minimum mean squared errror value as well as the split at the matching index
    min_mse = np.min(mseList)
    best_split = splitPoints[np.argmin(mseList)]
    return min_mse, best_split

In [14]:
# Mean Squared Error for regression with categorical features
def regression_mse_categorical(df, target, feature):
    mseList = []
    splitPoints = []
    uniqueVals = df[feature].unique()
    overall = len(df)

    for val in uniqueVals:
        # Split into left (matching category) and right (not matching category) subsets
        left = df[df[feature] == val][target]
        right = df[df[feature] != val][target]

        # Calculate MSE for left and right subsets
        if len(left) > 0:
            mse_left = np.mean((left - left.mean()) ** 2)
        else:
            mse_left = 0

        if len(right) > 0:
            mse_right = np.mean((right - right.mean()) ** 2)
        else:
            mse_right = 0

        # Weighted average of the MSEs
        weight_left = len(left) / overall
        weight_right = len(right) / overall
        mse = weight_left * mse_left + weight_right * mse_right

        mseList.append(mse)
        splitPoints.append(val)

    # Return the minimum MSE value and the best category split
    min_mse = np.min(mseList)
    best_split = splitPoints[np.argmin(mseList)]
    return min_mse, best_split

In [15]:
regression_mse_quantitative(mbadf, 'gmat', 'gpa')

(1882.2081312947516, 3.25)

In [16]:
regression_mse_quantitative(mbadf, 'gmat', 'major')

(2429.3693132952044, 'Business')

In [17]:
regression_mse_categorical(mbadf, 'gmat', 'major')

(2429.2303680295686, 'Humanities')

# SciKit Learn Stuff

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
target = mbadf['gmat']

In [None]:
train_set, test_set = train_test_split(mbadf, test_size=0.2)

In [None]:
X_train = train_set[colsInUseNotTarget]
y_train = train_set['gmat']
X_test = test_set[colsInUseNotTarget]
y_test = test_set['gmat']

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.tree import plot_tree

In [None]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4955 entries, 5227 to 532
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   gpa     4955 non-null   float64
 1   major   4955 non-null   object 
dtypes: float64(1), object(1)
memory usage: 116.1+ KB


In [None]:
num_attributes = X_train.select_dtypes(include = ['float64']).columns
cat_attributes = X_train.select_dtypes(include = ['object']).columns

In [None]:
trf = [
       ('cat', OneHotEncoder( handle_unknown='ignore'), cat_attributes) ]
col_transform = ColumnTransformer( transformers = trf )

In [None]:
pipeline = Pipeline( steps = [('pre', col_transform),
 ('clf', DecisionTreeRegressor(max_depth=3))])

In [None]:
cv_scores = cross_val_score( pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
cv_scores

array([-1584.80974861, -1682.05069525, -1722.59650572, -1639.29508188,
       -1650.42258339])

In [None]:
mse_scores = -cv_scores
mse_scores

array([1584.80974861, 1682.05069525, 1722.59650572, 1639.29508188,
       1650.42258339])

In [None]:
model = DecisionTreeRegressor(max_depth=3)

In [None]:
encoded = col_transform.fit_transform(train_set['major'].to_frame())

In [None]:
fittedModel = model.fit(encoded, train_set['gmat'])

In [None]:
predictions = fittedModel.predict(X_test)



ValueError: could not convert string to float: 'Business'