# Linear Regression Project & Classification Tree Homework: Part C - Classification

Discretize the last column, "Chance of Admit", into three classes and
build a classification tree from the training data.


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import os

# Path (relative to the current working directory) to the CSV file containing
# the data to be used for this part of the assignment.
# Built from individual components with os.path.join so the separators are
# correct on every operating system; a hard-coded backslash path only resolves
# on Windows because os.path.normpath does not translate "\" on POSIX systems.
path_to_data_file_csv: str = os.path.join(
    "data",
    "part_b_linear_regression_data",
    "admission_predict_ver1.1_small_data_set_for_linear_regression.csv",
)

# Class names for the discretized target, ordered from lowest to highest bin
admit_class_labels = ['Low', 'Medium', 'High']

# Read the whole data set from disk into a DataFrame
df = pd.read_csv(path_to_data_file_csv)

# Replace the continuous "Chance of Admit" values with categorical classes.
# Passing an integer for `bins` makes pd.cut split the observed value range
# into that many equal-width intervals, one per label.
df['Chance of Admit'] = pd.cut(
    df['Chance of Admit'],
    bins=len(admit_class_labels),
    labels=admit_class_labels,
)

# Separate the predictors from the target before splitting
features = df.drop(columns='Chance of Admit')  # every column except the target
target = df['Chance of Admit']  # discretized class labels

# Hold out 20% of the rows for testing and train on the remaining 80%
x_train: pd.DataFrame  # features used to fit the model
x_test: pd.DataFrame  # features used to evaluate the model
y_train: pd.Series  # class labels matching x_train
y_test: pd.Series  # class labels matching x_test
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,  # fixed (arbitrary) seed so the split is repeatable
)

# Create the decision tree classifier.
# A fixed random_state is passed because scikit-learn randomly permutes the
# features when searching for each split; without a seed the fitted tree (and
# therefore the reported accuracy) can change from run to run even though the
# train/test split itself is seeded.
clf: DecisionTreeClassifier = DecisionTreeClassifier(random_state=42)

# Fit the classifier to the training data
clf.fit(x_train, y_train)  # No need to scale the data for decision trees

# Predict the classes of the testing data.
# predict() returns a NumPy array of class labels (not a Python list), so the
# variable is deliberately left without a `list` annotation.
y_pred = clf.predict(x_test)

# Evaluate the performance of the classifier: fraction of test rows whose
# predicted class equals the true class
accuracy: float = accuracy_score(y_test, y_pred)

print('Accuracy:', accuracy)

Accuracy: 0.76


Test the tree with test data and evaluate the results in Python.
