# Part 1: Decision Trees with Categorical Attributes

This part uses the adult data set (https://archive.ics.uci.edu/ml/datasets/Adult) from the UCI Machine Learning Repository to predict whether the income of an individual exceeds 50K per year based on 14 attributes. For this part, the attribute fnlwgt is dropped and the following attributes are taken into consideration:

    age: age group
    workclass: type of employment
    education: level of education reached
    education-num: number of education years
    marital-status: type of marital status
    occupation: occupation domain
    relationship: type of relationship involved
    race: social category
    sex: male or female
    capital-gain: class of capital gains
    capital-loss: class of capital losses
    hours-per-week: category of working hours
    native-country: country of birth

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import sklearn.tree as tree

def read_csv_1(data_file):
    """Load the data set from *data_file* and discard the ``fnlwgt`` column.

    Args:
        data_file: Anything ``pd.read_csv`` accepts (the driver cell below
            passes the path ``'data/adult.csv'``).

    Returns:
        A pandas dataframe holding every column of the file except ``fnlwgt``.

    Raises:
        KeyError: If the file has no ``fnlwgt`` column.
    """
    raw = pd.read_csv(data_file)
    return raw.drop(columns='fnlwgt')

def num_rows(df):
    """Return how many rows (instances) the pandas dataframe *df* holds."""
    row_count, _ = df.shape
    return row_count

def column_names(df):
    """Return the column labels of the pandas dataframe *df* as a plain list."""
    header = df.columns
    return header.values.tolist()

def missing_values(df):
    """Return the total count of missing cells in the pandas dataframe *df*."""
    # First collapse to one count per column, then add the columns together.
    missing_per_column = df.isnull().sum()
    return missing_per_column.sum()

def columns_with_missing_values(df):
    """Return the names of the columns of *df* that contain a missing value.

    The names are listed in the same order as the columns appear in *df*.
    """
    has_missing = df.isnull().any()
    return [name for name, flagged in has_missing.items() if flagged]

def bachelors_masters_percentage(df):
    """Return the percentage of instances whose education is Bachelors or Masters.

    The percentage is taken over the instances that have a non-missing
    ``education`` value and is rounded to three decimal places
    (for example 21.8714 is returned as 21.871).

    Args:
        df: Pandas dataframe with an ``education`` column.

    Returns:
        The percentage as a float in the range 0-100. Returns 0.0 when there
        are no instances with a known education level.
    """
    education = df['education']

    # count() ignores missing entries, so the denominator only contains
    # instances whose education level is actually known.
    known = int(education.count())
    if known == 0:
        return 0.0

    # isin() yields False (rather than raising) when a category never occurs.
    matching = int(education.isin(['Bachelors', 'Masters']).sum())
    return round(matching / known * 100, 3)

def data_frame_without_missing_values(df):
    """Return a new dataframe with every incomplete instance removed.

    Args:
        df: Pandas dataframe to filter. It is left untouched.

    Returns:
        A new pandas dataframe containing only the rows of *df* that have no
        missing value in any column.
    """
    # dropna() without inplace=True builds a new frame, so the caller's
    # dataframe keeps all of its rows.
    return df.dropna()

def one_hot_encoding(df):
    """Return a one-hot encoded copy of the input attributes of *df*.

    The target attribute ``class`` is neither encoded nor included in the
    result. Each categorical (non-numeric) attribute is replaced by one 0/1
    indicator column per category, named ``<attribute>_<category>``.
    Attributes that are already numeric are passed through unchanged. A
    missing categorical value yields 0 in all indicator columns of that
    attribute.

    Args:
        df: Pandas dataframe with the input attributes and a ``class``
            column. It is left untouched.

    Returns:
        A new pandas dataframe with numeric columns only.

    Raises:
        KeyError: If *df* has no ``class`` column.
    """
    # drop() returns a new frame, so the caller's dataframe is not modified.
    attributes = df.drop(columns=['class'])

    # dtype=int gives 0/1 integers instead of booleans.
    return pd.get_dummies(attributes, dtype=int)

def label_encoding(df):
    """Return the ``class`` labels of *df* as a label-encoded pandas series.

    The labels are converted to integer codes with scikit-learn's
    ``LabelEncoder``. The result is a new series named ``class`` that keeps
    the index of *df*; *df* itself is not modified.
    """
    encoder = LabelEncoder()
    codes = encoder.fit_transform(df['class'])
    return pd.Series(codes, index=df.index, name='class')

def dt_predict(X_train, y_train):
    """Fit a decision tree on the training data and predict that same data.

    Args:
        X_train: Input attribute values of the training instances.
        y_train: Labels of the training instances.

    Returns:
        A pandas series (default integer index) with the label predicted for
        each row of *X_train*.
    """
    # random_state is fixed so repeated runs build the same tree.
    model = tree.DecisionTreeClassifier(random_state=0)

    # Train on the training set, then predict labels for that same set.
    predictions = model.fit(X_train, y_train).predict(X_train)

    return pd.Series(predictions)

def dt_error_rate(y_pred, y_true):
    """Return the error rate of the classifier that produced *y_pred*.

    Predictions and true labels are compared by position, so the two series
    do not need to share an index (for example when *y_true* keeps the row
    labels of a filtered dataframe while *y_pred* has a fresh integer index).

    Args:
        y_pred: Pandas series (or other sequence) with the predicted labels.
        y_true: Pandas series (or other sequence) with the true labels.

    Returns:
        The fraction of instances whose predicted label differs from the
        true label, as a float between 0 and 1.

    Raises:
        ValueError: If the inputs differ in length or are empty.
    """
    # Discard the original indexes so that eq() lines the values up by
    # position instead of by index label.
    predicted = pd.Series(y_pred).reset_index(drop=True)
    actual = pd.Series(y_true).reset_index(drop=True)

    total = len(actual)
    if len(predicted) != total:
        raise ValueError(
            'y_pred and y_true must have the same length '
            '(got {} and {})'.format(len(predicted), total)
        )
    if total == 0:
        raise ValueError('cannot compute an error rate for zero instances')

    correct = int(predicted.eq(actual).sum())
    return 1 - correct / total

In [2]:
# Run every Part 1 function on the adult data set.
# Feel free to comment out the entries/lines you don't want to run.

df = read_csv_1('data/adult.csv')

# Summary statistics that are printed as "<label> <value>".
for label, summarise in (
    ("Number of rows:", num_rows),
    ("Columns:", column_names),
    ("Number of missing values:", missing_values),
    ("Columns with missing values:", columns_with_missing_values),
):
    print(label, summarise(df))

print("Percentage of people with bachelors or masters degree:", bachelors_masters_percentage(df), "%")
# print("Number of rows without missing values:", len(data_frame_without_missing_values(df)))

# Build the training inputs and labels, then train and predict on them.
x_train = one_hot_encoding(df)
# print(x_train)
y_train = label_encoding(df)
# print(y_train)
y_pred = dt_predict(x_train, y_train)
# print(y_pred)
print("Error rate of classifier:", dt_error_rate(y_pred, y_train))

Number of rows: 48842
Columns: ['age', 'workclass', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capitalgain', 'capitalloss', 'hoursperweek', 'native-country', 'class']
Number of missing values: 6465
Columns with missing values: ['workclass', 'occupation', 'native-country']
Percentage of people with bachelors or masters degree: 21.871 %
Error rate of classifier: 0.0790098685557512
