In [1]:
import pandas as pd
import numpy as np

# Read the dataset
auto_mg = pd.read_csv("auto-mpg.tsv", sep="\t")

# Drop null values
auto_mg = auto_mg.dropna()

auto_mg.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,-1.0,8.0,304.0,193.0,4732.0,18.5,70.0,1.0,hi 1200d
1,-1.0,8.0,307.0,200.0,4376.0,15.0,70.0,1.0,chevy c20
2,-1.0,8.0,360.0,215.0,4615.0,14.0,70.0,1.0,ford f250
3,-1.0,8.0,318.0,210.0,4382.0,13.5,70.0,1.0,dodge d200
4,-1.0,8.0,350.0,180.0,3664.0,11.0,73.0,1.0,oldsmobile omega


# 1.
The following are standardized to 'displacement', 'horsepower', 'weight', 'acceleration'. This is for the following reason: These columns have a large number of unique values and have a great range of which the values are spread over. Standardizing data makes sure that every part of the data has a fair impact, helps the model learn faster, and assists in making the model more stable and accurate.

The following have one-hot method applied on them 'cylinders', 'origin', 'model_year'. This if for the following reason: These columns have very less unique values and more of catagorical variables and the values itself have little significance. One-hot encoding turns categories into numbers, creating a new column for each category. This way, the model can work better with categorical information.



In [2]:
numeric_features = ['displacement', 'horsepower', 'weight', 'acceleration']
categorical_features = ['cylinders', 'origin', 'model_year']

# Preprocess numerical features (standardization)
for feature in numeric_features:
    mean_value = auto_mg[feature].mean()
    std_dev = auto_mg[feature].std()
    auto_mg[feature] = (auto_mg[feature] - mean_value) / std_dev

# Preprocess categorical features (one-hot encoding)
for feature in categorical_features:
    unique_values = auto_mg[feature].unique()
    for value in unique_values:
        new_feature_name = f"{feature}_{value}"
        auto_mg[new_feature_name] = (auto_mg[feature] == value).astype(int)


In [3]:
auto_mg["car_name"].unique()

array(['hi 1200d', 'chevy c20', 'ford f250', 'dodge d200',
       'oldsmobile omega', 'chevrolet impala', 'mercury marquis',
       'oldsmobile delta 88 royale', 'oldsmobile vista cruiser',
       'dodge monaco (sw)', 'ford country', 'mercury marquis brougham',
       'buick electra 225 custom', 'ford mustang ii', 'ford f108',
       'ford gran torino (sw)', 'chevrolet chevelle concours (sw)',
       'dodge d100', 'plymouth volare premier v8', 'chevrolet malibu',
       'chevy c10', 'buick century luxus (sw)', 'buick lesabre custom',
       'buick century 350', 'ford ltd', 'plymouth custom suburb',
       'amc ambassador brougham', 'chevrolet caprice classic',
       'ford country squire (sw)', 'pontiac safari (sw)',
       'chrysler newport royal', 'chrysler new yorker brougham',
       'ford gran torino', 'amc matador', 'amc matador (sw)',
       'plymouth satellite custom (sw)', 'plymouth fury iii',
       'plymouth fury gran sedan', 'dodge coronet custom (sw)',
       "plymouth 'cu

# 2.
As we can see, there is a great number of unique values in the car_name column.

For logistic regression, we can use Hashing method. Hash encoding can be useful when dealing with a large number of unique categories, as it avoids creating a large number of new features.

Logistic regression effectively incorporates hash-encoded categorical variables, representing each category with a fixed-size hash. The algorithm leverages these encoded features to make predictions with interpretable coefficients, facilitating binary classification tasks.



In [4]:
# Using Python's hash function
auto_mg['car_name_hashed'] = auto_mg['car_name'].apply(hash)

# 3.

For decision tree modelling, we can use Label Encoding method. Label Encoding is often a good choice for decision trees. It preserves ordinal relationships. Decision trees can effectively split and branch based on integer-encoded categories.

Label Encoding is beneficial for decision trees as it converts categorical variables into integer labels. Decision trees naturally handle such encoded features, preserving ordinal relationships and facilitating effective branching for classification tasks.




In [5]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
auto_mg['car_name_encoded'] = label_encoder.fit_transform(auto_mg['car_name'])

For decision tree, Label Encoding is applied on car_name to make the information useful.

# 4.
As we see below, the correlation is very less for car_name_encoded as well as car_name_hashed. So, intuitively it doesn't add much value to the model

In [10]:
import pandas as pd

df = auto_mg  

# Check the structure of the dataset
#print(df.info())

# Descriptive statistics
#print(df.describe())

# Correlation matrix
#correlation_matrix = df.corr()
#print(correlation_matrix)

# Correlation with the target variable (mpg)
correlation_with_mpg = df.corr()['mpg'].sort_values(ascending=False)
print(correlation_with_mpg)


mpg                 1.000000
cylinders_4.0       0.811320
horsepower          0.588761
origin              0.513698
model_year          0.429904
origin_3.0          0.387922
acceleration        0.346822
displacement        0.333174
origin_2.0          0.269484
model_year_82.0     0.268685
car_name_encoded    0.263121
model_year_80.0     0.251832
model_year_81.0     0.217918
model_year_74.0     0.041005
car_name_hashed     0.032237
cylinders_5.0       0.029273
model_year_79.0     0.029239
model_year_77.0    -0.019811
model_year_71.0    -0.050366
cylinders_3.0      -0.050767
model_year_78.0    -0.053000
model_year_76.0    -0.054384
model_year_75.0    -0.076767
model_year_72.0    -0.118864
model_year_70.0    -0.146197
model_year_73.0    -0.235970
cylinders_6.0      -0.380901
origin_1.0         -0.532206
cylinders_8.0      -0.562217
weight             -0.757757
cylinders          -0.759194
Name: mpg, dtype: float64


# 5.
Based on choices made above, the following is a feature matrix

In [7]:
# Separate features and target variable
X = auto_mg.drop(['mpg','car_name'], axis=1).copy()
y = auto_mg['mpg'].copy()

# Shuffle the dataset
np.random.seed(42)
shuffle_indices = np.random.permutation(X.index)
X = X.loc[shuffle_indices].reset_index(drop=True)
y = y.loc[shuffle_indices].reset_index(drop=True)

# Display the preprocessed feature matrix
print("Preprocessed Feature Matrix:")
print(X)

Preprocessed Feature Matrix:
     cylinders  displacement  horsepower    weight  acceleration  model_year  \
0          6.0     -0.431000   -1.195391  0.770442      0.891190        74.0   
1          4.0     -0.841010    1.318988 -0.491621     -0.486188        78.0   
2          4.0     -0.768266    1.229188 -0.462189     -0.377448        82.0   
3          6.0     -0.457453    0.630527  0.534983      1.978595        75.0   
4          4.0      1.691792   -0.087867 -0.756513      2.957258        80.0   
..         ...           ...         ...       ...           ...         ...   
387        8.0     -0.126800   -1.015793  1.397942     -1.501098        77.0   
388        6.0     -0.520277   -1.030759  0.550288     -0.776162        78.0   
389        4.0     -0.890608    1.378854 -0.797719     -0.304954        78.0   
390        4.0     -0.926980    0.570660 -0.862470      0.492476        82.0   
391        8.0     -0.275593   -1.090626  1.062412     -1.102384        77.0   

     origi

# 6.
This is a 10 fold cross validation method code to train and evaluate a decision tree classifier on this data(without using sklearn or other libraries). sklearn.tree.DecisionTreeClassifier is used to build the tree for each fold.  

In [8]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# Function to split the data into k folds
def kfold_split(X, y, k):
    fold_size = len(X) // k
    folds_X = [X[i * fold_size:(i + 1) * fold_size] for i in range(k)]
    folds_y = [y[i * fold_size:(i + 1) * fold_size] for i in range(k)]
    return folds_X, folds_y

# Function to calculate accuracy manually
def calculate_accuracy(y_true, y_pred):
    correct = np.sum(y_true == y_pred)
    total = len(y_true)
    accuracy = correct / total
    return accuracy

# Function to perform k-fold cross-validation
def cross_validate(X, y, k):
    folds_X, folds_y = kfold_split(X, y, k)
    scores = []

    for i in range(k):
        # Use the i-th fold as the validation set, rest as training set
        X_train = np.concatenate([fold for j, fold in enumerate(folds_X) if j != i])
        y_train = np.concatenate([fold for j, fold in enumerate(folds_y) if j != i])
        X_val = folds_X[i]
        y_val = folds_y[i]

        # Train decision tree on training set using DecisionTreeClassifier
        tree = DecisionTreeClassifier()
        tree.fit(X_train, y_train)

        # Evaluate on validation set
        y_pred = tree.predict(X_val)

        # Calculate accuracy manually
        accuracy = calculate_accuracy(y_val, y_pred)
        scores.append(accuracy)

    # Calculate the mean accuracy across all folds
    mean_accuracy = np.mean(scores)
    return mean_accuracy

# Example usage:
# Assuming X and y are your preprocessed feature matrix and target variable
# k is the number of folds, and criterion is one of 'gini', 'entropy', or 'mse'
k = 10
mean_accuracy = cross_validate(X, y, k)
print(f'Mean Accuracy : {mean_accuracy}')


Mean Accuracy : 0.8743589743589745


# 7. 
Improving accuracy using the following parameters: 
tree_params = {
    'max_depth': 5,
    'min_samples_split': 10,
    'min_samples_leaf': 5,
    'max_features': 'sqrt',  
    'criterion': 'gini' 
}


In [9]:
tree_params = {
    'max_depth': 10,
    'min_samples_split': 10,
    'min_samples_leaf': 8,
    'max_features': 'sqrt',  
    'criterion': 'gini' }
def cross_validate(X, y, k):
    folds_X, folds_y = kfold_split(X, y, k)
    scores = []

    for i in range(k):
        # Use the i-th fold as the validation set, rest as training set
        X_train = np.concatenate([fold for j, fold in enumerate(folds_X) if j != i])
        y_train = np.concatenate([fold for j, fold in enumerate(folds_y) if j != i])
        X_val = folds_X[i]
        y_val = folds_y[i]

        # Train decision tree on training set using DecisionTreeClassifier
        tree = DecisionTreeClassifier(**tree_params)
        tree.fit(X_train, y_train)

        # Evaluate on validation set
        y_pred = tree.predict(X_val)

        # Calculate accuracy manually
        accuracy = calculate_accuracy(y_val, y_pred)
        scores.append(accuracy)

    # Calculate the mean accuracy across all folds
    mean_accuracy = np.mean(scores)
    return mean_accuracy

# Example usage:
# Assuming X and y are your preprocessed feature matrix and target variable
# k is the number of folds, and criterion is one of 'gini', 'entropy', or 'mse'
k = 10
mean_accuracy = cross_validate(X, y, k)
print(f'Mean Accuracy : {mean_accuracy}')


Mean Accuracy : 0.9
