In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.metrics import mutual_info_score

data = pd.read_csv("C:\Data\data.homework_3.csv")

data.columns = data.columns.str.replace(' ', '_').str.lower()
data.fillna(0, inplace=True)
data.rename(columns={'msrp': 'price'}, inplace=True)


Question 1: What is the most frequent observation (mode) for the column transmission_type?

In [2]:
most_frequent_transmission_type = data['transmission_type'].mode()[0]
print("Question 1 Answer:", most_frequent_transmission_type)

Question 1 Answer: AUTOMATIC


Question 2: What are the two features that have the biggest correlation in this dataset?

In [3]:
numerical_features = ['engine_hp', 'year', 'engine_cylinders', 'highway_mpg', 'city_mpg']
correlation_matrix = data[numerical_features].corr()
correlation_pair = correlation_matrix.unstack().sort_values(ascending=False).drop_duplicates()
print("Question 2 Answer:", correlation_pair[1:3].index.tolist())

data['above_average'] = (data['price'] > data['price'].mean()).astype(int)

X = data.drop(['price', 'above_average'], axis=1)
y = data['above_average']
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

Question 2 Answer: [('city_mpg', 'highway_mpg'), ('engine_cylinders', 'engine_hp')]


Question 3: Which of these variables has the lowest mutual information score?

In [4]:
categorical_features = ['make', 'model', 'transmission_type', 'vehicle_style']
mi_scores = {}
for feature in categorical_features:
    mi_score = mutual_info_score(y_train, X_train[feature])
    mi_scores[feature] = round(mi_score, 2)
lowest_mi_feature = min(mi_scores, key=mi_scores.get)
print("Question 3 Answer:", lowest_mi_feature)

Question 3 Answer: transmission_type


Question 4: What accuracy did you get?

In [5]:
vectorizer = DictVectorizer(sparse=False)
X_train_encoded = vectorizer.fit_transform(X_train[categorical_features].to_dict(orient='records'))
X_val_encoded = vectorizer.transform(X_val[categorical_features].to_dict(orient='records'))

model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train_encoded, y_train)
y_pred = model.predict(X_val_encoded)
accuracy = round(accuracy_score(y_val, y_pred), 2)
print("Question 4 Answer:", accuracy)

Question 4 Answer: 0.94


Question 5: Which of the following features has the smallest difference?

In [6]:
# Define the list of categorical and numerical features
categorical_features = ['make', 'model', 'transmission_type', 'vehicle_style']
numerical_features = ['year', 'engine_hp', 'city_mpg', 'highway_mpg']

# Combine categorical and numerical features
features = categorical_features + numerical_features

# Calculate the original accuracy score
original_score = accuracy

for feature_to_exclude in features:
    # Create a copy of the features list and exclude one feature at a time
    subset = features.copy()
    subset.remove(feature_to_exclude)

    # Prepare the training data
    train_dict = X_train[subset].to_dict(orient='records')

    # Initialize and fit the DictVectorizer
    dv = DictVectorizer(sparse=False)
    dv.fit(train_dict)

    # Transform the training data
    X_train_temp = dv.transform(train_dict)

    # Initialize and train the logistic regression model
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train_temp, y_train)

    # Prepare the validation data
    val_dict = X_val[subset].to_dict(orient='records')
    X_val_temp = dv.transform(val_dict)

    # Make predictions on the validation data
    y_pred = model.predict(X_val_temp)

    # Calculate the accuracy score
    score = accuracy_score(y_val, y_pred)

    # Print the feature, the difference in accuracy, and the accuracy score
    print(feature_to_exclude, original_score - score, score)


make 0.010079731430969296 0.9299202685690307
model 0.020990348300461537 0.9190096516995384
transmission_type 0.010919009651699452 0.9290809903483005
vehicle_style 0.004204783885858099 0.9357952161141418
year -0.004607637431808698 0.9446076374318086
engine_hp 0.017633235417540916 0.922366764582459
city_mpg 0.005463701216953387 0.9345362987830466
highway_mpg 0.005463701216953387 0.9345362987830466


Question 6: Which of these alphas leads to the best RMSE on the validation set?

In [7]:
X_train_encoded = vectorizer.fit_transform(X_train[categorical_features].to_dict(orient='records'))
X_val_encoded = vectorizer.transform(X_val[categorical_features].to_dict(orient='records'))

alphas = [0, 0.01, 0.1, 1, 10]
best_alpha = None
best_rmse = float('inf')

for alpha in alphas:
    model = Ridge(alpha=alpha, solver='sag', random_state=42)
    model.fit(X_train_encoded, y_train)
    y_pred = model.predict(X_val_encoded)
    rmse = round(np.sqrt(mean_squared_error(y_val, y_pred)), 3)

    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha

print("Question 6 Answer:", best_alpha)

Question 6 Answer: 0
