In [7]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the dataset directly from the UCI repository.
# header=None because the first line is read as a data record, not as column names.
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data'
df = pd.read_csv(data_url, header=None)

# Assign column names (based on the dataset documentation)
df.columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'evaluation']

# Check for missing values
print("Missing Values:\n", df.isnull().sum())

# Display dataset information.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called on its own line; wrapping it in print() would emit the report first and
# then the header followed by a stray "None".
print("Dataset Info:")
df.info()

# Display the first few rows of the dataset
print("First 5 rows of the dataset:\n", df.head())

# Normalise the column labels: lowercase first, then trim surrounding whitespace
lowercased_labels = df.columns.str.lower()
df.columns = lowercased_labels.str.strip()

# Apply the same normalisation to the values of every object-dtype column
object_columns = df.select_dtypes(include=['object']).columns
for col in object_columns:
    lowercased_values = df[col].str.lower()
    df[col] = lowercased_values.str.strip()

cleaned_preview = df.head()
print("Cleaned dataset:\n", cleaned_preview)

# One-hot encode the six feature columns; drop_first=True omits the first level
# of each feature. The 'evaluation' column is not listed, so it is left as-is.
feature_columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
df = pd.get_dummies(df, columns=feature_columns, drop_first=True)

# Encode the categorical target 'evaluation' as ordered integers so that it can
# be used as a regression target.
# The keys must be the labels exactly as they appear in the data file (e.g.
# 'unacc'), not their spelled-out meanings: Series.map() turns any label that is
# missing from the mapping into NaN, and those rows are dropped further down.
evaluation_mapping = {
    'unacc': 0,   # unacceptable
    'acc': 1,     # acceptable
    'good': 2,    # good
    'vgood': 3    # very good
}

# Keep the raw labels so that any unmapped ones can still be reported after the
# column has been overwritten with the numeric codes.
evaluation_labels = df['evaluation'].copy()
df['evaluation'] = evaluation_labels.map(evaluation_mapping)

# Check for any NaNs in the evaluation column after mapping and show which raw
# labels caused them (the mapped column itself would only show NaN).
unmapped_mask = df['evaluation'].isnull()
if unmapped_mask.any():
    print("Warning: There are NaNs in the evaluation column after mapping.")
    print("Unmapped values in 'evaluation':", evaluation_labels[unmapped_mask].unique())

# Drop rows with NaN values in the target column if any exist
# (reassignment instead of inplace=True; the resulting frame is the same).
df = df.dropna(subset=['evaluation'])

# Check again for missing values
print("Missing Values after cleaning:\n", df.isnull().sum())

# Target is the 'evaluation' column; every remaining column is a feature
target_column = 'evaluation'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

# Create the linear regression model and fit it on the training split
# (fit() returns the estimator itself, so construction and fitting are chained)
model = LinearRegression().fit(X_train, y_train)

# Score the held-out split with Mean Squared Error (MSE)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


Missing Values:
 buying        0
maint         0
doors         0
persons       0
lug_boot      0
safety        0
evaluation    0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   buying      1728 non-null   object
 1   maint       1728 non-null   object
 2   doors       1728 non-null   object
 3   persons     1728 non-null   object
 4   lug_boot    1728 non-null   object
 5   safety      1728 non-null   object
 6   evaluation  1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB
Dataset Info:
 None
First 5 rows of the dataset:
   buying  maint doors persons lug_boot safety evaluation
0  vhigh  vhigh     2       2    small    low      unacc
1  vhigh  vhigh     2       2    small    med      unacc
2  vhigh  vhigh     2       2    small   high      unacc
3  vhigh  vhigh     2       2      med    low      unacc
4  vhigh  vhigh 