In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_absolute_error

In [26]:
# Load your CSV data
data = pd.read_csv('https://raw.githubusercontent.com/msyturk/classification-and-regression/main/cars.csv')

# Drop rows with missing values in the label column
data = data.dropna(subset=[data.columns[-1]])

# Assuming the last column is the output and the rest are inputs
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [29]:
# Identify categorical columns
categorical_columns = ['Origin']

# Identify numerical columns
numerical_columns = list(set(X.columns) - set(categorical_columns))

# Create transformers for numerical and categorical columns
numerical_transformer = SimpleImputer(strategy='median')  # Median imputation for numerical columns
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Create a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

In [30]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply transformations separately
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

In [31]:
# Method 1: Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train_transformed, y_train)
lr_pred = lr_model.predict(X_test_transformed)
lr_r2 = r2_score(y_test, lr_pred)
lr_mae = mean_absolute_error(y_test, lr_pred)

In [32]:
# Method 2: Random Forest Regressor
rf_model = RandomForestRegressor()
rf_model.fit(X_train_transformed, y_train)
rf_pred = rf_model.predict(X_test_transformed)
rf_r2 = r2_score(y_test, rf_pred)
rf_mae = mean_absolute_error(y_test, rf_pred)

In [33]:
# Method 3: Support Vector Regressor (SVR)
svr_model = SVR()
svr_model.fit(X_train_transformed, y_train)
svr_pred = svr_model.predict(X_test_transformed)
svr_r2 = r2_score(y_test, svr_pred)
svr_mae = mean_absolute_error(y_test, svr_pred)

In [35]:
# Compare the methods
results = pd.DataFrame({
    'Method': ['Linear Regression', 'Random Forest Regressor', 'SVR'],
    'R-squared': [lr_r2, rf_r2, svr_r2],
    'MAE': [lr_mae, rf_mae, svr_mae]
})

# Print or save the results
results

Unnamed: 0,Method,R-squared,MAE
0,Linear Regression,0.848707,2.30139
1,Random Forest Regressor,0.909676,1.584575
2,SVR,0.743259,2.785389
