In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
file_path = r'C:\Users\NeilXu\Downloads\ToyotaCorolla.csv'  
data = pd.read_csv(file_path)

# Selecting relevant columns
features = ['Age_08_04', 'KM', 'HP']
X = data[features]
y = data['Price']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating and training the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Making predictions on the test set
y_pred = model.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Outputting the results
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# Outputting the coefficients
coefficients = pd.DataFrame(model.coef_, features, columns=['Coefficient'])
print(coefficients)

Mean Squared Error: 2530780.7798339296
R-squared: 0.8103259513824548
           Coefficient
Age_08_04  -153.696216
KM           -0.011313
HP           34.671966


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
file_path = r'C:\Users\NeilXu\Downloads\ToyotaCorolla.csv'  # Updated path
data = pd.read_csv(file_path)

# Selecting relevant columns (initially including more features)
features = ['Age_08_04', 'KM', 'HP', 'CC', 'Doors', 'Gears', 'Quarterly_Tax', 'Weight']
X = data[features]
y = data['Price']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating and training the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Making predictions on the test set
y_pred = model.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Outputting the results
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# Outputting the coefficients
coefficients = pd.DataFrame(model.coef_, features, columns=['Coefficient'])

# Identifying the top 4 drivers for price
top_4_drivers = coefficients.reindex(coefficients['Coefficient'].abs().sort_values(ascending=False).index).head(4)

print("Top 4 drivers for price:")
print(top_4_drivers)

Mean Squared Error: 1950244.987790427
R-squared: 0.8538352805672252
Top 4 drivers for price:
           Coefficient
Gears       566.402069
Age_08_04  -119.692339
HP           28.521533
Weight       19.363926


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFECV
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
file_path = r'C:\Users\NeilXu\Downloads\ToyotaCorolla.csv'  # Updated path
data = pd.read_csv(file_path)

# Dropping non-numeric columns for simplicity and the ID column
data = data.select_dtypes(include=[float, int]).drop(columns=['Id'])

# Removing the target column from the feature set
X = data.drop(columns=['Price'])
y = data['Price']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating the linear regression model
model = LinearRegression()

# Using RFECV to find the best combination of features
rfecv = RFECV(estimator=model, step=1, scoring='r2', cv=5)
rfecv = rfecv.fit(X_train, y_train)

# Making predictions on the test set
y_pred = rfecv.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Outputting the results
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# Outputting the selected features
selected_features = X.columns[rfecv.support_]
print("Selected features:")
print(selected_features)

# Outputting the coefficients of the selected features
coefficients = pd.DataFrame(rfecv.estimator_.coef_, selected_features, columns=['Coefficient'])
print(coefficients)

Mean Squared Error: 1662254.357930122
R-squared: 0.8754192712331831
Selected features:
Index(['Age_08_04', 'Mfg_Month', 'Mfg_Year', 'HP', 'Met_Color', 'Automatic',
       'Doors', 'Gears', 'Weight', 'Mfr_Guarantee', 'BOVAG_Guarantee',
       'Guarantee_Period', 'ABS', 'Airbag_1', 'Airbag_2', 'Airco',
       'Automatic_airco', 'Boardcomputer', 'CD_Player', 'Central_Lock',
       'Powered_Windows', 'Power_Steering', 'Radio', 'Mistlamps',
       'Sport_Model', 'Backseat_Divider', 'Metallic_Rim', 'Radio_cassette',
       'Parking_Assistant', 'Tow_Bar'],
      dtype='object')
                   Coefficient
Age_08_04          -143.120848
Mfg_Month          -109.759185
Mfg_Year             21.073336
HP                   28.274750
Met_Color            32.817364
Automatic           617.162007
Doors                68.937271
Gears               131.761640
Weight               10.827038
Mfr_Guarantee       342.551186
BOVAG_Guarantee     562.181216
Guarantee_Period     78.904168
ABS                

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFECV
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
file_path = r'C:\Users\NeilXu\Downloads\ToyotaCorolla.csv'  # Updated path
data = pd.read_csv(file_path)

# Dropping non-numeric columns for simplicity and the ID column
data = data.select_dtypes(include=[float, int]).drop(columns=['Id'])

# Removing the target column from the feature set
X = data.drop(columns=['Price'])
y = data['Price']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating the linear regression model
model = LinearRegression()

# Using RFECV to find the best combination of features
rfecv = RFECV(estimator=model, step=1, scoring='r2', cv=5)
rfecv = rfecv.fit(X_train, y_train)

# Extracting the ranking of features
ranking = pd.Series(rfecv.ranking_, index=X.columns).sort_values()
top_5_features = ranking[ranking == 1].index[:5]

# Building a new model with the top 5 features
X_train_top5 = X_train[top_5_features]
X_test_top5 = X_test[top_5_features]

model_top5 = LinearRegression()
model_top5.fit(X_train_top5, y_train)

# Making predictions on the test set
y_pred_top5 = model_top5.predict(X_test_top5)

# Evaluating the new model
mse_top5 = mean_squared_error(y_test, y_pred_top5)
r2_top5 = r2_score(y_test, y_pred_top5)

# Outputting the results
print(f"Mean Squared Error with top 5 features: {mse_top5}")
print(f"R-squared with top 5 features: {r2_top5}")

# Outputting the selected top 5 features and their coefficients
coefficients_top5 = pd.DataFrame(model_top5.coef_, top_5_features, columns=['Coefficient'])
print("Top 5 features and their coefficients:")
print(coefficients_top5)

Mean Squared Error with top 5 features: 3075474.0685138763
R-squared with top 5 features: 0.7695029049368791
Top 5 features and their coefficients:
                  Coefficient
Age_08_04         -168.269847
Radio_cassette    -335.352506
Metallic_Rim       655.747233
Backseat_Divider  -334.469328
Sport_Model        534.911408


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFECV
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
file_path = r'C:\Users\NeilXu\Downloads\ToyotaCorolla.csv'  # Updated path
data = pd.read_csv(file_path)

# Dropping non-numeric columns for simplicity and the ID column
data = data.select_dtypes(include=[float, int]).drop(columns=['Id'])

# Removing the target column from the feature set
X = data.drop(columns=['Price'])
y = data['Price']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating the linear regression model
model = LinearRegression()

# Using RFECV to find the best combination of features
rfecv = RFECV(estimator=model, step=1, scoring='r2', cv=5)
rfecv = rfecv.fit(X_train, y_train)

# Extracting the ranking of features
ranking = pd.Series(rfecv.ranking_, index=X.columns).sort_values()
top_5_features = ranking[ranking == 1].index[:5]

# Building a new model with the top 5 features
X_train_top5 = X_train[top_5_features]
X_test_top5 = X_test[top_5_features]

model_top5 = LinearRegression()
model_top5.fit(X_train_top5, y_train)

# Making predictions on the test set
y_pred_top5 = model_top5.predict(X_test_top5)

# Evaluating the new model
mse_top5 = mean_squared_error(y_test, y_pred_top5)
r2_top5 = r2_score(y_test, y_pred_top5)

# Outputting the results
print(f"Mean Squared Error with top 5 features: {mse_top5}")
print(f"R-squared with top 5 features: {r2_top5}")

# Outputting the selected top 5 features and their coefficients
coefficients_top5 = pd.DataFrame(model_top5.coef_, top_5_features, columns=['Coefficient'])
print("Top 5 features and their coefficients:")
print(coefficients_top5)

Mean Squared Error with top 5 features: 3075474.0685138763
R-squared with top 5 features: 0.7695029049368791
Top 5 features and their coefficients:
                  Coefficient
Age_08_04         -168.269847
Radio_cassette    -335.352506
Metallic_Rim       655.747233
Backseat_Divider  -334.469328
Sport_Model        534.911408
