Title: Train a Linear Regression Model

Task 1: Predicting House Prices<br>
Dataset: Use a dataset that contains various features of houses such as square footage, number of bedrooms, and age of the house.

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import kagglehub

# Download the California Housing Prices dataset from Kaggle
path = kagglehub.dataset_download("camnugent/california-housing-prices")
print("Path to dataset files:", path)

# Load the dataset (replace 'housing.csv' with the actual file name in the downloaded dataset)
file_path = f"{path}/housing.csv"
df = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("Dataset Preview:")
print(df.head())

# Define features (X) and target (y)
# Selecting relevant features for house price prediction
X = df[["median_income", "housing_median_age", "total_rooms"]]
y = df["median_house_value"]

# Feature scaling (optional but recommended for better performance)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize the Linear Regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nModel Evaluation:")
print("Mean Squared Error:", mse)
print("R-squared:", r2)

# Display the model coefficients
print("\nModel Coefficients:")
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/camnugent/california-housing-prices?dataset_version_number=1...


100%|██████████| 400k/400k [00:01<00:00, 345kB/s]

Extracting files...
Path to dataset files: /home/vscode/.cache/kagglehub/datasets/camnugent/california-housing-prices/versions/1
Dataset Preview:
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       




Task 2: Predicting Car Mileage (MPG)<br>
Dataset: Use a dataset where features include horsepower, weight, and model year of cars.

In [8]:
# Import necessary libraries
from ucimlrepo import fetch_ucirepo
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Fetch the Auto MPG dataset from the UCI Machine Learning Repository
auto_mpg = fetch_ucirepo(id=9)

# Extract features and target from the dataset
X = auto_mpg.data.features  # Features (e.g., Horsepower, Weight, Model Year)
y = auto_mpg.data.targets   # Target (MPG)

# Display metadata and variable information
print("Dataset Metadata:")
print(auto_mpg.metadata)
print("\nVariable Information:")
print(auto_mpg.variables)

# Convert features and target to Pandas DataFrame for easier manipulation
X = pd.DataFrame(X, columns=auto_mpg.variables['features'])  # Correctly access column names
y = pd.Series(y, name="MPG")

# Select relevant features for prediction
X = X[["horsepower", "weight", "model_year"]]

# Handle missing values (if any)
X = X.dropna()
y = y[X.index]  # Ensure target matches the filtered features

# Feature scaling (optional but recommended for better performance)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize the Linear Regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nModel Evaluation:")
print("Mean Squared Error:", mse)
print("R-squared:", r2)

# Display the model coefficients
print("\nModel Coefficients:")
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

Dataset Metadata:
{'uci_id': 9, 'name': 'Auto MPG', 'repository_url': 'https://archive.ics.uci.edu/dataset/9/auto+mpg', 'data_url': 'https://archive.ics.uci.edu/static/public/9/data.csv', 'abstract': 'Revised from CMU StatLib library, data concerns city-cycle fuel consumption', 'area': 'Other', 'tasks': ['Regression'], 'characteristics': ['Multivariate'], 'num_instances': 398, 'num_features': 7, 'feature_types': ['Real', 'Categorical', 'Integer'], 'demographics': [], 'target_col': ['mpg'], 'index_col': ['car_name'], 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1993, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5859H', 'creators': ['R. Quinlan'], 'intro_paper': None, 'additional_info': {'summary': 'This dataset is a slightly modified version of the dataset provided in the StatLib library.  In line with the use by Ross Quinlan (1993) in predicting the attribute "mpg", 8 of the original instances were removed because they had unk

KeyError: 'features'

Task 3: Predicting Student's Scores'
Dataset: Use a dataset that contains study hours and corresponding student test scores.

Dataset Metadata:
{'uci_id': 9, 'name': 'Auto MPG', 'repository_url': 'https://archive.ics.uci.edu/dataset/9/auto+mpg', 'data_url': 'https://archive.ics.uci.edu/static/public/9/data.csv', 'abstract': 'Revised from CMU StatLib library, data concerns city-cycle fuel consumption', 'area': 'Other', 'tasks': ['Regression'], 'characteristics': ['Multivariate'], 'num_instances': 398, 'num_features': 7, 'feature_types': ['Real', 'Categorical', 'Integer'], 'demographics': [], 'target_col': ['mpg'], 'index_col': ['car_name'], 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1993, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5859H', 'creators': ['R. Quinlan'], 'intro_paper': None, 'additional_info': {'summary': 'This dataset is a slightly modified version of the dataset provided in the StatLib library.  In line with the use by Ross Quinlan (1993) in predicting the attribute "mpg", 8 of the original instances were removed because they had unk

KeyError: 'features'