In [None]:

import numpy as np
import pandas as pd

# --- Dataset configuration ---
num_samples = 500
outlier_fraction = 0.2  # share of rows whose price gets inflated (20%)

# Seed NumPy's global RNG so repeated runs draw identical values.
# The order of the random draws below determines the data; do not reorder them.
np.random.seed(42)

# --- Randomly drawn house features (randint upper bounds are exclusive) ---
area = np.random.randint(500, 5000, num_samples)
bedrooms = np.random.randint(1, 6, num_samples)
bathrooms = np.random.randint(1, 5, num_samples)
floors = np.random.randint(1, 4, num_samples)
year_built = np.random.randint(1900, 2023, num_samples)
location = np.random.choice(['Downtown', 'Suburban', 'Rural'], num_samples)
condition = np.random.choice(['Excellent', 'Good', 'Fair', 'Poor'], num_samples)
garage = np.random.choice(['Yes', 'No'], num_samples)

# --- Base price: linear in area, bedrooms, bathrooms and floors ---
# year_built is stored in the dataset but does not enter the price.
base_price = 5000 * area + 20000 * bedrooms + 15000 * bathrooms + 10000 * floors

# --- Multiplicative adjustments, looked up per categorical level ---
location_factor = {'Downtown': 1.2, 'Suburban': 1.0, 'Rural': 0.8}
condition_factor = {'Excellent': 1.3, 'Good': 1.1, 'Fair': 0.9, 'Poor': 0.7}
garage_factor = {'Yes': 1.1, 'No': 1.0}

location_adjustment = np.array([location_factor[name] for name in location])
condition_adjustment = np.array([condition_factor[name] for name in condition])
garage_adjustment = np.array([garage_factor[name] for name in garage])

# Final price = base price scaled by the three adjustments.
# No random noise is added here; the only randomness in price beyond the
# features themselves comes from the outlier step below.
price = base_price * location_adjustment * condition_adjustment * garage_adjustment

# --- Outliers: multiply a random 20% of the prices by a factor in [1.5, 3.0) ---
num_outliers = int(outlier_fraction * num_samples)
outlier_indices = np.random.choice(num_samples, num_outliers, replace=False)
outlier_multipliers = np.random.uniform(1.5, 3.0, num_outliers)
price[outlier_indices] = price[outlier_indices] * outlier_multipliers

# --- Assemble the table; Price is truncated to an integer ---
columns = {
    'Id': np.arange(1, num_samples + 1),
    'Area': area,
    'Bedrooms': bedrooms,
    'Bathrooms': bathrooms,
    'Floors': floors,
    'YearBuilt': year_built,
    'Location': location,
    'Condition': condition,
    'Garage': garage,
    'Price': price.astype(int),
}
df = pd.DataFrame(columns)

# --- Persist to CSV in the current working directory (no index column) ---
csv_file_path = "house_price_dataset_with_outliers.csv"
df.to_csv(csv_file_path, index=False)

print(f"✅ Dataset generated and saved as '{csv_file_path}' successfully!")
print(df.head())


In [None]:
# Simple Linear Regression
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the house-price data and preview the first rows.
# NOTE(review): the dataset-generation cell writes
# 'house_price_dataset_with_outliers.csv', not this file name — confirm that
# '/content/house_price_dataset.csv' is the intended input.
df = pd.read_csv("/content/house_price_dataset.csv")
print(df.head())

# Single predictor (Area) kept as a one-column DataFrame; target is Price.
x = df[['Area']]
y = df['Price']

# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

# Predict the price of one unseen house. The area is defined once so that the
# printed label always matches the value actually fed to the model (the label
# previously said 1562 while the prediction was made for 1500).
new_area = 1500
new_df = pd.DataFrame([[new_area]], columns=['Area'])
predicted_price = model.predict(new_df)
print(f"Predicted Price for Area {new_area}: {predicted_price[0]}")

# Test-set error metrics.
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse:.2f}")

r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2:.2f}")

# Three side-by-side panels: training fit, test fit, and the new prediction.
plt.figure(figsize=(12, 4))

# Panel 1: training points with the fitted line.
plt.subplot(1, 3, 1)
plt.scatter(x_train, y_train, label='Training Data')
plt.plot(x_train, model.predict(x_train), color='red', label='Regression Line')
plt.title('Training Data')
plt.xlabel('Area')
plt.ylabel('Price')
plt.legend()

# Panel 2: held-out points with the model's predictions.
plt.subplot(1, 3, 2)
plt.scatter(x_test, y_test, label='Test Data')
plt.plot(x_test, y_pred, color='red', label='Predictions')
plt.title('Test Data')
plt.xlabel('Area')
plt.ylabel('Price')
plt.legend()

# Panel 3: same as panel 2, plus the single new prediction highlighted in red.
plt.subplot(1, 3, 3)
plt.scatter(x_test, y_test, label='Test Data')
plt.plot(x_test, y_pred, color="green", label='Predictions')
plt.scatter(new_df, predicted_price, color="red", label='New Prediction')
plt.title('New Prediction')
plt.xlabel('Area')
plt.ylabel('Price')
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# Multiple Linear Regression
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the house-price data and preview the first rows.
df = pd.read_csv("/content/house_price_dataset.csv")
print(df.head())

# Three numeric predictors and the price target, both as plain NumPy arrays.
feature_columns = ['YearBuilt', 'Floors', 'Area']
x = df[feature_columns].values
y = df['Price'].values

# 80/20 split with a fixed seed so the partition is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

# One unseen house, in the same column order as the training features:
# YearBuilt, Floors, Area.
new = [[1970, 2, 1562]]
predicted_price = model.predict(new)
print(f"Predicted Price for New Data: {predicted_price[0]:.2f}")

# Test-set metrics.
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse:.2f}")

r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2:.4f}")

# Adjusted R² uses the size of the test split (n) and the number of
# predictors (p), i.e. the same sample the R² above was computed on.
n, p = x_test.shape
adjusted_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))
print(f"Adjusted R² Score: {adjusted_r2:.4f}")

# Actual-vs-predicted panels; the red diagonal marks a perfect prediction.
fig, (ax_train, ax_test, ax_new) = plt.subplots(1, 3, figsize=(12, 4))

train_span = [y_train.min(), y_train.max()]
test_span = [y_test.min(), y_test.max()]

# Panel 1: training split.
ax_train.scatter(y_train, model.predict(x_train), color='blue')
ax_train.plot(train_span, train_span, color='red', linewidth=2)
ax_train.set_title('Training Data: Actual vs Predicted')
ax_train.set_xlabel('Actual Price')
ax_train.set_ylabel('Predicted Price')

# Panel 2: test split.
ax_test.scatter(y_test, y_pred, color='green')
ax_test.plot(test_span, test_span, color='red', linewidth=2)
ax_test.set_title('Test Data: Actual vs Predicted')
ax_test.set_xlabel('Actual Price')
ax_test.set_ylabel('Predicted Price')

# Panel 3: test split plus the new prediction. The new house has no known
# actual price, so it is drawn at (prediction, prediction) on the diagonal.
ax_new.scatter(y_test, y_pred, color='orange', label='Test Data')
ax_new.scatter(predicted_price, predicted_price, color='red', label=f'New Pred: {predicted_price[0]:.2f}')
ax_new.plot(test_span, test_span, color='red', linewidth=2)
ax_new.set_title('Prediction for New Area')
ax_new.set_xlabel('Actual Price')
ax_new.set_ylabel('Predicted Price')
ax_new.legend()

plt.tight_layout()
plt.show()


In [None]:
# Polynomial Regression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score

# Load the position/salary table and show it.
df = pd.read_csv("/content/position_salaries.csv")
print(df)

# Single predictor (Level) as an (n, 1) array; Salary is the target.
x = df[['Level']].values
y = df['Salary'].values

# Expand Level into polynomial terms up to degree 4. With the default
# settings the output also contains a constant (bias) column.
poly = PolynomialFeatures(degree=4)
x_poly = poly.fit_transform(x)

# Ordinary least squares on the expanded features, fitted on all rows.
poly_model = LinearRegression()
poly_model.fit(x_poly, y)

# In-sample predictions: the metrics below describe the fit on the training
# data itself, not performance on held-out data.
y_pred_poly = poly_model.predict(x_poly)

mse_poly = mean_squared_error(y, y_pred_poly)
r2_poly = r2_score(y, y_pred_poly)
# p counts the bias column too, so (n - p) already equals
# n - (number of predictors) - 1, the usual adjusted-R² denominator.
n, p = x_poly.shape  # n = number of data points, p = number of features (including intercept)
adjusted_r2_poly = 1 - (1 - r2_poly) * (n - 1) / (n - p)

print(f"Mean Squared Error (MSE) for Polynomial Regression: {mse_poly:.2f}")
print(f"R² Score for Polynomial Regression: {r2_poly:.4f}")
print(f"Adjusted R² Score for Polynomial Regression: {adjusted_r2_poly:.4f}")


# Dense grid of levels (step 0.1) for a smooth curve.
# x is 2-D, so the built-in min()/max() return size-1 arrays; passing those to
# np.arange relies on array-to-scalar conversion that NumPy has deprecated.
# Use the scalar .min()/.max() instead. Half a step is added to the stop value
# so the grid includes the highest level and the curve reaches the last point.
grid_step = 0.1
x_grid = np.arange(x.min(), x.max() + grid_step / 2, grid_step).reshape(-1, 1)
plt.scatter(x, y, color='blue', label='Actual Data')
plt.plot(x_grid, poly_model.predict(poly.transform(x_grid)), color='green', label='Polynomial Prediction')
plt.title('Polynomial Regression (Degree 4)')
plt.xlabel('Position Level')
plt.ylabel('Salary')
plt.legend()
plt.show()


In [None]:
# Decision Tree Regression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import mean_squared_error, r2_score

# Load the position/salary table.
df = pd.read_csv('/content/position_salaries.csv')

# Single predictor (Level) as an (n, 1) array; Salary is the target.
X = df[['Level']].values
y = df['Salary'].values

# Fit a regression tree on every row (no train/test split in this cell).
# random_state pins any tie-breaking inside the tree builder.
model = DecisionTreeRegressor(random_state=42)
model.fit(X, y)

# Figure 1: the structure of the fitted tree.
plt.figure(figsize=(10, 6))
plot_tree(model, feature_names=['Level'], filled=True, rounded=True)
plt.title('Decision Tree Visualization for Position Salaries')
plt.show()

# Predict the salary for an intermediate level that is not in the table.
new_level = [[6.5]]
predicted_salary = model.predict(new_level)
print(f"Predicted Salary for Level 6.5: {predicted_salary[0]:.2f}")

# In-sample predictions: the metrics below are computed on the same rows the
# tree was fitted on.
y_pred = model.predict(X)

n, p = X.shape
mse = mean_squared_error(y, y_pred)
r2 = r2_score(y, y_pred)
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R² Score: {r2:.4f}")
print(f"Adjusted R² Score: {adjusted_r2:.4f}")

# Figure 2: data, in-sample predictions, and the new prediction in red.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X, y, color='blue', label='Actual Data')
ax.plot(X, y_pred, color='green', label='Decision Tree Prediction', linewidth=2)
ax.scatter(new_level, predicted_salary, color='red', label=f'Predicted Salary: {predicted_salary[0]:.2f}')
ax.set_title('Decision Tree Regression')
ax.set_xlabel('Position Level')
ax.set_ylabel('Salary')
ax.legend()
plt.show()


In [None]:
# Support Vector Regression (SVR)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load the position/salary table and show it.
df = pd.read_csv('/content/position_salaries.csv')
print(df)

# Single predictor (Level) as an (n, 1) array; Salary is the target.
X = df[['Level']].values
y = df['Salary'].values

# Standardise both the feature and the target with separate scalers, so that
# predictions can be mapped back to salary units afterwards.
scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_scaled = scaler_X.fit_transform(X)
y_column = y.reshape(-1, 1)  # StandardScaler expects a 2-D input
y_scaled = scaler_y.fit_transform(y_column).ravel()

# RBF-kernel SVR fitted in the standardised space, on all rows.
model = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
model.fit(X_scaled, y_scaled)

# Predict for level 6.5: scale the input, predict, then undo the target scaling.
new_level_scaled = scaler_X.transform([[6.5]])
predicted_salary_scaled = model.predict(new_level_scaled)
predicted_salary_2d = scaler_y.inverse_transform(predicted_salary_scaled.reshape(-1, 1))
predicted_salary = predicted_salary_2d[0][0]

print(f"Predicted Salary for Level 6.5: {predicted_salary:.2f}")

# In-sample predictions, converted back to salary units.
y_pred_scaled = model.predict(X_scaled)
y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

# Metrics are computed on the same rows the model was fitted on.
mse = mean_squared_error(y, y_pred)
print(f"Mean Squared Error (MSE): {mse:.2f}")

r2 = r2_score(y, y_pred)
print(f"R² Score: {r2:.4f}")

n, p = X.shape
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print(f"Adjusted R² Score: {adjusted_r2:.4f}")

# Actual salaries against the SVR's in-sample predictions.
fig, ax = plt.subplots()
ax.scatter(X, y, color='red', label='Actual Data')
ax.plot(X, y_pred, color='blue', label='SVR Predictions')
ax.set_title('Support Vector Regression (SVR) - Position Salaries')
ax.set_xlabel('Level')
ax.set_ylabel('Salary')
ax.legend()
plt.show()


In [None]:
#Set B
#Q1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Read the study-hours / scores table.
dataset = pd.read_csv("/home/fymsc8/MLDatasets/StudentHoursScores.csv")

# Hours is the single predictor, selected as an (n, 1) array; Scores is the target.
X = dataset[['Hours']].values
y = dataset['Scores'].values

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

# Test points (blue) with the fitted line evaluated at those points (red).
fig, ax = plt.subplots()
ax.scatter(X_test, y_test, color='blue')
ax.plot(X_test, y_pred, color='red')
ax.set_title('Student Scores vs Study Hours')
ax.set_xlabel('Study Hours')
ax.set_ylabel('Scores')
plt.show()

In [None]:
# Set B
# Q2 - Multiple Linear Regression on 50_Startups Dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Read the startups table.
dataset = pd.read_csv('/home/fymsc8/MLDatasets/50_Startups.csv')

# Three spend columns plus the categorical State column; Profit is the target.
X = dataset[['Administration', 'State', 'Marketing Spend', 'R&D Spend']]
y = dataset['Profit']

# One-hot encode State; drop_first removes one level to avoid a redundant
# (perfectly collinear) dummy column.
X = pd.get_dummies(X, drop_first=True)

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse:.2f}")

r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2:.4f}")

# Adjusted R² must use the size of the sample R² was computed on. R² above
# comes from the test split, so n is taken from X_test rather than from the
# full dataset (which would understate the penalty for the number of
# predictors). If the test split is too small for the formula, report NaN
# instead of dividing by zero or a negative number.
n, p = X_test.shape
residual_dof = n - p - 1
if residual_dof > 0:
    adjusted_r2 = 1 - (1 - r2) * (n - 1) / residual_dof
else:
    adjusted_r2 = float('nan')
print(f"Adjusted R² Score: {adjusted_r2:.4f}")

# Actual vs predicted profit; the dashed diagonal marks a perfect prediction.
plt.figure(figsize=(10, 6))

plt.scatter(y_test, y_pred, color='blue', label='Predicted vs Actual')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--', label='Perfect Fit')

plt.title('Actual vs Predicted Profit')
plt.xlabel('Actual Profit')
plt.ylabel('Predicted Profit')
plt.legend()
plt.show()


In [None]:
# Q3 - 1
# Simple Linear Regression for Position Salaries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the position/salary table.
df = pd.read_csv('/content/position_salaries.csv')

# Single predictor (Level) as an (n, 1) array; Salary is the target.
x = df[['Level']].values
y = df['Salary'].values

# Straight-line fit on every row (no train/test split in this cell).
model = LinearRegression()
model.fit(x, y)

# In-sample predictions; the metrics below describe the fit on the same rows.
y_pred = model.predict(x)

mse = mean_squared_error(y, y_pred)
print(f"Mean Squared Error (MSE): {mse:.2f}")

r2 = r2_score(y, y_pred)
print(f"R² Score: {r2:.4f}")

# n observations, p predictors (one column here).
n, p = x.shape
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print(f"Adjusted R² Score: {adjusted_r2:.4f}")

# Data points with the fitted line drawn through the predictions.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x, y, color='blue', label='Actual Data')
ax.plot(x, y_pred, color='red', label='Regression Line')
ax.set_title('Simple Linear Regression Fit')
ax.set_xlabel('Position Level')
ax.set_ylabel('Salary')
ax.legend()
plt.show()


In [None]:
# Q3 - 2
# Polynomial Regression for Position Salaries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the position/salary table.
df = pd.read_csv('/content/position_salaries.csv')

# Single predictor (Level) as an (n, 1) array; Salary is the target.
x = df[['Level']].values
y = df['Salary'].values

# Expand Level into polynomial terms up to degree 4. With the default
# settings the output also contains a constant (bias) column.
poly_reg = PolynomialFeatures(degree=4)
x_poly = poly_reg.fit_transform(x)

# Ordinary least squares on the expanded features, fitted on all rows.
model_poly = LinearRegression()
model_poly.fit(x_poly, y)

# In-sample predictions; the metrics below describe the fit on the same rows.
y_pred_poly = model_poly.predict(x_poly)

mse = mean_squared_error(y, y_pred_poly)
print(f"Mean Squared Error (MSE): {mse:.2f}")

r2 = r2_score(y, y_pred_poly)
print(f"R² Score: {r2:.4f}")

# x_poly includes the bias column, which is not a predictor. Counting it in p
# and then also subtracting 1 for the intercept would penalise the model
# twice, so p is the column count minus the bias column. This matches the
# other polynomial cell, which uses (n - p) with p including the intercept.
n = x_poly.shape[0]
p = x_poly.shape[1] - 1
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print(f"Adjusted R² Score: {adjusted_r2:.4f}")

# Data points with the polynomial fit evaluated at the observed levels.
plt.figure(figsize=(10, 6))
plt.scatter(x, y, color='blue', label='Actual Data')
plt.plot(x, y_pred_poly, color='red', label='Polynomial Regression Line')
plt.title('Polynomial Regression Fit (Degree 4)')
plt.xlabel('Position Level')
plt.ylabel('Salary')
plt.legend()
plt.show()
