In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV

# ***EXPLORE THE DATA SET***

In [None]:
# Read the advertising CSV into a DataFrame and preview its first rows.
# NOTE(review): absolute path, presumably a Colab Drive mount — adjust when running elsewhere.
DATA_PATH = '/content/drive/MyDrive/Advertising.csv'
data = pd.read_csv(DATA_PATH)
data.head()

In [None]:
# Preview the last rows of the dataset (complements the head() preview above)
data.tail()

In [None]:
# Descriptive statistics of the dataset, displayed as a styled table
# whose cell backgrounds follow the 'cividis' colormap.
palette = sns.color_palette('cividis', as_cmap=True)
num_sum = data.describe()
num_sum.style.background_gradient(cmap=palette)

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
data.shape

# ***DATA CLEANING***

In [None]:
# Number of missing entries in each column
missing_per_column = data.isnull().sum()
print(missing_per_column)

# Annotated heatmap of the pairwise correlations between columns
plt.figure(figsize=(10, 6))
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, annot=True, fmt=".2f")
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Structural summary of the DataFrame as reported by pandas' DataFrame.info()
data.info()

# ***IDENTIFY DUPLICATES***

In [None]:
# Report how many rows are exact duplicates of an earlier row
duplicates = data.duplicated().sum()
print(f"Number of duplicate rows = {duplicates}")

# Remove the duplicates in place (first occurrence kept), then re-count to confirm
print("After dropping duplicates")
data.drop_duplicates(keep='first', inplace=True)
remaining_duplicates = data.duplicated().sum()
print(f"Number of duplicate rows = {remaining_duplicates}")

In [None]:
# Initial feature matrix (the three channel budgets) and target (Sales).
# Both X and y are re-assigned in the split section further down, where the
# engineered Total_Ad_Budget column is added to the feature list.
base_feature_columns = ['TV', 'Radio', 'Newspaper']
X = data[base_feature_columns]
y = data['Sales']

# ***DATA PRE-PROCESSING***

In [None]:
# Skewness of the target column (plt and sns come from the notebook's first
# cell, so they are not re-imported here).
skewness = data['Sales'].skew()
print(f'Skewness of Sales: {skewness}')

# Histogram of Sales with a kernel-density overlay
plt.figure(figsize=(10, 6))
sns.histplot(data['Sales'], bins=30, kde=True)
plt.title('Distribution of Sales')
plt.xlabel('Sales')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Scatter of each advertising channel against Sales.
# sns.pairplot is a figure-level function: it creates its own figure, sized by
# `height` and `aspect`. A preceding plt.figure() call would only emit an
# extra empty figure, so none is made here.
sns.pairplot(data, x_vars=['TV', 'Radio', 'Newspaper'], y_vars='Sales', height=5, aspect=0.7)
# The pairplot figure is the current figure at this point, so the title lands on it.
plt.suptitle('Advertising Budget vs Sales', y=1.02)
plt.show()

# ***DATA TYPES***

In [None]:
# Print the dtype of every column
print(data.dtypes)

# ***FEATURE ENGINEERING***

In [None]:
# Engineered feature: total advertising budget across the three channels.
# NOTE: this cell is not idempotent with respect to the scaling cell below —
# that cell standardizes TV/Radio/Newspaper in place, so re-running this one
# afterwards would sum standardized values instead of raw budgets.
data['Total_Ad_Budget'] = data['TV'].add(data['Radio']).add(data['Newspaper'])

In [None]:
# Log-transformed target; the +1 offset keeps a Sales value of 0 finite.
# numpy is already imported as `np` in the notebook's first cell, so it is not
# re-imported here.
data['Log_Sales'] = np.log(data['Sales'] + 1)  # Adding 1 to avoid log(0)

# **Encoding Categorical Variables**

In [None]:
# Placeholder: nothing is executed in this cell.
# Template for one-hot encoding, should a categorical column named 'Category'
# be present (drop_first=True drops one dummy level per encoded column):
# data = pd.get_dummies(data, columns=['Category'], drop_first=True)

# ***FEATURE SCALING***

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the four feature columns in place; `scaler` keeps the fitted
# statistics for later use.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split below, so test-set statistics leak into the training features.
# Consider fitting on the training split only.
scaled_columns = ['TV', 'Radio', 'Newspaper', 'Total_Ad_Budget']
scaler = StandardScaler()
data[scaled_columns] = scaler.fit_transform(data[scaled_columns])

# ***SPLIT THE DATA SET***

In [None]:
# Model inputs: the three channel budgets plus the engineered total; target is Sales.
# train_test_split is already imported in the notebook's first cell, so it is
# not re-imported here.
model_feature_columns = ['TV', 'Radio', 'Newspaper', 'Total_Ad_Budget']
X = data[model_feature_columns]
y = data['Sales']

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ***Model Training & Model Evaluation***

In [None]:
# Fit ordinary linear regression on the training split (fit() returns the estimator)
lin_reg = LinearRegression().fit(X_train, y_train)

# Predictions on the held-out test split
y_pred_lin = lin_reg.predict(X_test)

In [None]:
# Fit a decision-tree regressor (seeded for reproducibility) on the training split
tree_reg = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Predictions on the held-out test split
y_pred_tree = tree_reg.predict(X_test)

In [None]:
# Fit a random-forest regressor (seeded for reproducibility) on the training split
forest_reg = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predictions on the held-out test split
y_pred_forest = forest_reg.predict(X_test)

In [None]:
def evaluate_model(y_test, y_pred):
    """Score a set of predictions against the true target values.

    Args:
        y_test: True target values.
        y_pred: Predicted values, passed alongside ``y_test`` to the metrics.

    Returns:
        A ``(mse, r2)`` tuple: the results of ``mean_squared_error`` and
        ``r2_score`` called with ``(y_test, y_pred)``.
    """
    return mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Score the three baseline models on the test split, in the order
# linear regression, decision tree, random forest.
(mse_lin, r2_lin), (mse_tree, r2_tree), (mse_forest, r2_forest) = (
    evaluate_model(y_test, predictions)
    for predictions in (y_pred_lin, y_pred_tree, y_pred_forest)
)

# Report test-set MSE and R2 for each model
print(f"Linear Regression: MSE = {mse_lin}, R2 = {r2_lin}")
print(f"Decision Tree: MSE = {mse_tree}, R2 = {r2_tree}")
print(f"Random Forest: MSE = {mse_forest}, R2 = {r2_forest}")

# ***CROSS-VALIDATION***

In [None]:
from sklearn.model_selection import cross_val_score

# The loop below iterates over `models`, which was never defined anywhere in
# the notebook (NameError on a fresh kernel). Define it here with fresh,
# unfitted estimators configured like the ones trained above.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
}

# 5-fold cross-validation on the full feature matrix. The scorer returns
# negated MSE, so the sign is flipped for reporting.
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    print(f"{name} - Cross-Validated MSE: {-scores.mean():.2f}")

# ***HYPERPARAMETER TUNING***

In [None]:
def _tune_and_score(estimator, param_grid):
    """Grid-search `estimator` with 5-fold CV on the training split and score
    the resulting best estimator on the test split.

    Returns (fitted GridSearchCV, best estimator, test predictions, (mse, r2)).
    """
    search = GridSearchCV(estimator, param_grid, cv=5, scoring='neg_mean_squared_error')
    search.fit(X_train, y_train)
    winner = search.best_estimator_
    test_predictions = winner.predict(X_test)
    return search, winner, test_predictions, evaluate_model(y_test, test_predictions)


# --- Decision Tree -----------------------------------------------------------
param_grid_tree = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
grid_search_tree, best_tree, y_pred_best_tree, (mse_best_tree, r2_best_tree) = _tune_and_score(
    DecisionTreeRegressor(random_state=42), param_grid_tree
)

print(f"Best Decision Tree: MSE = {mse_best_tree}, R2 = {r2_best_tree}")

# --- Random Forest -----------------------------------------------------------
param_grid_forest = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
grid_search_forest, best_forest, y_pred_best_forest, (mse_best_forest, r2_best_forest) = _tune_and_score(
    RandomForestRegressor(random_state=42), param_grid_forest
)

print(f"Best Random Forest: MSE = {mse_best_forest}, R2 = {r2_best_forest}")

In [None]:
# Recap of test-set metrics for every model trained above
print(f"Linear Regression: MSE = {mse_lin}, R2 = {r2_lin}")
print(f"Decision Tree: MSE = {mse_tree}, R2 = {r2_tree}")
print(f"Best Decision Tree: MSE = {mse_best_tree}, R2 = {r2_best_tree}")
print(f"Random Forest: MSE = {mse_forest}, R2 = {r2_forest}")
print(f"Best Random Forest: MSE = {mse_best_forest}, R2 = {r2_best_forest}")

# Pick the lowest test MSE among linear regression and the two tuned models.
# The untuned tree and forest are printed above but are not candidates here.
# min() returns the first minimum, so ties keep the earlier entry.
candidates = [
    ("Linear Regression", mse_lin),
    ("Best Decision Tree", mse_best_tree),
    ("Best Random Forest", mse_best_forest),
]
best_model_name, best_mse = min(candidates, key=lambda entry: entry[1])

print(f"The best model is: {best_model_name} with MSE = {best_mse}")

In [None]:
# Actual-vs-predicted scatter for each baseline model, side by side
plt.figure(figsize=(12, 6))

panels = [
    ('Linear Regression Predictions', y_pred_lin),
    ('Decision Tree Predictions', y_pred_tree),
    ('Random Forest Predictions', y_pred_forest),
]

# One subplot per model, positions 1..3 in a single row
for position, (panel_title, predictions) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.scatter(y_test, predictions)
    plt.title(panel_title)
    plt.xlabel('Actual Sales')
    plt.ylabel('Predicted Sales')

plt.tight_layout()
plt.show()

# ***VISUALIZATIONS***

In [None]:
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# 3D scatter of Sales against the TV and Radio columns.
# NOTE: data['TV'] and data['Radio'] were standardized in place by the scaling
# cell, so both horizontal axes are in standardized units.
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(1, 1, 1, projection='3d')

ax.scatter(data['TV'], data['Radio'], data['Sales'], c='blue', marker='o', alpha=0.6)

# Axis labels and title in one call
ax.set(
    xlabel='TV Advertising Spend',
    ylabel='Radio Advertising Spend',
    zlabel='Sales',
    title='3D Scatter Plot of Actual Sales Data',
)

plt.show()

In [None]:
# Test-set comparison in 3D: random-forest predictions (red triangles)
# against the actual Sales values (blue circles) over the TV/Radio plane.
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(1, 1, 1, projection='3d')

tv_test, radio_test = X_test['TV'], X_test['Radio']

# Predictions first, then actual values, so the legend order is unchanged
ax.scatter(tv_test, radio_test, y_pred_forest, c='red', marker='^', alpha=0.6, label='Predicted Sales')
ax.scatter(tv_test, radio_test, y_test, c='blue', marker='o', alpha=0.6, label='Actual Sales')

ax.set(
    xlabel='TV Advertising Spend',
    ylabel='Radio Advertising Spend',
    zlabel='Sales',
    title='3D Scatter Plot of Predicted vs Actual Sales',
)
ax.legend()

plt.show()

In [None]:
# Create a grid for TV and Radio. Both columns were standardized in place by
# the scaling cell, so the grid is in standardized units.
TV_range = np.linspace(data['TV'].min(), data['TV'].max(), 100)
Radio_range = np.linspace(data['Radio'].min(), data['Radio'].max(), 100)
TV_grid, Radio_grid = np.meshgrid(TV_range, Radio_range)

# Column order used when `scaler` and the models were fit
spend_cols = ['TV', 'Radio', 'Newspaper']
model_cols = spend_cols + ['Total_Ad_Budget']

# Grid inputs; Newspaper is held at its mean for every grid point
X_grid = pd.DataFrame({
    'TV': TV_grid.ravel(),
    'Radio': Radio_grid.ravel(),
    'Newspaper': np.mean(data['Newspaper'])  # Use mean Newspaper spend for predictions
})

# forest_reg was trained on four columns (including Total_Ad_Budget), so
# predicting on the three-column grid fails with a feature mismatch.
# Rebuild Total_Ad_Budget for each grid point: undo the standardization of the
# three spend columns, sum the raw budgets, then standardize the total with
# the statistics the scaler learned for that column.
col_mean = dict(zip(model_cols, scaler.mean_))
col_scale = dict(zip(model_cols, scaler.scale_))
raw_total = sum(X_grid[col] * col_scale[col] + col_mean[col] for col in spend_cols)
X_grid['Total_Ad_Budget'] = (raw_total - col_mean['Total_Ad_Budget']) / col_scale['Total_Ad_Budget']
X_grid = X_grid[model_cols]  # same column order as the training frame

# Predict using the Random Forest model
Sales_pred_grid = forest_reg.predict(X_grid).reshape(TV_grid.shape)

# 3D Surface Plot
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')

# Surface plot
ax.plot_surface(TV_grid, Radio_grid, Sales_pred_grid, cmap='viridis', alpha=0.7)

# Labels and title
ax.set_xlabel('TV Advertising Spend')
ax.set_ylabel('Radio Advertising Spend')
ax.set_zlabel('Predicted Sales')
ax.set_title('3D Surface Plot of Predicted Sales')

plt.show()

In [None]:
# 3D scatter of the three advertising channels, with point colour encoding Sales.
# plt and Axes3D are imported in earlier cells, so they are not re-imported here.
fig = plt.figure()
ax = fig.add_subplot(projection='3d')

points = ax.scatter(data['TV'], data['Radio'], data['Newspaper'], c=data['Sales'], cmap='viridis')

# A colorbar is needed to read the colour encoding; the title lets the figure
# stand alone when the notebook is skimmed.
fig.colorbar(points, ax=ax, label='Sales')

ax.set_xlabel('TV')
ax.set_ylabel('Radio')
ax.set_zlabel('Newspaper')
ax.set_title('Advertising Channels Coloured by Sales')
plt.show()

In [None]:
# Histogram of the 'TV' column in ten bins.
# NOTE: the column was standardized in place by the scaling cell, so the
# x-axis is in standardized units.
plt.figure(figsize=(10, 6))
ax = data['TV'].plot.hist(bins=10, color='gold', edgecolor='black')

# Titles, axis labels and horizontal grid lines, set on the returned axes
ax.set_title('Histogram of TV Advertising Budget', fontsize=16)
ax.set_xlabel('TV Advertising Budget', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
ax.grid(axis='y', alpha=0.75)

plt.show()

# ***Conclusion***
This structured approach allows you to evaluate the error rates of different models, perform hyperparameter tuning, and ultimately select the best model based on performance metrics. The model with the lowest Mean Squared Error (MSE) or the highest R² score is typically considered the best for the given dataset.