<a href="https://colab.research.google.com/github/nitekar/linear_regression_model/blob/main/multivariate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Run this cell first: it imports the Kaggle data sources this notebook uses.
# Once the data has been imported the cell may be deleted.
# Note: this notebook environment differs from Kaggle's Python environment,
# so libraries used by the notebook may be missing here.
import kagglehub

DATASET_HANDLE = 'adilshamim8/education-and-career-success'
adilshamim8_education_and_career_success_path = kagglehub.dataset_download(DATASET_HANDLE)

print('Data source import complete.')


# Linear Regression

The goal of this project is to build a linear regression model to predict work-life balance based on various factors. I will also compare the performance of linear regression with decision trees and random forest models to identify the best-performing model.

1. I’ll use Python with libraries like pandas, numpy, seaborn, and matplotlib to load the data and generate visualizations.
2. I'll implement three models: Linear Regression, Random Forest, and Decision Tree.
3. I’ll also include a function that predicts from a single data point, and I’ll save the model with the lowest loss.


# Import Libraries and Load the Dataset

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset.
# Prefer the directory returned by the kagglehub download cell so the notebook
# also runs where the data is not mounted under /kaggle/input; fall back to
# that mount when the download cell was deleted or not run.
try:
    DATA_DIR = adilshamim8_education_and_career_success_path
except NameError:
    DATA_DIR = '/kaggle/input/education-and-career-success'
data = pd.read_csv(f"{DATA_DIR}/education_career_success.csv")

# Display the first few rows of the dataset
print("Dataset Preview:")
print(data.head())


Dataset Preview:
  Student_ID  Age  Gender  High_School_GPA  SAT_Score  University_Ranking  \
0     S00001   24    Male             3.58       1052                 291   
1     S00002   21   Other             2.52       1211                 112   
2     S00003   28  Female             3.42       1193                 715   
3     S00004   25    Male             2.43       1497                 170   
4     S00005   22    Male             2.08       1012                 599   

   University_GPA    Field_of_Study  Internships_Completed  \
0            3.96              Arts                      3   
1            3.63               Law                      4   
2            2.63          Medicine                      4   
3            2.81  Computer Science                      3   
4            2.48       Engineering                      4   

   Projects_Completed  Certifications  Soft_Skills_Score  Networking_Score  \
0                   7               2                  9             

# Data Preprocessing

In [1]:
# Drop the row identifier before encoding. Student_ID looks like a per-row key
# (S00001, S00002, ... in the preview), so one-hot encoding it would add an
# indicator column for every distinct ID and carry no predictive signal.
# errors='ignore' keeps this cell safe to re-run once the column is gone.
encodable = data.drop(columns=['Student_ID'], errors='ignore')

# Convert the remaining categorical variables to numeric indicator columns.
# dtype=int yields 0/1 values so every column is plainly numeric for corr()
# and for the regressors trained later.
data = pd.get_dummies(encodable, drop_first=True, dtype=int)

# Correlation matrix to see which features are highly correlated with Work-Life Balance
plt.figure(figsize=(12, 8))
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()


NameError: name 'pd' is not defined

# Splitting the Data

In [None]:
# Selecting features and target variable.
# The columns shown in the dataset preview use underscores, so look for the
# underscore spelling first and keep the original spelling as a fallback.
TARGET_CANDIDATES = ('Work_Life_Balance', 'Work-Life Balance')
target_col = next((name for name in TARGET_CANDIDATES if name in data.columns), None)
if target_col is None:
    raise KeyError(
        f"Target column not found; tried {TARGET_CANDIDATES}. "
        f"Available columns: {list(data.columns)}"
    )

X = data.drop(columns=[target_col])
y = data[target_col]

# Train-test split (fixed random_state so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


NameError: name 'data' is not defined

# Training the Models

## 1. Linear Regression

In [None]:
# Fit a linear regression on the training split (fit() returns the estimator)
lin_reg = LinearRegression().fit(X_train, y_train)

# Predict on the held-out test split and score the predictions
y_pred_lr = lin_reg.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Report both metrics, one per line
print(
    f"Linear Regression MSE: {mse_lr}",
    f"Linear Regression R^2: {r2_lr}",
    sep="\n",
)


## 2. Decision Tree

In [None]:
# Fit a decision tree on the training split; random_state pins the result
tree_reg = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Predict on the held-out test split and score the predictions
y_pred_tree = tree_reg.predict(X_test)
mse_tree = mean_squared_error(y_test, y_pred_tree)
r2_tree = r2_score(y_test, y_pred_tree)

# Report both metrics, one per line
print(
    f"Decision Tree MSE: {mse_tree}",
    f"Decision Tree R^2: {r2_tree}",
    sep="\n",
)


## 3. Random Forest

In [None]:
# Fit a 100-tree random forest on the training split; random_state pins the result
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on the held-out test split and score the predictions
y_pred_forest = forest_reg.predict(X_test)
mse_forest = mean_squared_error(y_test, y_pred_forest)
r2_forest = r2_score(y_test, y_pred_forest)

# Report both metrics, one per line
print(
    f"Random Forest MSE: {mse_forest}",
    f"Random Forest R^2: {r2_forest}",
    sep="\n",
)


# Model comparison and saving the best model

In [None]:
# Compare models based on MSE and R^2
models = pd.DataFrame({
    'Model': ['Linear Regression', 'Decision Tree', 'Random Forest'],
    'MSE': [mse_lr, mse_tree, mse_forest],
    'R^2': [r2_lr, r2_tree, r2_forest]
})

print("\nModel Comparison:")
print(models)

# Save the model with the lowest test MSE instead of assuming which one wins.
import joblib

# Keys must match the 'Model' labels in the comparison table above.
fitted_models = {
    'Linear Regression': lin_reg,
    'Decision Tree': tree_reg,
    'Random Forest': forest_reg,
}
best_model_name = models.loc[models['MSE'].idxmin(), 'Model']
best_model = fitted_models[best_model_name]
print(f"\nBest model by MSE: {best_model_name}")
joblib.dump(best_model, 'best_model.pkl')


# Visualize Actual vs. Predicted Values

In [None]:
# Plotting the actual vs predicted values for the best model.
# Predict with best_model itself so the figure always matches the saved model.
y_pred_best = best_model.predict(X_test)

fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(y_test, y_pred_best, alpha=0.7)

# Dashed y = x reference: a point on this line is a perfect prediction.
low = min(y_test.min(), y_pred_best.min())
high = max(y_test.max(), y_pred_best.max())
ax.plot([low, high], [low, high], linestyle='--', color='gray')

ax.set_xlabel("Actual Work-Life Balance")
ax.set_ylabel("Predicted Work-Life Balance")
ax.set_title("Actual vs Predicted Work-Life Balance")
plt.show()

# Prediction Script


In [None]:
import joblib
import numpy as np

# Read the persisted estimator back from disk
MODEL_PATH = 'best_model.pkl'
model = joblib.load(MODEL_PATH)

def predict_work_life_balance(input_data, estimator=None):
    """Predict work-life balance for a single data point.

    Parameters
    ----------
    input_data : sequence of numbers
        One value per feature, in the same order as the columns the model
        was trained on.
    estimator : object with a ``predict`` method, optional
        Model to use. Defaults to the module-level ``model`` loaded from disk.

    Returns
    -------
    The first (and only) element of the model's prediction.

    Raises
    ------
    ValueError
        If the number of values does not match the number of features the
        model reports in ``n_features_in_``.
    """
    if estimator is None:
        estimator = model

    # One row, as many columns as values supplied.
    row = np.asarray(input_data, dtype=float).reshape(1, -1)

    # Fail early with a readable message rather than deep inside predict().
    expected = getattr(estimator, 'n_features_in_', None)
    if expected is not None and row.shape[1] != expected:
        raise ValueError(
            f"Expected {expected} feature values, got {row.shape[1]}."
        )

    # A model fitted on a DataFrame records its column names; hand them back
    # so the input is labelled the same way as the training data.
    names = getattr(estimator, 'feature_names_in_', None)
    if names is not None:
        import pandas as pd
        row = pd.DataFrame(row, columns=list(names))

    prediction = estimator.predict(row)
    return prediction[0]

# Example usage: reuse a held-out row so the input has exactly the columns,
# in the same order, that the model was trained on. A hand-written list must
# supply one value for every column of X.
example_input = X_test.iloc[0].tolist()
print(f"Predicted Work-Life Balance: {predict_work_life_balance(example_input)}")
