In [None]:
#Mohsin Essani
#Assignment 3 Question 2
#Salary Data Linear Regression

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:

# Load the data into a Pandas dataframe.
# The path is relative, so Salary_Data.csv must be in the working directory.
# The YearsExperience and Salary columns of this frame are used below.
data = pd.read_csv("Salary_Data.csv")


In [None]:
# Show only the first rows as a sanity check instead of dumping the whole frame
data.head()

In [None]:
# Perform EDA to understand the data: plot salary against years of experience
# to check whether a straight-line model is a reasonable choice.
fig, ax = plt.subplots()
ax.scatter(data['YearsExperience'], data['Salary'])
ax.set_xlabel("Years of Experience")
ax.set_ylabel("Salary")
ax.set_title("Salary vs. Years of Experience")  # lets the figure stand alone when skimmed
plt.show()

# The scatter plot shows that salary increases as years of experience increase, and the points lie roughly along a straight line, which suggests that a linear model is appropriate.

In [None]:

# Hold out 20% of the rows as a test set. The feature is selected with double
# brackets so it stays a 2-D frame; the fixed random_state makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data[['YearsExperience']],
    data['Salary'],
    test_size=0.2,
    random_state=42,
)


In [None]:
# Fit the linear regression model to the training data.
# fit() returns the fitted estimator, so `reg` is ready for predict().
reg = LinearRegression().fit(X_train, y_train)

In [None]:
# Make predictions on the held-out test set (rows not used for fitting)
y_pred = reg.predict(X_test)

In [None]:
# Evaluate the model's performance on the test set.
# RMSE is the square root of the MSE, so it is in the same units as Salary.
test_r2 = r2_score(y_test, y_pred)
test_rmse = mean_squared_error(y_test, y_pred)**0.5
print("R-squared score:", test_r2)
print("Root mean squared error:", test_rmse)

In [None]:
# In simple linear regression, the R-squared score measures how well the model fits the data. On the data the model was fitted to, it ranges from 0 to 1, where 1 indicates a perfect fit and 0 indicates that the model explains none of the variation in the data. On held-out test data, as used here, it can even be negative if the model does worse than always predicting the mean. A higher R-squared score means that the model explains more of the variation in the data and is a better fit.

# The R-squared score of 0.9024461774180497 suggests that the model explains 90.24% of the variation in the data. This is a relatively high R-squared score, which indicates that the model is a good fit for the data.

# Root mean squared error (RMSE) is a measure of the difference between the predicted values and the actual values. Lower values of RMSE indicate that the model is a better fit for the data.

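# The root mean squared error of 7059.04362190151 means that the model's predictions on the test set are typically off by around $7059. RMSE is not a plain average of the errors: because the errors are squared before averaging, large errors count more heavily.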

# In summary, the R-squared score of 0.9024461774180497 and the root mean squared error of 7059.04362190151 suggest that the model is a good fit for the data and that its predictions are close to the actual values.

# Using Transformations and predicting the model accuracy

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
# Reload the salary dataset for the transformation experiments below
data = pd.read_csv("Salary_Data.csv")

In [None]:

# Feature and target as (n_samples, 1) NumPy column vectors
X = data['YearsExperience'].to_numpy().reshape(-1, 1)
y = data['Salary'].to_numpy().reshape(-1, 1)

In [None]:
# Transform the data
# Three monotonic transformations of the experience feature. The +1 inside the
# log maps an experience of 0 to log(1) = 0 instead of -inf.
X_log = np.log(X + 1)
X_sqrt = np.sqrt(X)
X_cuberoot = np.cbrt(X)

In [None]:

# Fit one linear regression per feature representation, each on the full
# dataset. The key records which representation the model was trained on.
models = {
    label: LinearRegression().fit(features, y)
    for label, features in (
        ('log', X_log),
        ('sqrt', X_sqrt),
        ('cuberoot', X_cuberoot),
        ('original', X),
    )
}


In [None]:
# Predict and calculate the mean squared error for each transformation.
# Each model must be scored on the same feature representation it was trained
# on. Feeding the raw X to a model fitted on log/sqrt/cbrt features produces
# meaningless predictions, so look up the matching inputs by model name.
features_by_model = {
    'log': X_log,
    'sqrt': X_sqrt,
    'cuberoot': X_cuberoot,
    'original': X,
}

for name, model in models.items():
    y_pred = model.predict(features_by_model[name])
    mse = mean_squared_error(y, y_pred)
    print(f'MSE for {name} transformation: {mse}')


# Using the PolynomialFeatures technique to try to reduce the model's MSE

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error

# Load the salary dataset; this section keeps it under the name `df`
df = pd.read_csv("Salary_Data.csv")



In [None]:
# Feature and target as (n_samples, 1) NumPy column vectors
X = df['YearsExperience'].to_numpy().reshape(-1, 1)
y = df['Salary'].to_numpy().reshape(-1, 1)

In [None]:
# Transform the data: natural log of years of experience.
# NOTE(review): unlike the earlier section, which used log(X + 1), this is a
# plain log, so a row with 0 years of experience would become -inf. Confirm
# that the dataset has no zero values.
X_log = np.log(X)


In [None]:
# Create the polynomial features: learn the degree-2 expansion from the
# log-transformed feature, then apply it to that same feature.
poly = PolynomialFeatures(degree=2)
poly.fit(X_log)
X_poly = poly.transform(X_log)

In [None]:

# Fit the model: linear regression on the degree-2 polynomial features of
# log(experience), using the full dataset.
reg = LinearRegression()
reg.fit(X_poly, y)

In [None]:

# Make predictions on the same rows the model was fitted on (in-sample)
y_pred = reg.predict(X_poly)

In [None]:

# Calculate the MSE between the actual salaries and the polynomial model's
# predictions
mse = mean_squared_error(y, y_pred)
print("MSE for log transformation with polynomial regression:", mse)

# Using Ridge and Lasso Technique to observe the model accuracy

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


In [None]:
# Reload the salary dataset for the Ridge experiment
data = pd.read_csv("Salary_Data.csv")

In [None]:

# Double brackets keep X as a one-column DataFrame; y is a Series
X = data[['YearsExperience']]
y = data['Salary']

In [None]:

# Transform the data: the NumPy ufuncs applied to the one-column DataFrame X
# return DataFrames with the same column name and index.
X_log = np.log(X)
X_sqrt = np.sqrt(X)
X_cuberoot = np.cbrt(X)
X_square = np.square(X)

# Stack the raw feature and its transformations side by side. Each transformed
# column gets a suffix so the combined frame has unique column names, instead
# of five columns that are all called 'YearsExperience'.
X_transformed = pd.concat(
    [
        X,
        X_log.add_suffix('_log'),
        X_sqrt.add_suffix('_sqrt'),
        X_cuberoot.add_suffix('_cuberoot'),
        X_square.add_suffix('_square'),
    ],
    axis=1,
)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a Ridge regression with its default settings on the training split
ridge = Ridge()
ridge.fit(X_train, y_train)

# Alternative model (not run): a Lasso could be fitted the same way
# lasso = Lasso()
# lasso.fit(X_train, y_train)

In [None]:
# Make predictions on the test set with the fitted Ridge model
y_pred = ridge.predict(X_test)

In [None]:
# Mean squared error of the Ridge predictions on the held-out test set
mse = mean_squared_error(y_test, y_pred)
print("MSE: ", mse)

# Using Transformations and observing the accuracy

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# load data
data = pd.read_csv("Salary_Data.csv")

# X and y are kept as 1-D Series here; transform_and_evaluate reads them as
# globals and reshapes X into a column itself.
X = data.YearsExperience
y = data.Salary


In [None]:
def transform_and_evaluate(transformer):
    """Fit a linear regression on a transformed feature and return its MSE.

    Reads the module-level Series ``X`` (feature) and ``y`` (target).

    Parameters
    ----------
    transformer : callable
        Applied to the Series ``X``; the result must have a ``to_numpy``
        method (for example, a NumPy ufunc applied to a Series).

    Returns
    -------
    The mean squared error of the model's predictions on the same data it was
    fitted on (in-sample error).
    """
    # scikit-learn expects a 2-D feature matrix, hence the single-column reshape
    features = transformer(X).to_numpy().reshape(-1, 1)
    fitted = LinearRegression().fit(features, y)
    return mean_squared_error(y, fitted.predict(features))



In [None]:

# Log transformation
mse_log = transform_and_evaluate(np.log)
print("MSE for log transformation:", mse_log)

In [None]:

# Square root transformation of the experience feature
mse_sqrt = transform_and_evaluate(np.sqrt)
print("MSE for square root transformation:", mse_sqrt)

In [None]:

# Cube root transformation of the experience feature
mse_cbrt = transform_and_evaluate(np.cbrt)
print("MSE for cube root transformation:", mse_cbrt)


In [None]:
# No transformation: the identity lambda gives the baseline MSE on the raw feature
mse_original = transform_and_evaluate(lambda x: x)
print("MSE for no transformation:", mse_original)

# Using Ridge with Transformations and observing the model accuracy

In [None]:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import pandas as pd

# Load the data
data = pd.read_csv("Salary_Data.csv")

# 1-D Series; transform_and_evaluate reads these as globals and reshapes X
X = data['YearsExperience']
y = data['Salary']



In [None]:
# Define a function to transform the data and evaluate the model
def transform_and_evaluate(transformer, alpha):
    """Fit a Ridge regression on a transformed feature and return its MSE.

    Reads the module-level Series ``X`` (feature) and ``y`` (target).
    Note: this definition reuses the name of the one-argument helper defined
    earlier in the notebook and replaces it once this cell has run.

    Parameters
    ----------
    transformer : callable
        Applied to the Series ``X``; the result must have a ``to_numpy``
        method.
    alpha : float
        Passed to ``Ridge`` as its ``alpha`` argument.

    Returns
    -------
    The mean squared error of the model's predictions on the same data it was
    fitted on (in-sample error).
    """
    # scikit-learn expects a 2-D feature matrix, hence the single-column reshape
    features = transformer(X).to_numpy().reshape(-1, 1)
    ridge_model = Ridge(alpha=alpha)
    ridge_model.fit(features, y)
    return mean_squared_error(y, ridge_model.predict(features))


In [None]:

# Define the different transformations to be tested. They are written as
# (name, function) pairs so each label sits next to its function, then unzipped
# into the two parallel lists used by the evaluation loop.
transformation_names, transformations = map(list, zip(
    ('log', np.log),
    ('square root', np.sqrt),
    ('cube root', np.cbrt),
    ('reciprocal', lambda x: 1/x),
))


In [None]:
# Test each transformation with a Ridge penalty of alpha=1 and print the results
for label, transform in zip(transformation_names, transformations):
    mse = transform_and_evaluate(transform, 1)
    print("MSE for {} transformation: {:.2f}".format(label, mse))

# Finding the best model using mean squared error and the R2 score, applying the Box-Cox transformation, and experimenting with the GridSearchCV class

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import boxcox
from sklearn.model_selection import GridSearchCV

# Load the data (reloaded so this section starts from the raw CSV)
data = pd.read_csv("Salary_Data.csv")


In [None]:
# Splitting the data into independent and dependent variables by position:
# every column except the last forms the 2-D feature array, and the last
# column is the 1-D target.
X = data.iloc[:, :-1].to_numpy()
y = data.iloc[:, -1].to_numpy()

In [None]:
# Feature transformations to compare; X is still the 2-D raw feature array here
X_log = np.log(X)    # log transformation
X_sqrt = np.sqrt(X)  # square root transformation
X_cbrt = np.cbrt(X)  # cube root transformation

# Applying Box-Cox transformation on the flattened (1-D) feature.
# NOTE: this rebinds X itself to the Box-Cox-transformed values and discards
# the fitted lambda. From here on X no longer holds the raw feature, so this
# cell is not safe to re-run without reloading X first.
X, _ = boxcox(X.ravel())


In [None]:
# Fitting the linear regression model to the log-transformed data.
# The same `regressor` object is refitted for the other transformations later,
# so the predictions are captured right after each fit.
regressor = LinearRegression()
regressor.fit(X_log, y)
y_pred_log = regressor.predict(X_log)



In [None]:
# The single `regressor` object is reused: every fit() overwrites the previous
# model, so each prediction is taken immediately after its own fit.

# Square root features
regressor.fit(X_sqrt, y)
y_pred_sqrt = regressor.predict(X_sqrt)

# Cube root features
regressor.fit(X_cbrt, y)
y_pred_cbrt = regressor.predict(X_cbrt)

# Box-Cox features: the transformation cell rebinds X to the 1-D Box-Cox
# output, so reshape it into a column and give it the name X_boxcox. That name
# was previously used here without ever being defined, which raised NameError.
X = X.reshape(-1, 1)
X_boxcox = X

regressor.fit(X_boxcox, y)
y_pred_boxcox = regressor.predict(X_boxcox)



# regressor = LinearRegression()

# # Define the hyperparameters to search over
# param_grid = {'fit_intercept': [True, False], 'normalize': [True, False]}

# # Create a GridSearchCV object
# grid_search = GridSearchCV(regressor, param_grid, cv=5, scoring='r2')

# # Fit the GridSearchCV object to the data
# X = X.reshape(-1, 1)
# grid_search.fit(X, y)

# # Print the best hyperparameters
# print('Best hyperparameters:', grid_search.best_params_)

#Best hyperparameters: {'fit_intercept': True, 'normalize': False}

In [None]:
# Calculating the R squared and MSE for each transformation.
# All values are in-sample: each model is scored on the data it was fitted on.
print("R squared for log transformation: ", r2_score(y, y_pred_log))
print("MSE for log transformation: ", mean_squared_error(y, y_pred_log))

print("R squared for square root transformation: ", r2_score(y, y_pred_sqrt))
print("MSE for square root transformation: ", mean_squared_error(y, y_pred_sqrt))


print("R squared for cube root transformation: ", r2_score(y, y_pred_cbrt))
print("MSE for cube root transformation: ", mean_squared_error(y, y_pred_cbrt))


# Report the Box-Cox model with the same two metrics as the others. The old
# line called boxcox(y, X_boxcox), which passes the feature array as the
# lambda argument and re-transforms the target instead of scoring the model.
print("R squared for Box-Cox transformation: ", r2_score(y, y_pred_boxcox))
print("MSE for Box-Cox transformation: ", mean_squared_error(y, y_pred_boxcox))