# In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import statsmodels.api as sm

# Load the dataset and replace the 'education' column with integer dummy
# columns (drop_first=True removes the first level, which becomes the baseline)
data = pd.get_dummies(
    pd.read_csv('salary_data_new.csv'),
    columns=['education'],
    drop_first=True,
    dtype=int,
)

# (1) Regress salary on every other column, plus an intercept
y = data['salary']
X1 = data.drop(columns=['salary'])
exog1 = sm.add_constant(X1)
model1 = sm.OLS(y, exog1).fit()
print(model1.summary())

# (2) Use only age with 2nd order polynomial
# include_bias=False: PolynomialFeatures would otherwise emit its own all-ones
# column named "1", which turns sm.add_constant (default has_constant='skip')
# into a silent no-op. Leaving the intercept to add_constant labels it "const",
# the same as in model (1). `poly` is reused by the polynomial fits below.
poly = PolynomialFeatures(degree=2, include_bias=False)
age_poly = poly.fit_transform(data[['age']])  # columns: age, age^2
X2 = pd.DataFrame(age_poly, columns=poly.get_feature_names_out(['age']))
model2 = sm.OLS(y, sm.add_constant(X2)).fit()
print(model2.summary())

# (3) Degree-2 polynomial expansion of Midterm as the only predictor
midterm_poly = poly.fit_transform(data[['Midterm']])
midterm_terms = poly.get_feature_names_out(['Midterm'])
X3 = pd.DataFrame(midterm_poly, columns=midterm_terms)
exog3 = sm.add_constant(X3)
model3 = sm.OLS(y, exog3).fit()
print(model3.summary())

# (4) Degree-2 polynomial expansion of age and Midterm together
# (with two inputs the expansion also contains the age*Midterm cross term)
combined_poly = poly.fit_transform(data[['age', 'Midterm']])
combined_terms = poly.get_feature_names_out(['age', 'Midterm'])
X4 = pd.DataFrame(combined_poly, columns=combined_terms)
exog4 = sm.add_constant(X4)
model4 = sm.OLS(y, exog4).fit()
print(model4.summary())

# (5) Use both age and Midterm with 2nd order polynomial and education
combined_poly_edu = poly.fit_transform(data[['age', 'Midterm']])
# Reuse data's index so the column-wise concat below pairs rows by label
# instead of relying on both frames carrying the same default index.
X5 = pd.DataFrame(
    combined_poly_edu,
    columns=poly.get_feature_names_out(['age', 'Midterm']),
    index=data.index,
)
# Take every dummy column that get_dummies produced for 'education' rather
# than hard-coding two names: which level drop_first removes depends on the
# category values in the CSV, so a fixed list can raise KeyError or silently
# leave a level out of the model.
# NOTE(review): assumes no unrelated column in the CSV starts with
# 'education_' - confirm against the data file.
education_cols = [col for col in data.columns if col.startswith('education_')]
X5 = pd.concat([X5, data[education_cols]], axis=1)
model5 = sm.OLS(y, sm.add_constant(X5)).fit()
print(model5.summary())
