In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import (MinMaxScaler, StandardScaler)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn import metrics
from math import sqrt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor, LinearRegression, Lasso, Ridge

## Regression Scools Test Scores


California Test Score Data
Description
The dataset contains data on test performance, school characteristics and student demographic backgrounds for school districts in California.

Usage
data("CASchools")
Format
A data frame containing 420 observations on 14 variables.

district
character. District code.

school
character. School name.

county
factor indicating county.

grades
factor indicating grade span of district.

students
Total enrollment.

teachers
Number of teachers.

calworks
Percent qualifying for CalWorks (income assistance).

lunch
Percent qualifying for reduced-price lunch.

computer
Number of computers.

expenditure
Expenditure per student.

income
District average income (in USD 1,000).

english
Percent of English learners.

read
Average reading score.

math
Average math score.

Details
The data used here are from all 420 K-6 and K-8 districts in California with data available for 1998 and 1999. Test scores are on the Stanford 9 standardized test administered to 5th grade students. School characteristics (averaged across the district) include enrollment, number of teachers (measured as “full-time equivalents”, number of computers per classroom, and expenditures per student. Demographic variables for the students are averaged across the district. The demographic variables include the percentage of students in the public assistance program CalWorks (formerly AFDC), the percentage of students that qualify for a reduced price lunch, and the percentage of students that are English learners (that is, students for whom English is a second language).

Source
Online complements to Stock and Watson (2007).

References
Stock, J. H. and Watson, M. W. (2007). Introduction to Econometrics, 2nd ed. Boston: Addison Wesley.

See Also
StockWatson2007, MASchools

In [2]:
# school= pd.read_csv('https://vincentarelbundock.github.io/Rdatasets/csv/AER/CASchools.csv')
school= pd.read_csv('CASchools.csv').iloc[:, 5:13]
#Percent qualifying for CalWorks (income assistance) must be dropped related to income variable
school.drop(columns=["calworks"], inplace=True)
school

Unnamed: 0,students,teachers,lunch,computer,expenditure,income,english
0,195,10.900000,2.040800,67,6384.911133,22.690001,0.000000
1,240,11.150000,47.916698,101,5099.380859,9.824000,4.583333
2,1550,82.900002,76.322601,169,5501.954590,8.978000,30.000002
3,243,14.000000,77.049202,85,7101.831055,8.978000,0.000000
4,1335,71.500000,78.427002,171,5235.987793,9.080333,13.857677
...,...,...,...,...,...,...,...
415,984,59.730000,3.556900,195,7290.338867,28.716999,5.995935
416,3724,208.479996,1.503800,721,5741.462891,41.734108,4.726101
417,441,20.150000,37.193802,45,4402.831543,23.733000,24.263039
418,101,5.000000,59.405899,14,4776.336426,9.952000,2.970297


In [3]:
school.isnull().sum()

students       0
teachers       0
lunch          0
computer       0
expenditure    0
income         0
english        0
dtype: int64

In [4]:
columns=list(school.columns)
for col in columns:
    print(col, ": ")
    print(school[col].unique())
    print()

students : 
[  195   240  1550   243  1335   137   888   379  2247   446   987   103
   487   649   852   491   421  6880  2688   440   475  2538   476  2357
  1588  7306  2601   847   452  4142  2102 10012  2488 25151  2267  1657
   284  5370  2471 15386   184  1217  6219  4258  1235 16244   814 27176
 10696  8935  1600  9028 10625  7151  2404  5804  2253  2807  3074   723
  5138 20927  3017   957  1639  4340  5079  6639  1154   237  2987   499
 11474  1088  2660   353   329   252   175  3835   314  4458  1313   474
  1114  1358 11629  6195   417   300   457   146   460   354  1841  3760
   500  5112  2141   610   337  4501  5718 19402  3401  2621   426   205
 13668   342  6518   239  2911  6272 10218  1735   544  1987   418   196
  2208  1255  1469  7114  1962  7761   216   224  7887   752  9328   548
   104   275   443 10337   806   227  8416   149   220  4612   590   133
  2440   519   222   285  3129  2019  5620  9775   246  7210 21338   477
   727   374 18255  8787   797   140   

In [5]:
standardScaler=StandardScaler()
X=school.drop(columns=["english"])
X=standardScaler.fit_transform(X)
y=school["english"]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
print("X_train shape: ",X_train.shape)
print("X_test shape: ",X_test.shape)
print("y_train shape: ",y_train.shape)
print("y_test shape: ",y_test.shape)

X_train shape:  (294, 6)
X_test shape:  (126, 6)
y_train shape:  (294,)
y_test shape:  (126,)


### a)	Construct a linear regression model using ordinary least squares method by applying the .LinearRegression() constructor in sklearn and find the training and test accuracy of this model using mean square error (mse). https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html 

In [7]:
lr=LinearRegression()
lr=lr.fit(X_train, y_train)
y_train_pred=lr.predict(X_train)
y_test_pred=lr.predict(X_test)
print("Training: ", metrics.mean_squared_error(y_train, y_train_pred))
print("Test: ", metrics.mean_squared_error(y_test, y_test_pred))

Training:  133.1381102560794
Test:  217.5660605744103


### b)	Check for overfitting. Is there overfitting? Support your answer with some results you generated. 

In [8]:
print("Training RMSE: ", np.sqrt(metrics.mean_squared_error(y_train, y_train_pred)))
print("Test RMSE: ", np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

Training RMSE:  11.538548879996972
Test RMSE:  14.750120696943815


Based on the results of MSE and RMSE there does not appear to be overfitting. The training MSE is lower than the testing MSE, but it could be argued that the difference is not significant. After tuning the model, the results should be improved.

### c)	Fit a lasso regression on the data and check the training and test accuracy of the model using mse. Use the default alpha or penalty constant. https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html

In [9]:
lasso= Lasso()
lasso=lasso.fit(X_train, y_train)
y_train_pred=lasso.predict(X_train)
y_test_pred=lasso.predict(X_test)
print("Training: ", metrics.mean_squared_error(y_train, y_train_pred))
print("Test: ", metrics.mean_squared_error(y_test, y_test_pred))

Training:  141.92810557923744
Test:  229.7032875678285


In [10]:
print("Training RMSE: ", np.sqrt(metrics.mean_squared_error(y_train, y_train_pred)))
print("Test RMSE: ", np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

Training RMSE:  11.913358283004731
Test RMSE:  15.15596541193693


### d)	Fit a ridge regression on the data and check the training and test accuracy of the model. Use the default alpha or penalty constant. https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html 

In [11]:
ridge=Ridge()
ridge=ridge.fit(X_train, y_train)
y_train_pred=ridge.predict(X_train)
y_test_pred=ridge.predict(X_test)
print("Training: ", metrics.mean_squared_error(y_train, y_train_pred))
print("Test: ", metrics.mean_squared_error(y_test, y_test_pred))

Training:  133.15336562346812
Test:  217.9567427672624


In [12]:
print("Training RMSE: ", np.sqrt(metrics.mean_squared_error(y_train, y_train_pred)))
print("Test RMSE: ", np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

Training RMSE:  11.53920992197768
Test RMSE:  14.76335811281642


### e)	Which model is better, OLS, Lasso, or Ridge regression? 

Linear and Ridge regression are similar. The Lasso regression appears to have performed the worst.

### f)	Tune the alpha hyperparameters of the lasso and ridge regression using any tuning technique of your choice? What is the best alpha value for the lasso regression and what is the best alpha value for the ridge regression?

#### Lasso Tuning

In [13]:
lasso.get_params()

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': 1000,
 'normalize': False,
 'positive': False,
 'precompute': False,
 'random_state': None,
 'selection': 'cyclic',
 'tol': 0.0001,
 'warm_start': False}

In [14]:
param_grid = {'alpha': np.arange(0.01, 1, .01),
              'max_iter':[1000,5000,10000,100000, 1000000],
             'random_state':[42] }
grid_search= GridSearchCV(lasso, param_grid, scoring="neg_mean_squared_error",cv=3)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
grid_search.best_score_

{'alpha': 0.09999999999999999, 'max_iter': 1000, 'random_state': 42}


-136.76351692817855

In [15]:
grid_search.score(X_train, y_train)

-133.4778165473199

In [16]:
grid_search.score(X_test, y_test)

-217.8053958967561

In [17]:
print("Training RMSE: ", np.sqrt(-1*grid_search.score(X_train, y_train)))
print("Test RMSE: ", np.sqrt(-1*grid_search.score(X_test, y_test)))

Training RMSE:  11.553259996525652
Test RMSE:  14.75823146236554


In [18]:
lassoBest= Lasso(alpha=0.09999999999999999, max_iter=1000, random_state=42)
lassoBest=lassoBest.fit(X_train, y_train)
y_train_pred=lassoBest.predict(X_train)
y_test_pred=lassoBest.predict(X_test)
print("Training: ", metrics.mean_squared_error(y_train, y_train_pred))
print("Test: ", metrics.mean_squared_error(y_test, y_test_pred))

Training:  133.4778165473199
Test:  217.8053958967561


In [19]:
print("Training RMSE: ", np.sqrt(metrics.mean_squared_error(y_train, y_train_pred)))
print("Test RMSE: ", np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

Training RMSE:  11.553259996525652
Test RMSE:  14.75823146236554


#### Ridge Tuning

In [20]:
ridge.get_params()

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': None,
 'normalize': False,
 'random_state': None,
 'solver': 'auto',
 'tol': 0.001}

In [21]:
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'max_iter':[100,500,1000,5000, 10000],
             'random_state':[42] }
grid_search= GridSearchCV(ridge, param_grid, scoring="neg_mean_squared_error",cv=3)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
grid_search.best_score_

{'alpha': 10, 'max_iter': 100, 'random_state': 42}


-137.43347804629363

In [22]:
grid_search.score(X_train, y_train)

-133.76534064826066

In [23]:
grid_search.score(X_test, y_test)

-220.55318011435727

In [24]:
print("Training RMSE: ", np.sqrt(-1*grid_search.score(X_train, y_train)))
print("Test RMSE: ", np.sqrt(-1*grid_search.score(X_test, y_test)))

Training RMSE:  11.565696721264166
Test RMSE:  14.851032964556953


In [25]:
ridgeBest=Ridge(alpha=10, max_iter=100, random_state=42)
ridgeBest=ridgeBest.fit(X_train, y_train)
y_train_pred=ridgeBest.predict(X_train)
y_test_pred=ridgeBest.predict(X_test)
print("Training: ", metrics.mean_squared_error(y_train, y_train_pred))
print("Test: ", metrics.mean_squared_error(y_test, y_test_pred))

Training:  133.76534064826066
Test:  220.55318011435727


In [26]:
print("Training RMSE: ", np.sqrt(metrics.mean_squared_error(y_train, y_train_pred)))
print("Test RMSE: ", np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

Training RMSE:  11.565696721264166
Test RMSE:  14.851032964556953


### 