In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer, PowerTransformer, MinMaxScaler, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, KFold, LeaveOneGroupOut, GridSearchCV, RandomizedSearchCV 

from mlxtend.evaluate import bias_variance_decomp

ModuleNotFoundError: No module named 'mlxtend'

In [None]:
# Load the raw abalone table. header=None tells pandas the file has no header row,
# so the columns get integer labels until they are renamed in a later cell.
data_abalone = pd.read_csv('../Data/abalone/abalone.data', header=None)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
data_abalone.head()

In [None]:
# (n_rows, n_columns) of the raw table.
data_abalone.shape

In [None]:
# Column dtypes as inferred by read_csv.
data_abalone.dtypes

In [None]:
# Give the nine positional columns descriptive names; 'rings' is the regression target
# used by the cells below.
data_abalone.columns = [
    'sex',
    'length', 'diameter', 'height',
    'whole_weight', 'shucked_weight', 'viscera_weight', 'shell_weight',
    'rings',
]

In [None]:
# Confirm the new column names on a short preview rather than the whole frame.
data_abalone.head()

In [None]:
# Summary of the raw frame: dtypes, non-null counts and memory usage.
data_abalone.info()

In [None]:
# Work on a copy so the raw frame stays untouched by the transformations below.
df = data_abalone.copy()

In [None]:
# Baseline summary of the working copy before any dtype changes.
df.info()

In [None]:
# Cast 'sex' to a categorical and the 'rings' target to float, one column at a time.
for column, dtype in (('sex', 'category'), ('rings', 'float64')):
    df[[column]] = df[[column]].astype(dtype)

In [None]:
# Check the dtypes after the casts in the previous cell.
df.dtypes

In [None]:
# One-hot encode 'sex' into float indicator columns (sex_F / sex_I / sex_M).
# NOTE: this cell consumes the 'sex' column, so it cannot be re-run without first
# re-creating df from data_abalone.
df = pd.get_dummies(df, columns=['sex'], dtype=np.float64)
# Preview the first rows only instead of rendering the whole frame.
df.head()

In [None]:
# Lower-case the indicator column names. Rebinding (instead of inplace=True) keeps the
# cell free of in-place mutation of a frame that earlier cells already displayed.
df = df.rename(columns={'sex_F':'sex_f', 'sex_I':'sex_i', 'sex_M':'sex_m'})
# Preview the first rows only instead of rendering the whole frame.
df.head()

In [None]:
# Descriptive statistics (count, mean, std, quartiles) for every numeric column.
df.describe()

In [None]:
# Distribution of the regression target.
# The frame has no 'price' column (that lookup raises KeyError); the target here is
# 'rings'. histplot(kde=True) replaces the deprecated distplot.
plt.title('Rings Distribution Plot')
sns.histplot(df['rings'], kde=True)
plt.show()

In [None]:
# Correlation heatmap. np.triu keeps the diagonal and everything above it; seaborn hides
# every cell where the mask is truthy, so only the part below the diagonal is drawn.
corr_matrix = df.corr()
upper_mask = np.triu(corr_matrix)

plt.figure(figsize=(11, 11))
sns.heatmap(corr_matrix, annot=True, mask=upper_mask)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df.head()

In [None]:
# Shell weight against ring count on the raw frame; alpha=0.3 lets overlapping points show density.
plt.scatter(data_abalone['shell_weight'], data_abalone['rings'], alpha=0.3)

In [None]:
# Full design matrix: every column except the 'rings' target.
x_set = df.drop(columns=['rings']).to_numpy(copy=True)
y_set = df['rings'].to_numpy(copy=True)

# Single-feature variant: shell weight only, same target.
x_set_mini = df['shell_weight'].to_numpy(copy=True)
y_set_mini = df['rings'].to_numpy(copy=True)

x_set.shape, y_set.shape, x_set_mini.shape, y_set_mini.shape

In [None]:
# Rebuild the encoded frame from the raw data in one cell: cast dtypes, one-hot encode
# 'sex', and lower-case the indicator column names.
dfe = data_abalone.copy()
dfe[['sex']] = dfe[['sex']].astype('category')
dfe[['rings']] = dfe[['rings']].astype('float64')
dfe = pd.get_dummies(dfe, columns=['sex'], dtype=np.float64)
dfe = dfe.rename(columns={'sex_F':'sex_f', 'sex_I':'sex_i', 'sex_M':'sex_m'})

# Build the arrays from dfe, the frame prepared in this cell. The original read from
# df, which left dfe unused and made this cell depend on state from earlier cells.
x_set1 = np.array(dfe.drop(['rings'], axis=1))
y_set1 = np.array(dfe['rings'])
x_set1.shape, y_set1.shape

In [None]:
# Min-max scale the target. The scaler expects a 2-D input, so the 1-D target is
# viewed as a single column first; the result stays 2-D with shape (n, 1).
standard = MinMaxScaler()
y_column = y_set1[:, np.newaxis]
y_set1 = standard.fit(y_column).transform(y_column)

In [None]:
# Per-feature statistics of the design matrix (wrapped in a DataFrame only for display).
pd.DataFrame(x_set1).describe()

In [None]:
# Degree-2 polynomial regression on all features against the min-max scaled target,
# trained one pass at a time so the learning curve can be recorded per epoch.
loss_hist = []
score_hist = []
lr = 0.03

n_epochs = 3000
d = 2
# Seed the estimator so the run is reproducible; 14 is the seed the later
# experiments in this notebook use.
random_seed = 14

polynomial = PolynomialFeatures(degree=d)
model = SGDRegressor(eta0=lr, random_state=random_seed)

x_set_poly = polynomial.fit_transform(x_set)
# y_set1 is a column vector after scaling; the estimator and metrics want it flat.
y_target = y_set1.ravel()

for epoch in range(n_epochs):
    # partial_fit performs a single pass over the data per call.
    model.partial_fit(x_set_poly, y_target)

    y_hat = model.predict(x_set_poly)
    loss_train = mean_absolute_error(y_target, y_hat)
    loss_hist.append(loss_train)

    score = r2_score(y_target, y_hat)
    score_hist.append(score)

    if (epoch+1) % 100 == 0:
        print(f'Epoch={epoch}, \t Loss={loss_train:.4},\t score={score:.4}')

print(f'Model weights: {model.coef_}')

In [None]:
# Spot-check one sample: predicted vs. actual ring count, in original units.
q = 202
x_set_poly = polynomial.transform(x_set[[q], :])
y = model.predict(x_set_poly)

# MinMaxScaler maps the fitted minimum to 0, so undoing it needs an offset as well as
# a scale; multiplying by a constant alone is not the inverse. Let the fitted scaler
# invert itself. It expects 2-D input, hence the reshape and the [[q]] row selection.
rings_pred = standard.inverse_transform(y.reshape(-1, 1))
rings_true = standard.inverse_transform(y_set1[[q]])
rings_pred.round(), rings_true.round()

# test1

In [None]:
# Empty (loss, score) history lists for three separate runs; every name gets its own list.
loss1_hist, score1_hist = [], []
loss2_hist, score2_hist = [], []
loss3_hist, score3_hist = [], []

In [None]:
# Single-feature linear regression trained at three learning rates.
# The plots below label curves for lr=0.1, 0.01 and 0.001, but the original cell only
# trained lr=0.01 (filling loss2_hist/score2_hist), so the other curves were empty on
# a fresh Restart & Run All. Train all three here.
n_epochs = 3000
random_seed = 14

# learning rate -> (loss history, score history) filled by that run
histories_by_lr = {
    0.1: (loss1_hist, score1_hist),
    0.01: (loss2_hist, score2_hist),
    0.001: (loss3_hist, score3_hist),
}

x_mini_column = x_set_mini[:, None]  # estimator expects a 2-D feature matrix
y_mini_flat = y_set_mini.ravel()
models_by_lr = {}

for lr, (run_losses, run_scores) in histories_by_lr.items():
    # Start from empty histories so re-running the cell does not append to old curves.
    run_losses.clear()
    run_scores.clear()

    model = SGDRegressor(eta0=lr, random_state=random_seed)

    for epoch in range(n_epochs):
        # partial_fit performs a single pass over the data per call.
        model.partial_fit(x_mini_column, y_mini_flat)

        y_hat = model.predict(x_mini_column)
        loss_train = mean_absolute_error(y_set_mini, y_hat)
        run_losses.append(loss_train)

        score = r2_score(y_set_mini, y_hat)
        run_scores.append(score)

        if (epoch+1) % 100 == 0 or epoch<10:
            print(f'lr={lr}, Epoch={epoch}, \t Loss={loss_train:.4},\t score={score:.4}')

    models_by_lr[lr] = model
    print()

# Keep the lr=0.01 model as `model`, which is what the original cell left behind
# and what the regression-line plot further down uses.
lr = 0.01
model = models_by_lr[lr]
print(f'Model weights: {model.coef_}')

In [None]:
# Side-by-side learning curves (loss on the left, R2 on the right), one line per run.
fig, ax = plt.subplots(1, 2, figsize=(12, 6))

lr_labels = ('lr=0.1', 'lr=0.01', 'lr=0.001')
panels = (
    (ax[0], (loss1_hist, loss2_hist, loss3_hist), 'Learning Curve for Loss'),
    (ax[1], (score1_hist, score2_hist, score3_hist), 'Learning Curve for R2_score'),
)

for axis, histories, title in panels:
    for history, label in zip(histories, lr_labels):
        axis.plot(history, label=label)
    axis.legend()
    axis.grid()
    axis.set_title(title)

In [None]:
# Loss learning curves on a single axis, one line per run.
loss_runs = (
    (loss1_hist, 'lr=0.1'),
    (loss2_hist, 'lr=0.01'),
    (loss3_hist, 'lr=0.001'),
)
for history, label in loss_runs:
    plt.plot(history, label=label)

plt.legend()
plt.grid()
plt.title('Learning Curve for Loss');

In [None]:
# R2 learning curves on a single axis, one line per run.
score_runs = (
    (score1_hist, 'lr=0.1'),
    (score2_hist, 'lr=0.01'),
    (score3_hist, 'lr=0.001'),
)
for history, label in score_runs:
    plt.plot(history, label=label)

plt.legend()
plt.grid()
plt.title('Learning Curve for R2_score');

In [None]:
# Overlay the fitted line on the data: evaluate the model on 100 evenly spaced
# shell-weight values spanning the observed range.
x_low, x_high = x_set_mini.min(), x_set_mini.max()
x = np.linspace(x_low, x_high, 100).reshape(-1, 1)
y_hat = model.predict(x)

plt.scatter(x_set_mini, y_set_mini, alpha=0.2)
plt.plot(x, y_hat, 'r', linewidth=3)
plt.title('linear regression on data');

# test2

In [None]:
# Three independent history lists each for loss and score (one slot per run).
loss_hist = [[] for _ in range(3)]
score_hist = [[] for _ in range(3)]

In [None]:
# Single-feature polynomial regression for degrees 1..3, recording per-epoch
# training loss (MAE) and R2 for each degree.
lr = 0.01
n_epochs = 3000
ds = [1, 2, 3]
random_seed = 14
polynomials = []
models = []

y_mini_flat = y_set_mini.ravel()

for d in ds:
    polynomial = PolynomialFeatures(degree=d)
    model = SGDRegressor(eta0=lr, random_state=random_seed)
    x_set_poly = polynomial.fit_transform(x_set_mini[:, None])

    # History slots are addressed by degree: degree d writes to index d-1.
    degree_losses = loss_hist[d-1]
    degree_scores = score_hist[d-1]

    for epoch in range(n_epochs):
        # partial_fit performs a single pass over the data per call.
        model.partial_fit(x_set_poly, y_mini_flat)
        y_hat = model.predict(x_set_poly)

        loss_train = mean_absolute_error(y_set_mini, y_hat)
        score = r2_score(y_set_mini, y_hat)
        degree_losses.append(loss_train)
        degree_scores.append(score)

        # Log the first ten epochs and then every hundredth.
        should_log = epoch < 10 or (epoch + 1) % 100 == 0
        if should_log:
            print(f'd={d}, Epoch={epoch}, \t Loss={loss_train:.4},\t score={score:.4}')

    # Keep the fitted transformer and model for the plotting cells.
    polynomials.append(polynomial)
    models.append(model)

    # Two blank lines between the logs of consecutive degrees.
    print('\n')

In [None]:
# Learning curves per polynomial degree: loss on the left axis, R2 on the right.
c = ['r', 'orange', 'g']
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
loss_ax, score_ax = ax

for d in ds:
    slot = d - 1  # histories and colours are stored by degree minus one
    loss_ax.plot(loss_hist[slot], color=c[slot], label=f'd={d}')
    score_ax.plot(score_hist[slot], color=c[slot], label=f'd={d}')

for axis, title in ((loss_ax, 'Learning Curve for Loss'),
                    (score_ax, 'Learning Curve for R2_score')):
    axis.legend()
    axis.grid()
    axis.set_title(title)

In [None]:
# Overlay each degree's fitted curve on the data.
plt.scatter(x_set_mini, y_set_mini, alpha=0.2)
c = ['r', 'orange', 'g']

# The evaluation grid is the same for every degree: 100 points over the observed range.
x = np.linspace(x_set_mini.min(), x_set_mini.max(), 100)[:, None]

for d in ds:
    slot = d - 1
    xp = polynomials[slot].transform(x)
    y_hat = models[slot].predict(xp)
    plt.plot(x, y_hat, color=c[slot], linewidth=3, label=f'd={d}')

plt.legend()
plt.title('polynomial regression on data');

# test3

In [None]:
# Four independent history lists each for loss and score (one slot per run).
loss_hist = [[] for _ in range(4)]
score_hist = [[] for _ in range(4)]

In [None]:
def func(x):
    """Inverse-cubic basis: 1 / (x + 0.9)^3."""
    return 1/(x + 0.9)**3


def log(x):
    """Shifted natural-log basis: ln(x + 0.3)."""
    return np.log(x+0.3)

In [None]:
# Single-feature regression under four alternative input transforms, recording
# per-epoch training loss (MAE) and R2 for each.
lr = 0.01
n_epochs = 1000
fs = {'power_transform':'yeo_johnson', 'log(x+0.3)':log, 'exp(x)':np.exp, '1/(x+0.9)^3':func}
random_seed = 14
basis_function = []
models = []

y_mini_flat = y_set_mini.ravel()

for i, (k, f) in enumerate(fs.items()):
    # The first entry's value is only a placeholder string: that run uses a
    # default-constructed PowerTransformer. Every other entry wraps its callable.
    basis = PowerTransformer() if i == 0 else FunctionTransformer(f)
    model = SGDRegressor(eta0=lr, random_state=random_seed)
    x_set_poly = basis.fit_transform(x_set_mini[:, None])

    run_losses = loss_hist[i]
    run_scores = score_hist[i]

    for epoch in range(n_epochs):
        # partial_fit performs a single pass over the data per call.
        model.partial_fit(x_set_poly, y_mini_flat)
        y_hat = model.predict(x_set_poly)

        loss_train = mean_absolute_error(y_set_mini, y_hat)
        score = r2_score(y_set_mini, y_hat)
        run_losses.append(loss_train)
        run_scores.append(score)

        # Log the first ten epochs and then every hundredth.
        should_log = epoch < 10 or (epoch + 1) % 100 == 0
        if should_log:
            print(f'f={k}, Epoch={epoch}, \t Loss={loss_train:.4},\t score={score:.4}')

    # Keep the fitted transformer and model for the plotting cells.
    basis_function.append(basis)
    models.append(model)

    # Two blank lines between the logs of consecutive transforms.
    print('\n')

In [None]:
# Learning curves per input transform: loss on the left axis, R2 on the right.
c = ['r', 'orange', 'g', 'c']
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
loss_ax, score_ax = ax

for i, (k, v) in enumerate(fs.items()):
    colour = c[i]
    loss_ax.plot(loss_hist[i], color=colour, label=f'function={k}')
    score_ax.plot(score_hist[i], color=colour, label=f'function={k}')

for axis, title in ((loss_ax, 'Learning Curve for Loss'),
                    (score_ax, 'Learning Curve for R2_score')):
    axis.legend()
    axis.grid()
    axis.set_title(title)

In [None]:
# Overlay each transform's fitted curve on the data.
c = ['r', 'orange', 'g', 'c']
plt.scatter(x_set_mini, y_set_mini, alpha=0.2)

# The evaluation grid is the same for every transform: 100 points over the observed range.
x = np.linspace(x_set_mini.min(), x_set_mini.max(), 100)[:, None]

for i, (k, f) in enumerate(fs.items()):
    xp = basis_function[i].transform(x)
    y_hat = models[i].predict(xp)
    plt.plot(x, y_hat, color=c[i], linewidth=3, label=f'function={k}')

plt.legend()
plt.title('Other basis function regression on data');

# test4

In [None]:
# Rebuild the single-feature arrays, then keep only the "train" side of a shuffled
# split as a small working subset; the remaining 88.01% is discarded.
x_set_mini = df['shell_weight'].to_numpy(copy=True)
y_set_mini = df['rings'].to_numpy(copy=True)

x_mini, _, y_mini, _ = train_test_split(
    x_set_mini,
    y_set_mini,
    test_size=0.8801,
    random_state=14,
    shuffle=True,
)

x_mini.shape, y_mini.shape

In [None]:
# Split the subset into train and validation parts. test_size=0.8 sends 80% of the
# samples to validation and leaves 20% for training.
# NOTE(review): the small training share looks intentional for this experiment — confirm.
x_train, x_valid, y_train, y_valid = train_test_split(x_mini, y_mini, test_size=0.8, random_state=14, shuffle=True)
x_train.shape, y_train.shape, x_valid.shape, y_valid.shape

In [None]:
# Training points drawn opaque, validation points drawn faint, on the same axes.
plt.scatter(x_train, y_train, alpha=1)
plt.scatter(x_valid, y_valid, alpha=0.3)

In [None]:
# One history slot per run for each of the four tracked series (train/valid x loss/score).
loss_train_hist, score_train_hist = [[]], [[]]
loss_valid_hist, score_valid_hist = [[]], [[]]

In [None]:
def func(x):
    """Inverse-cubic basis: 1 / (x + 0.9)^3."""
    return 1/(x + 0.9)**3

In [None]:
# Train on the small training split and track MAE / R2 on both the training and the
# validation split after every epoch.
lr = 0.01
n_epochs = 1000
fs = {'1/(x+0.9)^3':func}
random_seed = 14
basis_function = []
models = []

y_train_flat = y_train.ravel()

for i, (k, f) in enumerate(fs.items()):
    basis = FunctionTransformer(f)
    model = SGDRegressor(eta0=lr, random_state=random_seed)

    # Fit the transform on the training inputs and reuse it for validation.
    x_train_poly = basis.fit_transform(x_train[:, None])
    x_valid_poly = basis.transform(x_valid[:, None])

    for epoch in range(n_epochs):
        # partial_fit performs a single pass over the training data per call.
        model.partial_fit(x_train_poly, y_train_flat)

        # Training metrics.
        y_hat = model.predict(x_train_poly)
        loss_train = mean_absolute_error(y_train, y_hat)
        score_train = r2_score(y_train, y_hat)
        loss_train_hist[i].append(loss_train)
        score_train_hist[i].append(score_train)

        # Validation metrics (y_hat is rebound to the validation predictions).
        y_hat = model.predict(x_valid_poly)
        loss_valid = mean_absolute_error(y_valid, y_hat)
        score_valid = r2_score(y_valid, y_hat)
        loss_valid_hist[i].append(loss_valid)
        score_valid_hist[i].append(score_valid)

        # Log the first ten epochs and then every hundredth.
        should_log = epoch < 10 or (epoch + 1) % 100 == 0
        if should_log:
            print(f'f={k}, Epoch={epoch}, \t Loss_train={loss_train:.4},\t score_train={score_train:.4}, \t Loss_valid={loss_valid:.4},\t score_valid={score_valid:.4}')

    # Keep the fitted transformer and model for the plotting cells.
    basis_function.append(basis)
    models.append(model)

    # Two blank lines after each run's log.
    print('\n')

In [None]:

# Train vs. validation learning curves: loss on the left axis, R2 on the right.
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
loss_ax, score_ax = ax

for i, (k, v) in enumerate(fs.items()):
    # Train is plotted before valid on each axis so the default colour order is stable.
    loss_ax.plot(loss_train_hist[i], label=f'train={k}')
    score_ax.plot(score_train_hist[i], label=f'train={k}')

    loss_ax.plot(loss_valid_hist[i], label=f'valid={k}')
    score_ax.plot(score_valid_hist[i], label=f'valid={k}')

for axis, title in ((loss_ax, 'Learning Curve for Loss'),
                    (score_ax, 'Learning Curve for R2_score')):
    axis.legend()
    axis.grid()
    axis.set_title(title)

In [None]:

# Overlay the fitted curve on the whole subset (train and validation points together).
plt.scatter(x_mini, y_mini, alpha=0.2)

# Evaluation grid: 100 points over the subset's observed range.
x = np.linspace(x_mini.min(), x_mini.max(), 100)[:, None]

for i, (k, f) in enumerate(fs.items()):
    xp = basis_function[i].transform(x)
    y_hat = models[i].predict(xp)
    plt.plot(x, y_hat, linewidth=3, label=f'function={k}')

plt.legend()
plt.title('Other basis function regression on data');