In [3]:
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [4]:
df = pd.read_csv('Fish.csv')
df.head(5)

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,,34.0,12.444,5.134


In [5]:
feature_columns = ["Species", "Length1", "Length2", "Length3", "Height", "Width"]
label_column = "Weight"
features = df.loc[:, feature_columns]
label = df.loc[:, label_column]
X = features.values
y = label.values

# missing value

In [6]:
species_impute = SimpleImputer(np.nan, strategy = "most_frequent")
X[:, 0] = species_impute.fit_transform(X[:, 0].reshape(-1,1)).ravel()

continous_impute = SimpleImputer(np.nan, strategy = "mean")
X[:, 1:4] = continous_impute.fit_transform(X[:, 1:4])
new_df = pd.DataFrame.from_records(X, columns = feature_columns)
new_df.isnull().sum()

Species    0
Length1    0
Length2    0
Length3    0
Height     0
Width      0
dtype: int64

# Chuan hoa

In [7]:
encoder = LabelEncoder()
X[:, 0] = encoder.fit_transform(X[:, 0])
#encoder = OneHotEncoder()
#onehot = encoder.fit_transform(X[:, 0].reshape(-1,1)).toarray()
#X = np.delete(X, 0, axis=1)
#X = np.concatenate((onehot, X), axis=1)

# train/test

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [9]:
ones = np.ones((X_train.shape[0], 1))
X_bar = np.concatenate((ones, X_train), axis=1)
inverse = np.linalg.pinv(np.asarray(np.dot(X_bar.T, X_bar), dtype='float'))
w = np.dot(np.dot(X_bar.T, y_train), inverse)
w

array([-510.8059899475975, -78.32486284922015, -55.388481797012446,
       -117.33453770497772, 8.691661128654367e-14, -144.02482553963773,
       1.414829774634384e-13, -115.73328205736834, -13.880329075183283,
       22.536031986333, 10.967804084683507, -2.331211437396405,
       91.08930421664627], dtype=object)

In [10]:
from sklearn.linear_model import LinearRegression


In [11]:
linear = LinearRegression()

linear.fit(X_train, y_train)
linear.coef_

array([ 2.38363351e+01,  4.67727162e+01, -1.51733397e+01,  3.55271368e-15,
       -4.18636275e+01, -1.42108547e-14, -1.35720841e+01, -1.38803291e+01,
        2.25360320e+01,  1.09678041e+01, -2.33121144e+00,  9.10893042e+01])

# test

In [12]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [13]:
predict = linear.predict(X_test)
train = linear.predict(X_train)

print("Mean Squared Test Error: {}".format(np.sqrt(mean_squared_error(y_test, predict))))
print("Mean Squared Train Error: {}".format(np.sqrt(mean_squared_error(y_train, train))))

Mean Squared Test Error: 235.3284400756357
Mean Squared Train Error: 79.9764512948523


In [14]:
#regularization

In [15]:
from sklearn.linear_model import Ridge

In [16]:
ridge = Ridge(alpha = 1)
ridge.fit(X_train, y_train)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [17]:
predict = ridge.predict(X_test)
train = ridge.predict(X_train)

print("Mean Squared Test Error: {}".format(np.sqrt(mean_squared_error(y_test, predict))))
print("Mean Squared Train Error: {}".format(np.sqrt(mean_squared_error(y_train, train))))

Mean Squared Test Error: 236.5678762363247
Mean Squared Train Error: 80.03243685417725
