In [91]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import pickle

In [92]:
# Load the fish dataset and discard every row that contains a missing value
df = pd.read_csv('./homework/tekoäly_oppimistehtävä/datasets/fish_data.csv').dropna()

# Columns used as model inputs (names are kept exactly as they appear in the CSV)
feature_columns = [
    'average_length(inches))',
    'average_weight(inches))',
    'habitat',
    'color',
    'ph_of_water',
    'Gender',  # true = male, false = female
]
X = df[feature_columns]

# Target: life span, reshaped into a single-column 2-D array
y = df['life_span'].values.reshape(-1, 1)

In [93]:
# Summary statistics (count, mean, std, quartiles, ...) of the feature frame
print('Statistics:')
print(X.describe())

# Column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
print('Info:')
X.info()

Statistics:
       average_length(inches))  average_weight(inches))  ph_of_water
count              1976.000000              1976.000000  1976.000000
mean                 10.557586                10.449297     7.014727
std                   5.525760                 4.898631     0.577608
min                   1.000000                 2.000000     6.000000
25%                   5.850000                 6.127500     6.500000
50%                  10.680000                10.455000     7.000000
75%                  15.172500                14.700000     7.500000
max                  20.000000                18.960000     8.000000
Info:
<class 'pandas.core.frame.DataFrame'>
Index: 1976 entries, 0 to 1999
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   average_length(inches))  1976 non-null   float64
 1   average_weight(inches))  1976 non-null   float64
 2   habitat                  1976 non-null 

In [94]:
# Encode the categorical features as one-hot columns
X_org = X  # keep a reference to the un-encoded feature frame
categorical_features = ['habitat', 'color', 'Gender']
ct = ColumnTransformer(
    transformers=[
        # drop='first' omits the first category of each encoded feature
        ('encoder', OneHotEncoder(drop='first'), categorical_features),
    ],
    # columns not listed above are passed through unchanged
    remainder='passthrough',
)
X = ct.fit_transform(X_org)

In [95]:
# Hold out 20 % of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [96]:
# Scale the features using statistics learned from the training rows only, then
# apply the same fitted scaler to both splits.
# with_mean=False scales without centring -- presumably because the encoded
# feature matrix may be sparse (TODO confirm).
sc = StandardScaler(with_mean=False).fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [97]:
# Train a decision-tree regressor on the training set.
# random_state is fixed so that the fitted tree -- and therefore the reported
# metrics and the pickled model -- are reproducible across runs; without it the
# tree can differ between runs because features are randomly permuted at each
# split and ties between equally good splits are broken at random.
model = DecisionTreeRegressor(random_state=0)
model.fit(X_train, y_train)

In [98]:
# Run the fitted model on the held-out test features
y_pred = model.predict(X=X_test)

In [99]:
# Regression metrics on the held-out test set
mae = mean_absolute_error(y_test, y_pred)  # mean absolute error
r2 = r2_score(y_test, y_pred)              # coefficient of determination
# mean_squared_error returns the MSE (previously stored and printed under the
# misleading name "mea", easily confused with "mae"); its square root is the RMSE.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'r2:  {round(r2,4)}')
print(f'mae: {round(mae,4)}')
print(f'mse: {round(mse,4)}')
print(f'rmse: {round(rmse,4)}')

r2:  -1.1561
mae: 1.142
mea: 1.9531
rmse: 1.3975


In [100]:
# Persist the fitted regressor to disk as a pickle file.
# NOTE(review): only the estimator is saved here; the fitted ColumnTransformer
# (`ct`) and StandardScaler (`sc`) are also needed to prepare inputs for it --
# confirm they are persisted elsewhere.
model_path = './homework/tekoäly_oppimistehtävä/models/lifespan-dt-reg.pickle'
with open(model_path, mode='wb') as model_file:
    pickle.dump(model, model_file)