# Possum - Age Prediction with Polynomial Linear Regression

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Load dataset

In [2]:
df = pd.read_csv('possum.csv')
df.head()

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,1,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,2,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,3,1,Vic,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,4,1,Vic,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,5,1,Vic,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0


In [3]:
X_total = df.iloc[:, 1:].values
print(X_total)

[[1 'Vic' 'm' ... 15.2 28.0 36.0]
 [1 'Vic' 'f' ... 16.0 28.5 33.0]
 [1 'Vic' 'f' ... 15.5 30.0 34.0]
 ...
 [7 'other' 'f' ... 13.0 25.0 30.0]
 [7 'other' 'm' ... 15.4 25.0 29.0]
 [7 'other' 'f' ... 14.8 28.5 33.5]]


## Preprocessing - taking care of missing data

In [4]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X_total[:, 3:])
X_total[:, 3:] = imputer.transform(X_total[:, 3:])
print(X_total)

[[1 'Vic' 'm' ... 15.2 28.0 36.0]
 [1 'Vic' 'f' ... 16.0 28.5 33.0]
 [1 'Vic' 'f' ... 15.5 30.0 34.0]
 ...
 [7 'other' 'f' ... 13.0 25.0 30.0]
 [7 'other' 'm' ... 15.4 25.0 29.0]
 [7 'other' 'f' ... 14.8 28.5 33.5]]


## Preprocessing - reorder columns

In [5]:
# move the column age to the right as it should be the independent column
X_total = X_total[:, [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 3]]
print(X_total)

[[1 'Vic' 'm' ... 28.0 36.0 8.0]
 [1 'Vic' 'f' ... 28.5 33.0 6.0]
 [1 'Vic' 'f' ... 30.0 34.0 6.0]
 ...
 [7 'other' 'f' ... 25.0 30.0 6.0]
 [7 'other' 'm' ... 25.0 29.0 4.0]
 [7 'other' 'f' ... 28.5 33.5 3.0]]


## Select dependent and independent variables

In [6]:
X = X_total[:, 0:-1]
y = X_total[:, -1]
print(X)
print(y)

[[1 'Vic' 'm' ... 15.2 28.0 36.0]
 [1 'Vic' 'f' ... 16.0 28.5 33.0]
 [1 'Vic' 'f' ... 15.5 30.0 34.0]
 ...
 [7 'other' 'f' ... 13.0 25.0 30.0]
 [7 'other' 'm' ... 15.4 25.0 29.0]
 [7 'other' 'f' ... 14.8 28.5 33.5]]
[8.0 6.0 6.0 6.0 2.0 1.0 2.0 6.0 9.0 6.0 9.0 5.0 5.0 3.0 5.0 4.0 1.0 2.0
 5.0 4.0 3.0 3.0 4.0 2.0 3.0 7.0 2.0 4.0 3.0 2.0 3.0 4.0 3.0 2.0 4.0 7.0
 2.0 7.0 1.0 3.0 5.0 3.0 2.0 3.8333333333333335 3.0 3.8333333333333335 2.0
 5.0 4.0 5.0 5.0 6.0 3.0 7.0 2.0 3.0 4.0 3.0 2.0 2.0 7.0 3.0 6.0 3.0 5.0
 3.0 4.0 5.0 5.0 7.0 6.0 1.0 1.0 4.0 6.0 5.0 6.0 1.0 1.0 1.0 3.0 4.0 3.0
 3.0 3.0 3.0 2.0 2.0 6.0 3.0 3.0 2.0 3.0 7.0 4.0 4.0 3.0 5.0 3.0 1.0 1.0
 6.0 4.0 3.0]


## Preprocessing - encode categorical data

In [7]:
# encode the columns Pop and sex (binary categorical data - we only need LabelEncoder)
from sklearn.preprocessing import LabelEncoder
le_Pop = LabelEncoder()
X[:, 1] = le_Pop.fit_transform(X[:, 1])
le_sex = LabelEncoder()
X[:, 2] = le_sex.fit_transform(X[:, 2])
print(X)

[[1 0 1 ... 15.2 28.0 36.0]
 [1 0 0 ... 16.0 28.5 33.0]
 [1 0 0 ... 15.5 30.0 34.0]
 ...
 [7 1 0 ... 13.0 25.0 30.0]
 [7 1 1 ... 15.4 25.0 29.0]
 [7 1 0 ... 14.8 28.5 33.5]]


## Splitting the data into training set and test set

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [9]:
print(X_train)
print(X_test)
print(y_train)
print(y_test)

[[6 1 0 90.0 53.8 81.5 36.0 62.0 43.3 14.0 25.0 29.0]
 [5 1 1 93.3 57.6 85.0 36.5 64.7 44.1 16.5 27.5 29.5]
 [2 0 1 90.7 55.9 81.0 34.0 71.5 54.0 14.6 27.0 31.5]
 [2 0 0 90.0 55.5 81.0 32.0 72.0 49.4 13.4 29.0 31.0]
 [2 0 0 88.4 57.0 83.0 36.5 68.45922330097088 40.3 15.9 27.0 30.5]
 [6 1 1 88.4 54.6 80.5 36.0 62.6 43.6 16.3 25.0 28.5]
 [7 1 1 98.5 60.7 93.0 41.5 71.7 46.8 15.0 26.0 36.0]
 [3 1 1 95.4 59.2 85.0 37.0 69.0 45.0 15.9 29.5 35.5]
 [2 0 0 89.3 54.8 82.5 35.0 71.2 52.0 13.6 28.0 31.5]
 [1 0 0 93.3 57.2 89.5 39.0 77.2 51.3 14.9 31.0 34.0]
 [1 0 0 94.0 60.0 95.5 39.0 75.4 51.9 15.5 30.0 34.0]
 [1 0 1 91.4 54.6 89.0 37.0 70.8 51.8 14.8 24.0 30.0]
 [5 1 0 91.9 56.4 87.0 38.0 65.4 44.1 13.0 27.0 34.0]
 [1 0 0 94.8 56.3 89.0 38.0 73.8 52.4 15.5 27.0 36.0]
 [1 0 1 93.8 56.8 87.0 34.5 73.2 53.0 15.3 27.0 30.0]
 [5 1 0 92.0 56.4 88.5 38.0 64.1 46.3 15.2 25.5 28.5]
 [7 1 1 89.5 56.0 81.5 36.5 66.0 46.8 14.8 23.0 27.0]
 [6 1 1 85.8 50.0 81.0 36.5 62.8 43.0 14.8 22.0 28.5]
 [6 1 1 93.8 58

## Training the Polynomial Regression Model on the training set

In [10]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X_train)
regressor = LinearRegression()
regressor.fit(X_poly, y_train)

LinearRegression()

## Predicting the test set results

In [11]:
y_pred = regressor.predict(poly_reg.transform(X_test))
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

[[0.6297512867725281 3.0]
 [16.332454468015197 7.0]
 [-0.12673832042874622 4.0]
 [4.644153329458543 2.0]
 [1.1423665125226243 1.0]
 [8.119823464338236 3.0]
 [4.503831571211133 7.0]
 [5.676031843122967 3.0]
 [10.514668322607388 4.0]
 [14.270019174315433 4.0]
 [5.918142629102743 3.0]
 [-0.7153706297051095 2.0]
 [1.3905689575190472 2.0]
 [16.23787567168419 2.0]
 [2.6059127147889676 2.0]
 [2.3536888431879497 6.0]
 [7.108799582304339 3.0]
 [-5.116037698144016 3.0]
 [3.0469992253368154 2.0]
 [-2.0798343999646995 6.0]
 [4.056848856125992 5.0]]


## Getting the final linear regression equation with the values of the coefficients

In [12]:
print(regressor.coef_)
print(regressor.intercept_)

[-8.57e-07  2.81e-07  5.13e-07 ...  7.19e-06  1.16e-05  3.87e-05]
-23.87120881231586


## Evaluationg the model performance

In [13]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

-10.217447997713725

## Conclusion: model not suitable!