# Possum - Age Prediction with Multiple Linear Regression

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Load dataset

In [2]:
df = pd.read_csv('possum.csv')
df.head()

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,1,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,2,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,3,1,Vic,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,4,1,Vic,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,5,1,Vic,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0


In [3]:
X_total = df.iloc[:, 1:].values
print(X_total)

[[1 'Vic' 'm' ... 15.2 28.0 36.0]
 [1 'Vic' 'f' ... 16.0 28.5 33.0]
 [1 'Vic' 'f' ... 15.5 30.0 34.0]
 ...
 [7 'other' 'f' ... 13.0 25.0 30.0]
 [7 'other' 'm' ... 15.4 25.0 29.0]
 [7 'other' 'f' ... 14.8 28.5 33.5]]


## Preprocessing - taking care of missing data

In [4]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X_total[:, 3:])
X_total[:, 3:] = imputer.transform(X_total[:, 3:])
print(X_total)

[[1 'Vic' 'm' ... 15.2 28.0 36.0]
 [1 'Vic' 'f' ... 16.0 28.5 33.0]
 [1 'Vic' 'f' ... 15.5 30.0 34.0]
 ...
 [7 'other' 'f' ... 13.0 25.0 30.0]
 [7 'other' 'm' ... 15.4 25.0 29.0]
 [7 'other' 'f' ... 14.8 28.5 33.5]]


## Preprocessing - reorder columns

In [5]:
# move the column age to the right as it should be the independent column
X_total = X_total[:, [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 3]]
print(X_total)

[[1 'Vic' 'm' ... 28.0 36.0 8.0]
 [1 'Vic' 'f' ... 28.5 33.0 6.0]
 [1 'Vic' 'f' ... 30.0 34.0 6.0]
 ...
 [7 'other' 'f' ... 25.0 30.0 6.0]
 [7 'other' 'm' ... 25.0 29.0 4.0]
 [7 'other' 'f' ... 28.5 33.5 3.0]]


## Select dependent and independent variables

In [6]:
X = X_total[:, 0:-1]
y = X_total[:, -1]
print(X)
print(y)

[[1 'Vic' 'm' ... 15.2 28.0 36.0]
 [1 'Vic' 'f' ... 16.0 28.5 33.0]
 [1 'Vic' 'f' ... 15.5 30.0 34.0]
 ...
 [7 'other' 'f' ... 13.0 25.0 30.0]
 [7 'other' 'm' ... 15.4 25.0 29.0]
 [7 'other' 'f' ... 14.8 28.5 33.5]]
[8.0 6.0 6.0 6.0 2.0 1.0 2.0 6.0 9.0 6.0 9.0 5.0 5.0 3.0 5.0 4.0 1.0 2.0
 5.0 4.0 3.0 3.0 4.0 2.0 3.0 7.0 2.0 4.0 3.0 2.0 3.0 4.0 3.0 2.0 4.0 7.0
 2.0 7.0 1.0 3.0 5.0 3.0 2.0 3.8333333333333335 3.0 3.8333333333333335 2.0
 5.0 4.0 5.0 5.0 6.0 3.0 7.0 2.0 3.0 4.0 3.0 2.0 2.0 7.0 3.0 6.0 3.0 5.0
 3.0 4.0 5.0 5.0 7.0 6.0 1.0 1.0 4.0 6.0 5.0 6.0 1.0 1.0 1.0 3.0 4.0 3.0
 3.0 3.0 3.0 2.0 2.0 6.0 3.0 3.0 2.0 3.0 7.0 4.0 4.0 3.0 5.0 3.0 1.0 1.0
 6.0 4.0 3.0]


## Preprocessing - encode categorical data

In [7]:
# encode the columns Pop and sex (binary categorical data - we only need LabelEncoder)
from sklearn.preprocessing import LabelEncoder
le_Pop = LabelEncoder()
X[:, 1] = le_Pop.fit_transform(X[:, 1])
le_sex = LabelEncoder()
X[:, 2] = le_sex.fit_transform(X[:, 2])
print(X)

#encode the column site
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
print(X)

[[1 0 1 ... 15.2 28.0 36.0]
 [1 0 0 ... 16.0 28.5 33.0]
 [1 0 0 ... 15.5 30.0 34.0]
 ...
 [7 1 0 ... 13.0 25.0 30.0]
 [7 1 1 ... 15.4 25.0 29.0]
 [7 1 0 ... 14.8 28.5 33.5]]
[[1.0 0.0 0.0 ... 15.2 28.0 36.0]
 [1.0 0.0 0.0 ... 16.0 28.5 33.0]
 [1.0 0.0 0.0 ... 15.5 30.0 34.0]
 ...
 [0.0 0.0 0.0 ... 13.0 25.0 30.0]
 [0.0 0.0 0.0 ... 15.4 25.0 29.0]
 [0.0 0.0 0.0 ... 14.8 28.5 33.5]]


## Splitting the data into training set and test set

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [9]:
print(X_train)
print(X_test)
print(y_train)
print(y_test)

[[0.0 0.0 0.0 ... 14.0 25.0 29.0]
 [0.0 0.0 0.0 ... 16.5 27.5 29.5]
 [0.0 1.0 0.0 ... 14.6 27.0 31.5]
 ...
 [0.0 0.0 0.0 ... 16.0 23.5 28.0]
 [1.0 0.0 0.0 ... 15.8 27.0 32.0]
 [0.0 1.0 0.0 ... 15.9 27.0 30.0]]
[[0.0 0.0 0.0 0.0 0.0 0.0 1.0 1 1 91.0 53.1 86.0 38.0 63.8 46.0 14.5 25.0
  31.5]
 [0.0 1.0 0.0 0.0 0.0 0.0 0.0 0 1 93.3 59.3 88.0 35.0 74.3 52.0 14.9 25.5
  36.0]
 [0.0 0.0 0.0 0.0 0.0 1.0 0.0 1 0 86.0 54.0 82.0 36.5 60.7 42.9 15.4 26.0
  32.0]
 [0.0 0.0 0.0 1.0 0.0 0.0 0.0 1 0 91.3 57.7 88.0 39.0 63.1 47.0 14.4 26.0
  30.0]
 [0.0 1.0 0.0 0.0 0.0 0.0 0.0 0 0 84.7 51.5 75.0 34.0 68.7 53.4 13.0 25.0
  25.0]
 [0.0 1.0 0.0 0.0 0.0 0.0 0.0 0 0 91.0 55.0 84.5 36.0 72.8 51.4 13.6 27.0
  30.0]
 [0.0 0.0 0.0 1.0 0.0 0.0 0.0 1 1 96.9 63.0 91.5 43.0 71.3 46.0 17.5 30.0
  36.5]
 [0.0 0.0 0.0 0.0 0.0 1.0 0.0 1 0 88.2 53.2 86.5 38.5 60.3 43.7 13.6 26.0
  31.0]
 [1.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 94.3 56.7 94.0 39.0 74.8 52.0 14.9 28.0
  34.0]
 [0.0 0.0 0.0 1.0 0.0 0.0 0.0 1 0 95.1 59.4 93.0 41.

## Training the Multiple Linear Regression Model on the training set

In [10]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

## Predicting the test set results

In [11]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

[[2.863383105404516 3.0]
 [3.45933374410939 7.0]
 [3.5193730625432877 4.0]
 [1.3091318758314578 2.0]
 [2.1295012583171093 1.0]
 [3.4795213416615702 3.0]
 [4.145813169892325 7.0]
 [3.5839693611138372 3.0]
 [4.5726867166727665 4.0]
 [3.540226384741981 4.0]
 [4.3866754647952675 3.0]
 [2.5930911381594335 2.0]
 [2.3832724832920817 2.0]
 [3.4379746083598057 2.0]
 [3.1389328608128935 2.0]
 [4.187515786326433 6.0]
 [4.602157099490842 3.0]
 [4.599463603368875 3.0]
 [3.7880615533148188 2.0]
 [2.668285223115351 6.0]
 [4.651682809429854 5.0]]


## Getting the final linear regression equation with the values of the coefficients

In [12]:
print(regressor.coef_)
print(regressor.intercept_)

[ 0.61  0.64  0.16 -2.28  0.61  0.3  -0.03 -1.25 -0.09  0.13  0.05 -0.08
  0.28 -0.13 -0.03  0.19  0.26  0.07]
-16.47773534343975


## Evaluationg the model performance

In [13]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.14413852697480034

## Conclusion: model not suitable!