In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Predict hdlgth of possum with simple linear regression

In [2]:
possum = pd.read_csv('possum.csv')

In [3]:
print(possum.head())

   case  site  Pop sex  age  hdlngth  skullw  totlngth  taill  footlgth  \
0     1     1  Vic   m  8.0     94.1    60.4      89.0   36.0      74.5   
1     2     1  Vic   f  6.0     92.5    57.6      91.5   36.5      72.5   
2     3     1  Vic   f  6.0     94.0    60.0      95.5   39.0      75.4   
3     4     1  Vic   f  6.0     93.2    57.1      92.0   38.0      76.1   
4     5     1  Vic   f  2.0     91.5    56.3      85.5   36.0      71.0   

   earconch   eye  chest  belly  
0      54.5  15.2   28.0   36.0  
1      51.2  16.0   28.5   33.0  
2      51.9  15.5   30.0   34.0  
3      52.2  15.2   28.0   34.0  
4      53.2  15.1   28.5   33.0  


In [4]:
print(possum.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   case      104 non-null    int64  
 1   site      104 non-null    int64  
 2   Pop       104 non-null    object 
 3   sex       104 non-null    object 
 4   age       102 non-null    float64
 5   hdlngth   104 non-null    float64
 6   skullw    104 non-null    float64
 7   totlngth  104 non-null    float64
 8   taill     104 non-null    float64
 9   footlgth  103 non-null    float64
 10  earconch  104 non-null    float64
 11  eye       104 non-null    float64
 12  chest     104 non-null    float64
 13  belly     104 non-null    float64
dtypes: float64(10), int64(2), object(2)
memory usage: 11.5+ KB
None


### Drop column case

In [5]:
possum.drop(columns=['case'], inplace=True)

### Rearrange columns (move hdlgth to the right)

In [6]:
possum = possum.iloc[:, [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 4]]

In [7]:
print(possum.head())

   site  Pop sex  age  skullw  totlngth  taill  footlgth  earconch   eye  \
0     1  Vic   m  8.0    60.4      89.0   36.0      74.5      54.5  15.2   
1     1  Vic   f  6.0    57.6      91.5   36.5      72.5      51.2  16.0   
2     1  Vic   f  6.0    60.0      95.5   39.0      75.4      51.9  15.5   
3     1  Vic   f  6.0    57.1      92.0   38.0      76.1      52.2  15.2   
4     1  Vic   f  2.0    56.3      85.5   36.0      71.0      53.2  15.1   

   chest  belly  hdlngth  
0   28.0   36.0     94.1  
1   28.5   33.0     92.5  
2   30.0   34.0     94.0  
3   28.0   34.0     93.2  
4   28.5   33.0     91.5  


### Split the dataset into training and test set

In [8]:
possum_train, possum_test = train_test_split(possum, test_size=0.2, random_state=1)

In [9]:
print(possum_train)

    site    Pop sex  age  skullw  totlngth  taill  footlgth  earconch   eye  \
82     6  other   f  3.0    53.8      81.5   36.0      62.0      43.3  14.0   
66     5  other   m  4.0    57.6      85.0   36.5      64.7      44.1  16.5   
44     2    Vic   m  3.0    55.9      81.0   34.0      71.5      54.0  14.6   
42     2    Vic   f  2.0    55.5      81.0   32.0      72.0      49.4  13.4   
40     2    Vic   f  5.0    57.0      83.0   36.5       NaN      40.3  15.9   
..   ...    ...  ..  ...     ...       ...    ...       ...       ...   ...   
75     6  other   m  5.0    56.8      89.0   41.0      64.5      46.4  17.8   
9      1    Vic   f  6.0    58.0      89.5   37.5      70.9      53.4  14.4   
72     5  other   m  1.0    52.3      82.0   36.5      65.7      44.7  16.0   
12     1    Vic   m  5.0    59.9      89.5   36.0      71.0      49.8  15.8   
37     2    Vic   m  7.0    56.0      80.5   35.5      68.4      49.5  15.9   

    chest  belly  hdlngth  
82   25.0   29.0     90

In [10]:
print(possum_test)

     site    Pop sex  age  skullw  totlngth  taill  footlgth  earconch   eye  \
96      7  other   m  3.0    53.1      86.0   38.0      63.8      46.0  14.5   
35      2    Vic   m  7.0    59.3      88.0   35.0      74.3      52.0  14.9   
81      6  other   f  4.0    54.0      82.0   36.5      60.7      42.9  15.4   
59      4  other   f  2.0    57.7      88.0   39.0      63.1      47.0  14.4   
38      2    Vic   f  1.0    51.5      75.0   34.0      68.7      53.4  13.0   
39      2    Vic   f  3.0    55.0      84.5   36.0      72.8      51.4  13.6   
53      4  other   m  7.0    63.0      91.5   43.0      71.3      46.0  17.5   
85      6  other   f  3.0    53.2      86.5   38.5      60.3      43.7  13.6   
31      1    Vic   f  4.0    56.7      94.0   39.0      74.8      52.0  14.9   
56      4  other   f  4.0    59.4      93.0   41.0      67.2      45.3  14.5   
98      7  other   f  3.0    56.2      86.5   38.5      64.8      43.8  14.0   
46      3  other   m  2.0    54.8      8

### Handle missing values in train and test set

#### training set

In [11]:
print(possum_train.isna().sum())

site        0
Pop         0
sex         0
age         2
skullw      0
totlngth    0
taill       0
footlgth    1
earconch    0
eye         0
chest       0
belly       0
hdlngth     0
dtype: int64


#### calculate threshold for missing data

In [12]:
treshold_train = 0.05 * len(possum_train)
print(treshold_train)

4.15


As the number of missing values in both columns age and footlgth is below the treshold we can drop these observation.

#### drop missing data in training set

In [13]:
possum_train.dropna(axis=0, subset=['age', 'footlgth'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [14]:
print(possum_train.isna().sum())

site        0
Pop         0
sex         0
age         0
skullw      0
totlngth    0
taill       0
footlgth    0
earconch    0
eye         0
chest       0
belly       0
hdlngth     0
dtype: int64


#### test set

In [15]:
print(possum_test.isna().sum())

site        0
Pop         0
sex         0
age         0
skullw      0
totlngth    0
taill       0
footlgth    0
earconch    0
eye         0
chest       0
belly       0
hdlngth     0
dtype: int64


In test set we have no missing values to handle.

### Get X and y values for train and test set

In [16]:
X_train = possum_train.iloc[:, :-1].values
y_train = possum_train.iloc[:, -1].values

In [17]:
print(X_train)
print(y_train)

[[6 'other' 'f' 3.0 53.8 81.5 36.0 62.0 43.3 14.0 25.0 29.0]
 [5 'other' 'm' 4.0 57.6 85.0 36.5 64.7 44.1 16.5 27.5 29.5]
 [2 'Vic' 'm' 3.0 55.9 81.0 34.0 71.5 54.0 14.6 27.0 31.5]
 [2 'Vic' 'f' 2.0 55.5 81.0 32.0 72.0 49.4 13.4 29.0 31.0]
 [6 'other' 'm' 3.0 54.6 80.5 36.0 62.6 43.6 16.3 25.0 28.5]
 [7 'other' 'm' 2.0 60.7 93.0 41.5 71.7 46.8 15.0 26.0 36.0]
 [3 'other' 'm' 4.0 59.2 85.0 37.0 69.0 45.0 15.9 29.5 35.5]
 [2 'Vic' 'f' 2.0 54.8 82.5 35.0 71.2 52.0 13.6 28.0 31.5]
 [1 'Vic' 'f' 9.0 57.2 89.5 39.0 77.2 51.3 14.9 31.0 34.0]
 [1 'Vic' 'f' 6.0 60.0 95.5 39.0 75.4 51.9 15.5 30.0 34.0]
 [1 'Vic' 'm' 3.0 54.6 89.0 37.0 70.8 51.8 14.8 24.0 30.0]
 [5 'other' 'f' 7.0 56.4 87.0 38.0 65.4 44.1 13.0 27.0 34.0]
 [1 'Vic' 'f' 4.0 56.3 89.0 38.0 73.8 52.4 15.5 27.0 36.0]
 [1 'Vic' 'm' 4.0 56.8 87.0 34.5 73.2 53.0 15.3 27.0 30.0]
 [5 'other' 'f' 6.0 56.4 88.5 38.0 64.1 46.3 15.2 25.5 28.5]
 [7 'other' 'm' 1.0 56.0 81.5 36.5 66.0 46.8 14.8 23.0 27.0]
 [6 'other' 'm' 1.0 50.0 81.0 36.5 62.8 

In [18]:
X_test = possum_test.iloc[:, :-1].values
y_test = possum_test.iloc[:, -1].values

In [19]:
print(X_test)
print(y_test)

[[7 'other' 'm' 3.0 53.1 86.0 38.0 63.8 46.0 14.5 25.0 31.5]
 [2 'Vic' 'm' 7.0 59.3 88.0 35.0 74.3 52.0 14.9 25.5 36.0]
 [6 'other' 'f' 4.0 54.0 82.0 36.5 60.7 42.9 15.4 26.0 32.0]
 [4 'other' 'f' 2.0 57.7 88.0 39.0 63.1 47.0 14.4 26.0 30.0]
 [2 'Vic' 'f' 1.0 51.5 75.0 34.0 68.7 53.4 13.0 25.0 25.0]
 [2 'Vic' 'f' 3.0 55.0 84.5 36.0 72.8 51.4 13.6 27.0 30.0]
 [4 'other' 'm' 7.0 63.0 91.5 43.0 71.3 46.0 17.5 30.0 36.5]
 [6 'other' 'f' 3.0 53.2 86.5 38.5 60.3 43.7 13.6 26.0 31.0]
 [1 'Vic' 'f' 4.0 56.7 94.0 39.0 74.8 52.0 14.9 28.0 34.0]
 [4 'other' 'f' 4.0 59.4 93.0 41.0 67.2 45.3 14.5 31.0 39.0]
 [7 'other' 'f' 3.0 56.2 86.5 38.5 64.8 43.8 14.0 28.0 35.0]
 [3 'other' 'm' 2.0 54.8 89.0 37.5 66.0 45.5 15.0 25.0 33.0]
 [7 'other' 'm' 2.0 56.3 85.0 37.0 67.6 46.8 14.5 25.5 31.0]
 [4 'other' 'm' 2.0 62.8 96.0 40.0 73.2 44.5 14.7 32.0 36.0]
 [2 'Vic' 'm' 2.0 55.7 85.5 36.5 73.1 53.1 14.4 26.0 28.5]
 [7 'other' 'm' 6.0 58.4 84.5 35.0 64.4 46.2 14.4 29.0 30.5]
 [7 'other' 'f' 3.0 59.9 89.0 40.0

#### Encode categorical data in train and test set

In [20]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), [0, 1, 2])], remainder='passthrough')
ct.fit(X_train)
X_train = ct.transform(X_train)

In [21]:
print(X_train)

[[0.0 0.0 0.0 ... 14.0 25.0 29.0]
 [0.0 0.0 0.0 ... 16.5 27.5 29.5]
 [0.0 1.0 0.0 ... 14.6 27.0 31.5]
 ...
 [0.0 0.0 0.0 ... 16.0 23.5 28.0]
 [1.0 0.0 0.0 ... 15.8 27.0 32.0]
 [0.0 1.0 0.0 ... 15.9 27.0 30.0]]


In [22]:
X_test = ct.transform(X_test)

In [23]:
print(X_test)

[[0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 3.0 53.1 86.0 38.0 63.8
  46.0 14.5 25.0 31.5]
 [0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 7.0 59.3 88.0 35.0 74.3
  52.0 14.9 25.5 36.0]
 [0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 4.0 54.0 82.0 36.5 60.7
  42.9 15.4 26.0 32.0]
 [0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 2.0 57.7 88.0 39.0 63.1
  47.0 14.4 26.0 30.0]
 [0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 51.5 75.0 34.0 68.7
  53.4 13.0 25.0 25.0]
 [0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 3.0 55.0 84.5 36.0 72.8
  51.4 13.6 27.0 30.0]
 [0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 7.0 63.0 91.5 43.0 71.3
  46.0 17.5 30.0 36.5]
 [0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 3.0 53.2 86.5 38.5 60.3
  43.7 13.6 26.0 31.0]
 [1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 4.0 56.7 94.0 39.0 74.8
  52.0 14.9 28.0 34.0]
 [0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 4.0 59.4 93.0 41.0 67.2
  45.3 14.5 31.0 39.0]
 [0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 3.0 56.2 86.5 38.5 64.8


### Build the linear model

In [24]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

### Predict the test set

In [25]:
y_pred = regressor.predict(X_test)

### Compare y_test with y_pred and calculate r2_score

In [26]:
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[ 90.96131144  91.        ]
 [ 96.48975138  93.3       ]
 [ 87.86577821  86.        ]
 [ 92.54423073  91.3       ]
 [ 83.58344704  84.7       ]
 [ 89.38326709  91.        ]
 [ 99.63438905  96.9       ]
 [ 87.2196191   88.2       ]
 [ 94.04958682  94.3       ]
 [ 96.96225629  95.1       ]
 [ 91.07108287  93.3       ]
 [ 93.33188015  90.1       ]
 [ 92.35858413  90.7       ]
 [100.14739326 102.5       ]
 [ 90.87471477  90.6       ]
 [ 93.03938091  97.7       ]
 [ 93.16505111  93.6       ]
 [ 92.00258104  90.6       ]
 [ 94.26248964  93.5       ]
 [ 90.79142903  92.4       ]
 [ 93.55803772  94.1       ]]


In [27]:
print(r2_score(y_test, y_pred))

0.7316072303888254


### Print Coefficients

In [28]:
print(regressor.coef_)

[-0.80385246 -0.4869573  -0.00353835  1.90797346  0.06067321 -0.86448787
  0.19018932 -1.29080977  1.29080977 -0.4523902   0.4523902   0.16630072
  0.2388209   0.36948482 -0.42811745  0.19948739  0.18813432  0.49620825
 -0.06290524  0.27278609]
