In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# import warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
df =  pd.read_csv('possum.csv')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   case      104 non-null    int64  
 1   site      104 non-null    int64  
 2   Pop       104 non-null    object 
 3   sex       104 non-null    object 
 4   age       102 non-null    float64
 5   hdlngth   104 non-null    float64
 6   skullw    104 non-null    float64
 7   totlngth  104 non-null    float64
 8   taill     104 non-null    float64
 9   footlgth  103 non-null    float64
 10  earconch  104 non-null    float64
 11  eye       104 non-null    float64
 12  chest     104 non-null    float64
 13  belly     104 non-null    float64
dtypes: float64(10), int64(2), object(2)
memory usage: 11.5+ KB


In [None]:
df[df['age'].isnull()]

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
43,44,2,Vic,m,,85.1,51.5,76.0,35.5,70.3,52.6,14.4,23.0,27.0
45,46,2,Vic,m,,91.4,54.4,84.0,35.0,72.8,51.2,14.4,24.5,35.0


In [None]:
df['age'].fillna(df[df['sex']=='m']['age'].median(), inplace=True)

In [None]:
df[df['footlgth'].isnull()]

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
40,41,2,Vic,f,5.0,88.4,57.0,83.0,36.5,,40.3,15.9,27.0,30.5


In [None]:
df['footlgth'].fillna(df[df['sex']=='f']['footlgth'].median(), inplace = True)

In [None]:
df['Pop'].unique()

array(['Vic', 'other'], dtype=object)

In [None]:
df['Pop'] = df.Pop.map({'Vic':1,'other':0})

In [None]:
df['sex'].unique()

array(['m', 'f'], dtype=object)

In [None]:
df['sex'] = df.sex.map({'m':1,'f':0})

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   case      104 non-null    int64  
 1   site      104 non-null    int64  
 2   Pop       104 non-null    int64  
 3   sex       104 non-null    int64  
 4   age       104 non-null    float64
 5   hdlngth   104 non-null    float64
 6   skullw    104 non-null    float64
 7   totlngth  104 non-null    float64
 8   taill     104 non-null    float64
 9   footlgth  104 non-null    float64
 10  earconch  104 non-null    float64
 11  eye       104 non-null    float64
 12  chest     104 non-null    float64
 13  belly     104 non-null    float64
dtypes: float64(10), int64(4)
memory usage: 11.5 KB


In [None]:
X = df.drop('footlgth',axis = 1)
Y = df['footlgth']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

Without scalling

In [None]:
SVR_model = SVR()
cross_val = cross_val_score(SVR_model, X_train, Y_train, cv=5)
print("The train score is:", cross_val.mean())
SVR_model.fit(X_train,Y_train)
y_pred = SVR_model.predict(X_test)
print("The test score is:", SVR_model.score(X_test,Y_test))

The train score is: 0.5619053983054147
The test score is: 0.6347178305659376


In [None]:
grid = {
    'C':[0.1,1,10,100],
    'gamma':[1,0.1,0.01,0.001],
    'kernel':['linear','poly','sigmoid','rbf','laplacian'],
    'degree':[1,2,3,4,5]
}
SVR_model = SVR()
cross_grid = GridSearchCV(SVR_model,grid,cv=3)
cross_grid.fit(X_train,Y_train)
print(cross_grid.best_params_)
print(cross_grid.best_score_)

{'C': 100, 'degree': 1, 'gamma': 1, 'kernel': 'linear'}
0.6717942864859411


In [None]:
def scaling(data, scaler):
  scaler = scaler()
  scaler.fit(data)
  return scaler.transform(data)

In [None]:
X_train = scaling(X_train, MinMaxScaler)
X_test = scaling(X_test, MinMaxScaler)
SVR_model = SVR()
cross_val = cross_val_score(SVR_model, X_train, Y_train, cv=5)
print("The train score is:", cross_val.mean())
SVR_model.fit(X_train,Y_train)
y_pred = SVR_model.predict(X_test)
print("The test score is:", SVR_model.score(X_test,Y_test))

The train score is: 0.6666815966574409
The test score is: 0.6611764615344866


In [None]:
grid = {
    'C':[0.1,1,10,100],
    'gamma':[1,0.1,0.01,0.001],
    'kernel':['linear','poly','sigmoid','rbf','laplacian'],
    'degree':[1,2,3,4,5]
}
SVR_model = SVR()
cross_grid = GridSearchCV(SVR_model,grid,cv=3)
cross_grid.fit(X_train,Y_train)
print(cross_grid.best_params_)
print(cross_grid.best_score_)

{'C': 100, 'degree': 1, 'gamma': 0.1, 'kernel': 'poly'}
0.7510384565694578


240 fits failed out of a total of 1200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 96, in validate_parameter_constraints
    raise InvalidParameterError(
s

In [None]:
X_train = scaling(X_train,StandardScaler)
X_test = scaling(X_test, StandardScaler)
SVR_model = SVR()
cross_val = cross_val_score(SVR_model, X_train, Y_train, cv=5)
print("The train score is:", cross_val.mean())
SVR_model.fit(X_train,Y_train)
y_pred = SVR_model.predict(X_test)
print("The test score is:", SVR_model.score(X_test,Y_test))

The train score is: 0.6221077173726448
The test score is: 0.643688300500222


In [None]:
grid = {
    'C':[0.1,1,10,100],
    'gamma':[1,0.1,0.01,0.001],
    'kernel':['linear','poly','sigmoid','rbf','laplacian'],
    'degree':[1,2,3,4,5]
}
SVR_model = SVR()
cross_grid = GridSearchCV(SVR_model,grid,cv=3)
cross_grid.fit(X_train,Y_train)
print(cross_grid.best_params_)
print(cross_grid.best_score_)

{'C': 100, 'degree': 1, 'gamma': 0.01, 'kernel': 'sigmoid'}
0.7373286662090917


240 fits failed out of a total of 1200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 96, in validate_parameter_constraints
    raise InvalidParameterError(
s