In [1]:
import pandas as pd 
import numpy as np 
from sklearn .pipeline import Pipeline
from sklearn .model_selection import train_test_split
from sklearn .preprocessing import OrdinalEncoder,OneHotEncoder,StandardScaler,MinMaxScaler
from sklearn .impute import SimpleImputer
from sklearn .compose import ColumnTransformer
import seaborn as sns 
from sklearn .linear_model import LogisticRegression
from sklearn .feature_selection import SelectKBest,chi2

In [2]:
data=sns.load_dataset("penguins")

In [3]:
data=pd.DataFrame(data)

In [4]:
data.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [5]:
data.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [7]:
data.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
count,342.0,342.0,342.0,342.0
mean,43.92193,17.15117,200.915205,4201.754386
std,5.459584,1.974793,14.061714,801.954536
min,32.1,13.1,172.0,2700.0
25%,39.225,15.6,190.0,3550.0
50%,44.45,17.3,197.0,4050.0
75%,48.5,18.7,213.0,4750.0
max,59.6,21.5,231.0,6300.0


In [8]:
data.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

# use transformations 

In [10]:
data['species'].value_counts()

species
Adelie       152
Gentoo       124
Chinstrap     68
Name: count, dtype: int64

In [11]:
data["species"]=data["species"].replace({"Chinstrap":"Adelie"})

In [12]:
data["species"].value_counts()

species
Adelie    220
Gentoo    124
Name: count, dtype: int64

In [13]:
x=data.iloc[:,1:]
y=data.iloc[:,0]

In [14]:
x.head(2)

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Torgersen,39.5,17.4,186.0,3800.0,Female


In [15]:
y.head(2)

0    Adelie
1    Adelie
Name: species, dtype: object

In [16]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.2,random_state=42)

In [17]:
x_train

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
66,Biscoe,35.5,16.2,195.0,3350.0,Female
229,Biscoe,46.8,15.4,215.0,5150.0,Male
7,Torgersen,39.2,19.6,195.0,4675.0,Male
140,Dream,40.2,17.1,193.0,3400.0,Female
323,Biscoe,49.1,15.0,228.0,5500.0,Male
...,...,...,...,...,...,...
188,Dream,47.6,18.3,195.0,3850.0,Female
71,Torgersen,39.7,18.4,190.0,3900.0,Male
106,Biscoe,38.6,17.2,199.0,3750.0,Female
270,Biscoe,46.6,14.2,210.0,4850.0,Female


In [18]:
y_train

66     Adelie
229    Gentoo
7      Adelie
140    Adelie
323    Gentoo
        ...  
188    Adelie
71     Adelie
106    Adelie
270    Gentoo
102    Adelie
Name: species, Length: 275, dtype: object

In [19]:
data.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [20]:
x_train.head(1)

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
66,Biscoe,35.5,16.2,195.0,3350.0,Female


In [21]:
data["island"].value_counts()

island
Biscoe       168
Dream        124
Torgersen     52
Name: count, dtype: int64

In [22]:
data["sex"].value_counts()

sex
Male      168
Female    165
Name: count, dtype: int64

In [23]:
step_1=ColumnTransformer([
    ("bill_length_mm",SimpleImputer(),[1]),
    ("bill_depth_mm",SimpleImputer(),[2]),
    ("flipper_length_mm",SimpleImputer(),[3]),
    ("body_mass_g",SimpleImputer(),[4]),
    ("sex",SimpleImputer(strategy="most_frequent"),[5]),
],remainder="passthrough")

In [24]:
step_2=ColumnTransformer([

("sex_island",OneHotEncoder(handle_unknown="ignore",sparse_output=False),[0,5])],remainder="passthrough")

In [25]:
step_3=ColumnTransformer([
    ("MinMaxScaler",MinMaxScaler(),slice(0,11))
])

In [26]:
step_4=SelectKBest(score_func=chi2,k=7)

In [27]:
step_5=LogisticRegression()

# use piplines 

In [29]:
pip=Pipeline([
    ("step_1",step_1),
    ("step_2",step_2),
    ("step_3",step_3),
    ("step_4",step_4),
    ("step_5",step_5)
])

In [30]:
pip.fit(x_train,y_train)

0,1,2
,steps,"[('step_1', ...), ('step_2', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('bill_length_mm', ...), ('bill_depth_mm', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('sex_island', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,transformers,"[('MinMaxScaler', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,score_func,<function chi...0024F269D65C0>
,k,7

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [31]:
pred=pip.predict(x_test)

In [32]:
pred

array(['Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Adelie'], dtype=object)

In [33]:
from sklearn .metrics import accuracy_score

In [34]:
score=accuracy_score(y_test,pred)

In [35]:
score

0.6956521739130435

# cross validations 

In [37]:
from sklearn.model_selection import cross_val_score

In [38]:
cv= cross_val_score(pip,x_train,y_train,cv=10)

In [39]:
cv.mean()

0.6255291005291006