In [279]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, r2_score

In [280]:
df=pd.read_csv('penguins.csv')

In [281]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [282]:
df.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
count,342.0,342.0,342.0,342.0
mean,43.92193,17.15117,200.915205,4201.754386
std,5.459584,1.974793,14.061714,801.954536
min,32.1,13.1,172.0,2700.0
25%,39.225,15.6,190.0,3550.0
50%,44.45,17.3,197.0,4050.0
75%,48.5,18.7,213.0,4750.0
max,59.6,21.5,231.0,6300.0


In [283]:
df.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [284]:
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [285]:
df=df.dropna(thresh=len(df.columns)-3)

In [286]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 342 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            342 non-null    object 
 1   island             342 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 21.4+ KB


In [287]:
df.fillna({'bill_length_mm': df['bill_length_mm'].median(), 'Another Column': 0}, inplace=True)

In [288]:
df.fillna({'bill_depth_mm': df['bill_depth_mm'].median(), 'Another Column': 0}, inplace=True)

In [289]:
df.fillna({'flipper_length_mm': df['flipper_length_mm'].median(), 'Another Column': 0}, inplace=True)

In [290]:
df.fillna({'body_mass_g': df['body_mass_g'].median(), 'Another Column': 0}, inplace=True)

In [291]:
sex_mode= df['sex'].mode()[0]
df['sex'].fillna(sex_mode, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sex'].fillna(sex_mode, inplace=True)


In [292]:
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE


In [293]:
df['species'].nunique()

3

In [294]:
df['island'].nunique()

3

In [295]:
df['sex'].nunique()

2

In [296]:
label_encoder = LabelEncoder()
columns_to_encode = ['species', 'island', 'sex']
for col in columns_to_encode:
    df[col] = label_encoder.fit_transform(df[col])

In [297]:
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,2,39.1,18.7,181.0,3750.0,1
1,0,2,39.5,17.4,186.0,3800.0,0
2,0,2,40.3,18.0,195.0,3250.0,0
4,0,2,36.7,19.3,193.0,3450.0,0
5,0,2,39.3,20.6,190.0,3650.0,1


In [298]:
x=df.drop(columns= ['species'])
y=df['species']

In [299]:
print(x)
print(y)
print(x.shape)
print(y.shape)

     island  bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  \
0         2            39.1           18.7              181.0       3750.0   
1         2            39.5           17.4              186.0       3800.0   
2         2            40.3           18.0              195.0       3250.0   
4         2            36.7           19.3              193.0       3450.0   
5         2            39.3           20.6              190.0       3650.0   
..      ...             ...            ...                ...          ...   
338       0            47.2           13.7              214.0       4925.0   
340       0            46.8           14.3              215.0       4850.0   
341       0            50.4           15.7              222.0       5750.0   
342       0            45.2           14.8              212.0       5200.0   
343       0            49.9           16.1              213.0       5400.0   

     sex  
0      1  
1      0  
2      0  
4      0  
5      1

In [300]:
x_train,x_test,y_train,y_test=train_test_split(x, y, test_size=0.3, random_state=42)

Decision Tree

In [302]:
from sklearn.tree import DecisionTreeClassifier
des_tree = DecisionTreeClassifier(random_state=42)
des_tree.fit(x_train, y_train)

In [303]:
tree_pred = des_tree.predict(x_test)
tree_accuracy = accuracy_score(y_test, tree_pred)
print(f'Accuracy Score: {tree_accuracy:.4f}')
tree_report= classification_report(y_test, tree_pred)
tree_conf_matrix= confusion_matrix(y_test, tree_pred)
print(f'report: {tree_report}')
print(f'conf_matrix: {tree_conf_matrix}')

Accuracy Score: 0.9515
report:               precision    recall  f1-score   support

           0       1.00      0.90      0.95        50
           1       0.95      1.00      0.97        18
           2       0.90      1.00      0.95        35

    accuracy                           0.95       103
   macro avg       0.95      0.97      0.96       103
weighted avg       0.96      0.95      0.95       103

conf_matrix: [[45  1  4]
 [ 0 18  0]
 [ 0  0 35]]


random forest

In [305]:
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10)
random_forest.fit(x_train, y_train)

In [306]:
ran_pred = random_forest.predict(x_test)
ran_accuracy = accuracy_score(y_test, ran_pred)
ran_report= classification_report(y_test, ran_pred)
ran_conf_matrix= confusion_matrix(y_test, ran_pred)
print(f'Accuracy Score: {ran_accuracy:.4f}')
print(f'report: {ran_report}')
print(f'conf_matrix: {ran_conf_matrix}')

Accuracy Score: 0.9709
report:               precision    recall  f1-score   support

           0       1.00      0.94      0.97        50
           1       0.86      1.00      0.92        18
           2       1.00      1.00      1.00        35

    accuracy                           0.97       103
   macro avg       0.95      0.98      0.96       103
weighted avg       0.98      0.97      0.97       103

conf_matrix: [[47  3  0]
 [ 0 18  0]
 [ 0  0 35]]


In [307]:
from sklearn.svm import SVC
svc_model=SVC(kernel='linear')
svc_model.fit(x_train,y_train)

In [308]:
svc_pred=svc_model.predict(x_test)
svc_accuracy=accuracy_score(y_test, svc_pred)
svc_report= classification_report(y_test, svc_pred)
svc_conf_matrix= confusion_matrix(y_test, svc_pred)
print(f'Accuracy Score: {svc_accuracy:.4f}')
print(f'report: {ran_report}')
print(f'conf_matrix: {ran_conf_matrix}')


Accuracy Score: 0.9903
report:               precision    recall  f1-score   support

           0       1.00      0.94      0.97        50
           1       0.86      1.00      0.92        18
           2       1.00      1.00      1.00        35

    accuracy                           0.97       103
   macro avg       0.95      0.98      0.96       103
weighted avg       0.98      0.97      0.97       103

conf_matrix: [[47  3  0]
 [ 0 18  0]
 [ 0  0 35]]


In [309]:
from sklearn.naive_bayes import GaussianNB
Bayes_model = GaussianNB()
Bayes_model.fit(x_train, y_train)

In [310]:
bays_pred = Bayes_model.predict(x_test)

bays_accuracy = accuracy_score(y_test, bays_pred)
print(f'Accuracy Score: {bays_accuracy:.4f}')
print(classification_report(y_test, bays_pred))
print(confusion_matrix(y_test, bays_pred))

Accuracy Score: 0.9612
              precision    recall  f1-score   support

           0       1.00      0.92      0.96        50
           1       0.82      1.00      0.90        18
           2       1.00      1.00      1.00        35

    accuracy                           0.96       103
   macro avg       0.94      0.97      0.95       103
weighted avg       0.97      0.96      0.96       103

[[46  4  0]
 [ 0 18  0]
 [ 0  0 35]]
