In [76]:
# import libraries
import pandas as pd

from sklearn.ensemble import RandomForestClassifier

import pickle # to save model later

### Data exploration


In [38]:
penguins = pd.read_csv("/content/drive/MyDrive/all abt data/streamlit/000_penguinclassification/penguins_cleaned.csv")

In [39]:
penguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181,3750,male
1,Adelie,Torgersen,39.5,17.4,186,3800,female
2,Adelie,Torgersen,40.3,18.0,195,3250,female
3,Adelie,Torgersen,36.7,19.3,193,3450,female
4,Adelie,Torgersen,39.3,20.6,190,3650,male


In [65]:
# .shape returns (rows, columns)
print(f"Shape (Rows,Columns) -> {penguins.shape}")

print("-+"*50)

# describe the columns (variable)
print(penguins.info())

print("-+"*50)

# first 5 rows of data
penguins.head()

Shape (Rows,Columns) -> (333, 7)
-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333 entries, 0 to 332
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            333 non-null    object 
 1   island             333 non-null    object 
 2   bill_length_mm     333 non-null    float64
 3   bill_depth_mm      333 non-null    float64
 4   flipper_length_mm  333 non-null    int64  
 5   body_mass_g        333 non-null    int64  
 6   sex                333 non-null    object 
dtypes: float64(2), int64(2), object(3)
memory usage: 18.3+ KB
None
-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181,3750,male
1,Adelie,Torgersen,39.5,17.4,186,3800,female
2,Adelie,Torgersen,40.3,18.0,195,3250,female
3,Adelie,Torgersen,36.7,19.3,193,3450,female
4,Adelie,Torgersen,39.3,20.6,190,3650,male


#### This dataset is already cleaned for this demo

In [51]:
 # copy the dataset for backup
 df = penguins.copy()

#### Ordinal feature encoding

Need to one-hot encode categorical values like `sex`, `island`, `species`.

Encoding is just a fancy word that means you convert the strings to numbers. Like instead of `Boy` or `Girl`, setting it as `0` and `1` would be way easier!

In [52]:
# basically renames it in a way after encoding it
# pd.get_dummies(df[encode], prefix ={"sex": "sx", "island" : "il"} )

In [53]:
target = "species"

encode = ["sex", "island"]

`get_dummies` encodes the categorical columns

In [54]:
for col in encode:
  dummy = pd.get_dummies(df[col], prefix = col)
  # axis = 1 is to add columns
  df = pd.concat([df, dummy], axis = 1)
  print(df)
  # deleting the former non hot encoded columns
  del df[col]




       species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0       Adelie  Torgersen            39.1           18.7                181   
1       Adelie  Torgersen            39.5           17.4                186   
2       Adelie  Torgersen            40.3           18.0                195   
3       Adelie  Torgersen            36.7           19.3                193   
4       Adelie  Torgersen            39.3           20.6                190   
..         ...        ...             ...            ...                ...   
328  Chinstrap      Dream            55.8           19.8                207   
329  Chinstrap      Dream            43.5           18.1                202   
330  Chinstrap      Dream            49.6           18.2                193   
331  Chinstrap      Dream            50.8           19.0                210   
332  Chinstrap      Dream            50.2           18.7                198   

     body_mass_g     sex  sex_female  sex_male  
0 

In [56]:
encoded_df = df

In [58]:
encoded_df

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex_female,sex_male,island_Biscoe,island_Dream,island_Torgersen
0,Adelie,39.1,18.7,181,3750,False,True,False,False,True
1,Adelie,39.5,17.4,186,3800,True,False,False,False,True
2,Adelie,40.3,18.0,195,3250,True,False,False,False,True
3,Adelie,36.7,19.3,193,3450,True,False,False,False,True
4,Adelie,39.3,20.6,190,3650,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...
328,Chinstrap,55.8,19.8,207,4000,False,True,False,True,False
329,Chinstrap,43.5,18.1,202,3400,True,False,False,True,False
330,Chinstrap,49.6,18.2,193,3775,False,True,False,True,False
331,Chinstrap,50.8,19.0,210,4100,False,True,False,True,False


`pd.apply()` allow the users to pass a function and apply it on every single value of the Pandas series.

In [59]:
# encoding target
target_mapper = {"Adelie": 0, "Chinstrap": 1, "Gentoo": 2}

# so define a function

def target_encode(val):
  # this will retrieve the value of each dictionary key in the target mapper
  return target_mapper[val]



In [63]:
# overwrites the curren species column
df['species'] = df['species'].apply(target_encode)

In [64]:
df

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex_female,sex_male,island_Biscoe,island_Dream,island_Torgersen
0,0,39.1,18.7,181,3750,False,True,False,False,True
1,0,39.5,17.4,186,3800,True,False,False,False,True
2,0,40.3,18.0,195,3250,True,False,False,False,True
3,0,36.7,19.3,193,3450,True,False,False,False,True
4,0,39.3,20.6,190,3650,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...
328,1,55.8,19.8,207,4000,False,True,False,True,False
329,1,43.5,18.1,202,3400,True,False,False,True,False
330,1,49.6,18.2,193,3775,False,True,False,True,False
331,1,50.8,19.0,210,4100,False,True,False,True,False


#### Now that the data is clean and encoded, to pass it tru the model need to seperate the data

In [66]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333 entries, 0 to 332
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            333 non-null    int64  
 1   bill_length_mm     333 non-null    float64
 2   bill_depth_mm      333 non-null    float64
 3   flipper_length_mm  333 non-null    int64  
 4   body_mass_g        333 non-null    int64  
 5   sex_female         333 non-null    bool   
 6   sex_male           333 non-null    bool   
 7   island_Biscoe      333 non-null    bool   
 8   island_Dream       333 non-null    bool   
 9   island_Torgersen   333 non-null    bool   
dtypes: bool(5), float64(2), int64(3)
memory usage: 14.8 KB
None


In [69]:
# drop the species column since we want to predict species
X = df.drop('species', axis = 1)
Y = df['species']

In [74]:
# create random forest classifier
clf = RandomForestClassifier()

# fit data
clf.fit(X,Y)

In [77]:
# saves the model
pickle.dump(clf, open('penguins_clf.pkl', 'wb'))