# What is a classification problem?

Classification problems are an example of **supervised learning**.  In a classification problem, the input data is presented to the machine learning model, and the task is to **predict the target** corresponding to the input data.  The **target** is a categorical variable, so the classification task is to predict the category or label of the target given the input data.  


For example, we could predict the weather (*sunny, windy, rainy, cloudy*) given input data such as temperature, relative humidity, atmospheric pressure, wind speed, wind direction, etc. Notice that the label of the **target** is a categorical variable (*sunny, windy, rainy, cloudy*).

In [1]:
import numpy as np
import pandas as pd

In [2]:
path = "https://raw.githubusercontent.com/mojuan010-hithub/decision_tree_penguins_size/refs/heads/main/penguins.csv"
df=pd.read_csv(path)
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
...,...,...,...,...,...,...,...,...
339,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
340,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
341,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
342,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


In [3]:
df.isnull().sum()

Unnamed: 0,0
species,0
island,0
bill_length_mm,2
bill_depth_mm,2
flipper_length_mm,2
body_mass_g,2
sex,11
year,0


In [4]:
df.dropna(inplace=True)

In [5]:
df.isnull().sum()

Unnamed: 0,0
species,0
island,0
bill_length_mm,0
bill_depth_mm,0
flipper_length_mm,0
body_mass_g,0
sex,0
year,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 333 entries, 0 to 343
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            333 non-null    object 
 1   island             333 non-null    object 
 2   bill_length_mm     333 non-null    float64
 3   bill_depth_mm      333 non-null    float64
 4   flipper_length_mm  333 non-null    float64
 5   body_mass_g        333 non-null    float64
 6   sex                333 non-null    object 
 7   year               333 non-null    int64  
dtypes: float64(4), int64(1), object(3)
memory usage: 23.4+ KB


In [7]:
df["species"].value_counts()

Unnamed: 0_level_0,count
species,Unnamed: 1_level_1
Adelie,146
Gentoo,119
Chinstrap,68


In [8]:
df["island"].value_counts()

Unnamed: 0_level_0,count
island,Unnamed: 1_level_1
Biscoe,163
Dream,123
Torgersen,47


In [9]:
df["sex"].value_counts()

Unnamed: 0_level_0,count
sex,Unnamed: 1_level_1
male,168
female,165


In [10]:
# using labelencoder to encode target column (categorial)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["species"]= le.fit_transform(df["species"])
df["species"]
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,0,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,0,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,0,Torgersen,40.3,18.0,195.0,3250.0,female,2007
4,0,Torgersen,36.7,19.3,193.0,3450.0,female,2007
5,0,Torgersen,39.3,20.6,190.0,3650.0,male,2007
...,...,...,...,...,...,...,...,...
339,1,Dream,55.8,19.8,207.0,4000.0,male,2009
340,1,Dream,43.5,18.1,202.0,3400.0,female,2009
341,1,Dream,49.6,18.2,193.0,3775.0,male,2009
342,1,Dream,50.8,19.0,210.0,4100.0,male,2009


In [11]:
# using onehotencoder to encode features' categorial columns
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
sex_island_encoder = ohe.fit_transform(df[["sex","island"]]).toarray()
df_sex_island_encoder = pd.DataFrame(data=sex_island_encoder,
                                     index=df.index,
                                     columns=[f"sex_{c}" for c in ohe.categories_[0]]+[f"island_{c}" for c in ohe.categories_[1]])
df = pd.concat([df,df_sex_island_encoder],axis=1)
df = df.drop(columns=["island","sex"])
df.head(5)

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year,sex_female,sex_male,island_Biscoe,island_Dream,island_Torgersen
0,0,39.1,18.7,181.0,3750.0,2007,0.0,1.0,0.0,0.0,1.0
1,0,39.5,17.4,186.0,3800.0,2007,1.0,0.0,0.0,0.0,1.0
2,0,40.3,18.0,195.0,3250.0,2007,1.0,0.0,0.0,0.0,1.0
4,0,36.7,19.3,193.0,3450.0,2007,1.0,0.0,0.0,0.0,1.0
5,0,39.3,20.6,190.0,3650.0,2007,0.0,1.0,0.0,0.0,1.0


In [12]:
y=df["species"]
y.head(5)

Unnamed: 0,species
0,0
1,0
2,0
4,0
5,0


In [13]:
X=df.drop(columns="species")
X.head(5)

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year,sex_female,sex_male,island_Biscoe,island_Dream,island_Torgersen
0,39.1,18.7,181.0,3750.0,2007,0.0,1.0,0.0,0.0,1.0
1,39.5,17.4,186.0,3800.0,2007,1.0,0.0,0.0,0.0,1.0
2,40.3,18.0,195.0,3250.0,2007,1.0,0.0,0.0,0.0,1.0
4,36.7,19.3,193.0,3450.0,2007,1.0,0.0,0.0,0.0,1.0
5,39.3,20.6,190.0,3650.0,2007,0.0,1.0,0.0,0.0,1.0


In [14]:
from sklearn.tree import DecisionTreeClassifier
dtc_model = DecisionTreeClassifier(random_state=0)
dtc_model

In [15]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y)
print("X_train shape:",X_train.shape)
print("y_train shape:",y_train.shape)
print("X_test shape:",X_test.shape)
print("y_test shape:",y_test.shape)

X_train shape: (249, 10)
y_train shape: (249,)
X_test shape: (84, 10)
y_test shape: (84,)


In [54]:
from sklearn.model_selection import GridSearchCV
param_grid = {"max_depth":[None,1,5,10,20,30,50,100],
              "min_samples_leaf" : [1,2,5,10,20,30,50,100]
              }
grid = GridSearchCV(estimator= dtc_model,
             param_grid = param_grid,
             cv = 5,scoring = "accuracy")
grid.fit(X_train,y_train)

In [55]:
best_model = grid.best_estimator_
best_model

In [56]:
grid.best_params_

{'max_depth': None, 'min_samples_leaf': 1}

In [57]:
training_accuracy_score = grid.best_score_
training_accuracy_score

np.float64(0.9679183673469387)

In [58]:
y_test_prediction = best_model.predict(X_test)
y_test_prediction

array([0, 1, 0, 2, 1, 2, 1, 2, 2, 2, 2, 2, 1, 1, 0, 0, 1, 0, 1, 0, 2, 0,
       1, 0, 1, 2, 0, 2, 0, 1, 0, 0, 0, 2, 1, 1, 0, 1, 1, 2, 2, 1, 2, 2,
       0, 2, 1, 1, 0, 2, 0, 1, 2, 2, 2, 0, 0, 1, 2, 0, 1, 2, 0, 0, 0, 0,
       1, 0, 2, 0, 0, 2, 1, 0, 0, 2, 2, 1, 0, 1, 1, 0, 0, 2])

In [59]:
from sklearn.metrics import accuracy_score
test_accuracy_score = accuracy_score(y_test,y_test_prediction)
test_accuracy_score

0.9761904761904762