# A simple ML Workflow

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier

### Get Data

In [3]:
df = pd.read_csv('penguins.csv')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female


In [8]:
df.shape

(344, 7)

### Train-Test-Split

In [6]:
train, test = train_test_split(df, test_size=0.2, random_state=42)  # <-- random seed value

In [7]:
train.shape, test.shape

((275, 7), (69, 7))

### Exploratory Analysis

In [9]:
# you do this for the Titanic data yourself

### Feature Engineering

In [10]:
train.fillna(0.0, inplace=True) # little crutch to make the data work

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


### Build a model

In [16]:
# predict the species of a penguin from its beak length
X = train[['bill_length_mm', 'bill_depth_mm']] # input features or independent variables
y = train['species']              # target variable or dependent variable

In [17]:
X.shape, y.shape  # X is a matrix, y is a vector

((275, 2), (275,))

In [18]:
# train a dummy classifier as a baseline model
from sklearn.dummy import DummyClassifier

In [26]:
model = DummyClassifier(strategy='most_frequent')
model.fit(X, y)   # trains the model
model.score(X, y) # calculates an accuracy (proportion of correct predictions)

0.43636363636363634

In [21]:
# model will alway predict 'Adelie'
y.value_counts()

Adelie       120
Gentoo       100
Chinstrap     55
Name: species, dtype: int64

#### Accuracy

number of correct predictions / number of total predictions


In [24]:
round(120 / (120 + 100 + 55), 3)

0.436

### Evaluate the model using the test data

In [27]:
# feature engineer the test data
test.fillna(0.0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [28]:
# use the model to calculate a score
Xtest = test[['bill_length_mm', 'bill_depth_mm']] # input features or independent variables
ytest = test['species'] 

model.score(Xtest, ytest) # indepent estimate of the error rate (accuracy)

0.463768115942029

### Make Predictions

In [31]:
new = pd.DataFrame([[1000000, 20]])
new

Unnamed: 0,0,1
0,1000000,20


In [32]:
model.predict(new) # <-- use this for the Kaggle submission

array(['Adelie'], dtype='<U6')