In [1]:
import pandas as pd
import seaborn as sns

## The Palmer penguins dataset

In [2]:
df = sns.load_dataset('penguins')

In [3]:
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [4]:
# df.info()

### Features
- `species`: a factor denoting penguin species (Adélie, Chinstrap and Gentoo)
- `island`: a factor denoting island in Palmer Archipelago, Antarctica (Biscoe, Dream or Torgersen)
- `bill_length_mm`: a number denoting bill length (millimeters)
- `bill_depth_mm`: a number denoting bill depth (millimeters)
- `flipper_length_mm`: an integer denoting flipper length (millimeters)
- `body_mass_g`: an integer denoting body mass (grams)
- `sex`: a factor denoting penguin sex (female, male)

### Ecoding

In [5]:
pd.get_dummies(df[['island','species']], prefix=['island', 'sex']) # drop_first=True to dummy encoding

Unnamed: 0,island_Biscoe,island_Dream,island_Torgersen,sex_Adelie,sex_Chinstrap,sex_Gentoo
0,0,0,1,1,0,0
1,0,0,1,1,0,0
2,0,0,1,1,0,0
3,0,0,1,1,0,0
4,0,0,1,1,0,0
...,...,...,...,...,...,...
339,1,0,0,0,0,1
340,1,0,0,0,0,1
341,1,0,0,0,0,1
342,1,0,0,0,0,1


#### 

In [6]:
# use pd.concat to join the new columns with your original dataframe
df = pd.concat([df,pd.get_dummies(df[['island','species']], prefix=['island', 'species'])], axis=1)
df = pd.concat([df,pd.get_dummies(df['sex'], prefix='sex')],axis=1)

# now drop the original 'country' column (you don't need it anymore)
df.drop(['island', 'species', 'sex'],axis=1, inplace=True)

In [7]:
df

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Biscoe,island_Dream,island_Torgersen,species_Adelie,species_Chinstrap,species_Gentoo,sex_Female,sex_Male
0,39.1,18.7,181.0,3750.0,0,0,1,1,0,0,0,1
1,39.5,17.4,186.0,3800.0,0,0,1,1,0,0,1,0
2,40.3,18.0,195.0,3250.0,0,0,1,1,0,0,1,0
3,,,,,0,0,1,1,0,0,0,0
4,36.7,19.3,193.0,3450.0,0,0,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
339,,,,,1,0,0,0,0,1,0,0
340,46.8,14.3,215.0,4850.0,1,0,0,0,0,1,1,0
341,50.4,15.7,222.0,5750.0,1,0,0,0,0,1,0,1
342,45.2,14.8,212.0,5200.0,1,0,0,0,0,1,1,0


## Bike sharing dataset 

In [8]:
df = pd.read_csv('data/bike-sharing-dataset/hour.csv')

In [9]:
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


### Features
Both hour.csv and day.csv have the following fields, except hr which is not available in day.csv

- instant: record index
- dteday : date
- season : season (1:springer, 2:summer, 3:fall, 4:winter)
- yr : year (0: 2011, 1:2012)
- mnth : month ( 1 to 12)
- hr : hour (0 to 23)
- holiday : weather day is holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule)
- weekday : day of the week
- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
+ weathersit : 
    - 1: Clear, Few clouds, Partly cloudy, Partly cloudy
    - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
    - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
    - 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
- temp : Normalized temperature in Celsius. The values are divided to 41 (max)
- atemp: Normalized feeling temperature in Celsius. The values are divided to 50 (max)
- hum: Normalized humidity. The values are divided to 100 (max)
- windspeed: Normalized wind speed. The values are divided to 67 (max)
- casual: count of casual users
- registered: count of registered users
- cnt: count of total rental bikes including both casual and registered
    

### Questions
 - numerical? 
 - categorical? 
     - useless? 
     - ordinal? 
     - nominal? 
     - binary? 
     - time? 