In [1]:
import pandas as pd

In [2]:
# Load the dataset
titanic = pd.read_csv("data/titanic.csv")
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


# How do I select a subset of a DataFrame?

In [3]:
ages = titanic["age"]
ages.head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: age, dtype: float64

In [4]:
type(titanic["age"])

pandas.core.series.Series

In [5]:
titanic["age"].shape

(891,)

### To select multiple columns, use a list of column names within the selection brackets [].

In [6]:
age_sex = titanic[["age", "sex"]]
age_sex.head()

Unnamed: 0,age,sex
0,22.0,male
1,38.0,female
2,26.0,female
3,35.0,female
4,35.0,male


In [7]:
type(titanic[["age", "sex"]])

pandas.core.frame.DataFrame

## Note:
### To select rows based on a conditional expression, use a condition inside the selection brackets [].

### I’m interested in the passengers older than 35 years.

In [8]:
above_35 = titanic[titanic["age"] > 35]
above_35.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
11,1,1,female,58.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True
13,0,3,male,39.0,1,5,31.275,S,Third,man,True,,Southampton,no,False
15,1,2,female,55.0,0,0,16.0,S,Second,woman,False,,Southampton,yes,True


In [9]:
titanic["age"] > 35

0      False
1       True
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: age, Length: 891, dtype: bool

In [10]:
above_35.shape

(217, 15)

## isin(list)
### I’m interested in the Titanic passengers from cabin class 2 and 3.

In [11]:
class_23 = titanic[titanic["pclass"].isin([2, 3])]
class_23.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


the isin() conditional function returns a True for each row the values are in the provided list. To filter the rows based on such a function, use the conditional function inside the selection brackets []. In this case, the condition inside the selection brackets titanic["Pclass"].isin([2, 3]) checks for which rows the Pclass column is either 2 or 3.

## anther solution for isin(list)

In [12]:
class_23 = titanic[(titanic["pclass"] == 2) | (titanic["pclass"] == 3)]
class_23.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


## notna()
### I want to work with passenger data for which the age is known.

In [13]:
age_no_na = titanic[titanic["age"].notna()]
age_no_na.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


The notna() conditional function returns a True for each row the values are not a Null value. As such, this can be combined with the selection brackets [] to filter the data table.

In [14]:
age_no_na.shape

(714, 15)

## loc
### How do I select specific rows and columns from a DataFrame?

In [15]:
adult_names = titanic.loc[titanic["age"] > 35, "sex"]
adult_names.head()

1     female
6       male
11    female
13      male
15    female
Name: sex, dtype: object

## iloc

In [16]:
titanic.iloc[9:25, 2:5]

Unnamed: 0,sex,age,sibsp
9,female,14.0,1
10,female,4.0,1
11,female,58.0,0
12,male,20.0,0
13,male,39.0,1
14,female,14.0,0
15,female,55.0,0
16,male,2.0,4
17,male,,0
18,female,31.0,1


### to assign the name anonymous to the first 3 elements of the third column:

In [17]:
titanic.iloc[0:3, 3] = "anonymous"

In [18]:
titanic.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,anonymous,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,anonymous,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,anonymous,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


In [19]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,anonymous,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,anonymous,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,anonymous,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
