# Анализ данных по Титанику

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Read the Titanic passenger table from the CSV file (path is relative to the working directory).
csv_path = "titanic.csv"
titanic_data = pd.read_csv(csv_path)

In [4]:
# Display the loaded DataFrame (the notebook renders a truncated preview of it).
titanic_data

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [7]:
# Dimensions of the table as a (rows, columns) tuple
titanic_data.shape

(891, 15)

In [8]:
# Technical summary of the DataFrame: column names, non-null counts, dtypes and memory usage
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     891 non-null    int64  
 1   pclass       891 non-null    int64  
 2   sex          891 non-null    object 
 3   age          714 non-null    float64
 4   sibsp        891 non-null    int64  
 5   parch        891 non-null    int64  
 6   fare         891 non-null    float64
 7   embarked     889 non-null    object 
 8   class        891 non-null    object 
 9   who          891 non-null    object 
 10  adult_male   891 non-null    bool   
 11  deck         203 non-null    object 
 12  embark_town  889 non-null    object 
 13  alive        891 non-null    object 
 14  alone        891 non-null    bool   
dtypes: bool(2), float64(2), int64(4), object(7)
memory usage: 92.4+ KB


- анализ

In [9]:
# Summary statistics.
# Number of survivors: the total of the "survived" column.
survived_flags = titanic_data["survived"]
survived_flags.sum()

342

In [10]:
# Mean passenger age.
passenger_ages = titanic_data["age"]
passenger_ages.mean()

29.69911764705882

In [11]:
# Youngest recorded passenger age.
titanic_data.loc[:, "age"].min()

0.42

In [12]:
# Oldest recorded passenger age.
titanic_data.loc[:, "age"].max()

80.0

In [13]:
# "survived" value for the rows where "age" == 0.42:
# a boolean row mask combined with ordinary label-based column selection.
age_is_042 = titanic_data["age"] == 0.42
titanic_data.loc[age_is_042, "survived"]

803    1
Name: survived, dtype: int64

In [14]:
# Every column for the rows where "age" == 0.42.
youngest_rows = titanic_data["age"].eq(0.42)
titanic_data.loc[youngest_rows]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
803,1,3,male,0.42,0,1,8.5167,C,Third,child,False,,Cherbourg,yes,False


In [15]:
# All columns for the rows where "age" == 80
titanic_data.loc[titanic_data["age"] == 80, :]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
630,1,1,male,80.0,0,0,30.0,S,First,man,True,A,Southampton,yes,True


In [16]:
# "survived" value for the rows where "age" == 80.
age_is_80 = titanic_data["age"] == 80
titanic_data.loc[age_is_80, "survived"]

630    1
Name: survived, dtype: int64

In [18]:
# A chosen subset of columns for the rows where "age" == 80.
selected_columns = ["survived", "pclass", "sex", "embark_town", "alone"]
titanic_data.loc[titanic_data["age"] == 80, selected_columns]

Unnamed: 0,survived,pclass,sex,embark_town,alone
630,1,1,male,Southampton,True


In [19]:
# Mean "age" over the rows where "survived" == 1 (survivors).
survivor_mask = titanic_data["survived"] == 1
titanic_data.loc[survivor_mask, "age"].mean()

28.343689655172415

In [20]:
# Mean "age" over the rows where "survived" == 0 (did not survive).
non_survivor_mask = titanic_data["survived"].eq(0)
titanic_data.loc[non_survivor_mask, "age"].mean()

30.62617924528302

In [22]:
# Minimum "age" over the rows where "survived" == 0 (did not survive)
titanic_data.loc[titanic_data["survived"] == 0, "age"].min()

1.0

In [23]:
# Maximum "age" over the rows where "survived" == 0 (did not survive)
titanic_data.loc[titanic_data["survived"] == 0, "age"].max()

74.0

- Groupby

In [25]:
# "Group by": split the rows by "survived", apply the mean to each group,
# then combine the per-group results into a single table.
# numeric_only=True limits the mean to numeric and boolean columns; without it
# pandas >= 2.0 raises TypeError on the text columns instead of dropping them.
titanic_data.groupby("survived").mean(numeric_only=True)

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare,adult_male,alone
survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2.531876,30.626179,0.553734,0.32969,22.117887,0.817851,0.681239
1,1.950292,28.34369,0.473684,0.464912,48.395408,0.25731,0.476608


In [26]:
# Mean "pclass" and mean "age" within each "survived" group.
by_survival = titanic_data.groupby("survived")
by_survival[["pclass", "age"]].mean()

Unnamed: 0_level_0,pclass,age
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2.531876,30.626179
1,1.950292,28.34369


In [28]:
# Group the rows by "survived" and take the minimum "pclass" and "age" in each group.
columns_of_interest = ["pclass", "age"]
titanic_data.groupby("survived")[columns_of_interest].min()

Unnamed: 0_level_0,pclass,age
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,1.0
1,1,0.42


In [31]:
# First five rows of the table.
titanic_data.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [33]:
# Mean "age" per passenger class ("pclass"); the double brackets keep the result a DataFrame.
age_by_class = titanic_data.groupby("pclass")[["age"]]
age_by_class.mean()

Unnamed: 0_level_0,age
pclass,Unnamed: 1_level_1
1,38.233441
2,29.87763
3,25.14062


In [34]:
# Mean "pclass" and "age" per "who" category.
by_who = titanic_data.groupby(by="who")
by_who[["pclass", "age"]].mean()

Unnamed: 0_level_0,pclass,age
who,Unnamed: 1_level_1,Unnamed: 2_level_1
child,2.626506,6.369518
man,2.372439,33.173123
woman,2.084871,32.0


In [35]:
# Mean "pclass" and "age" for every ("who", "survived") combination.
group_keys = ["who", "survived"]
titanic_data.groupby(group_keys)[["pclass", "age"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,pclass,age
who,survived,Unnamed: 2_level_1,Unnamed: 3_level_1
child,0,2.941176,7.220588
child,1,2.408163,5.77898
man,0,2.454343,32.983871
man,1,1.954545,34.069444
woman,0,2.848485,30.459184
woman,1,1.839024,32.446746


In [37]:
# Number of non-null "survived" entries for every ("who", "survived", "pclass") combination.
who_outcome_class = ["who", "survived", "pclass"]
titanic_data.groupby(who_outcome_class)[["survived"]].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,survived
who,survived,pclass,Unnamed: 3_level_1
child,0,1,1
child,0,3,33
child,1,1,5
child,1,2,19
child,1,3,25
man,0,1,77
man,0,2,91
man,0,3,281
man,1,1,42
man,1,2,8


In [38]:
# Survival statistics by sex: row counts for every ("sex", "survived", "pclass") combination.
sex_outcome_class = ["sex", "survived", "pclass"]
titanic_data.groupby(sex_outcome_class)[["survived"]].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,survived
sex,survived,pclass,Unnamed: 3_level_1
female,0,1,3
female,0,2,6
female,0,3,72
female,1,1,91
female,1,2,70
female,1,3,72
male,0,1,77
male,0,2,91
male,0,3,300
male,1,1,45


In [39]:
# Apply several aggregation functions to the whole table at once.
# The functions are passed by string name: handing pandas the builtin `min` or
# `np.max` as callables is deprecated in recent pandas, and the row label that
# `np.max` produces ("amax" vs "max") depends on the NumPy version. With
# strings the rows are always labelled "mean", "min" and "max".
# NOTE(review): the recorded output shows NaN means for text columns; newer
# pandas may raise on those columns instead -- confirm against the installed version.
titanic_data.aggregate(["mean", "min", "max"])

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,class,who,adult_male,alive,alone
mean,0.383838,2.308642,,29.699118,0.523008,0.381594,32.204208,,,0.602694,,0.602694
min,0.0,1.0,female,0.42,0.0,0.0,0.0,First,child,False,no,False
amax,1.0,3.0,male,80.0,8.0,6.0,512.3292,Third,woman,True,yes,True


In [40]:
# Apply several aggregation functions per ("who", "survived", "pclass") group
# to the "survived" and "age" columns.
# The functions are passed by string name: the builtin `min` and `np.max`
# callables are deprecated as aggregation arguments in recent pandas, and the
# column label from `np.max` ("amax" vs "max") depends on the NumPy version.
# With strings the sub-columns are always "mean", "min", "max" and "count".
titanic_data.groupby(["who", "survived", "pclass"])[["survived", "age"]].aggregate(["mean", "min", "max", "count"])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,survived,survived,survived,survived,age,age,age,age
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,min,amax,count,mean,min,amax,count
who,survived,pclass,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
child,0,1,0,0,0,1,2.0,2.0,2.0,1
child,0,3,0,0,0,33,7.378788,1.0,15.0,33
child,1,1,1,1,1,5,8.984,0.92,15.0,5
child,1,2,1,1,1,19,4.543684,0.67,14.0,19
child,1,3,1,1,1,25,6.0768,0.42,15.0,25
man,0,1,0,0,0,77,44.581967,18.0,71.0,61
man,0,2,0,0,0,91,33.369048,16.0,70.0,84
man,0,3,0,0,0,281,29.209184,16.0,74.0,196
man,1,1,1,1,1,42,38.756757,17.0,80.0,37
man,1,2,1,1,1,8,36.666667,19.0,62.0,6
