In [2]:
import pandas as pd
import numpy as np

# считываем файл train.csv
data = pd.read_csv('/content/train.csv')

In [3]:
# Основная информация о данных
print(data.info())

# Число пропусков
print(data.isnull().sum())

# Основные статистики
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int6

In [4]:
# Процент выживаемости по классам
survival_rate_by_class = data.groupby('Pclass')['Survived'].mean() * 100
print(survival_rate_by_class)

Pclass
1    62.962963
2    47.282609
3    24.236253
Name: Survived, dtype: float64


In [7]:
# Извлечение имени
data['FirstName'] = data['Name'].str.extract(r',\s(.*?)\.')
data['FirstName'] = data['Name'].str.extract(r'\((.*?)\)|,\s(?:\w+)\s+(\w+)').bfill(axis=1)[0]

# Самые популярные мужское и женское имена
popular_male_name = data[data['Sex'] == 'male']['FirstName'].mode()[0]
popular_female_name = data[data['Sex'] == 'female']['FirstName'].mode()[0]

print(f"Самое популярное мужское имя: {popular_male_name}")
print(f"Самое популярное женское имя: {popular_female_name}")

Самое популярное мужское имя: "George Arthur Brayton"
Самое популярное женское имя: "Mary"


In [8]:
# Самые популярные имена по классам
popular_names_by_class = data.groupby(['Pclass', 'Sex'])['FirstName'].agg(lambda x: x.mode()[0])
print(popular_names_by_class)

Pclass  Sex   
1       female          "Mrs de Villiers"
        male      "George Arthur Brayton"
2       female              "Mrs Harbeck"
        male            "Louis M Hoffman"
3       female                     "Mary"
        male                      "Henry"
Name: FirstName, dtype: object


In [9]:
# Пассажиры старше 44 лет
older_than_44 = data[data['Age'] > 44]
print(older_than_44)

     PassengerId  Survived  Pclass  \
6              7         0       1   
11            12         1       1   
15            16         1       2   
33            34         0       2   
52            53         1       1   
..           ...       ...     ...   
857          858         1       1   
862          863         1       1   
871          872         1       1   
873          874         0       3   
879          880         1       1   

                                                  Name     Sex   Age  SibSp  \
6                              McCarthy, Mr. Timothy J    male  54.0      0   
11                            Bonnell, Miss. Elizabeth  female  58.0      0   
15                    Hewlett, Mrs. (Mary D Kingcome)   female  55.0      0   
33                               Wheadon, Mr. Edward H    male  66.0      0   
52            Harper, Mrs. Henry Sleeper (Myna Haxtun)  female  49.0      1   
..                                                 ...     ...   ... 

In [10]:
# Мужчины младше 44 лет
males_younger_than_44 = data[(data['Age'] < 44) & (data['Sex'] == 'male')]
print(males_younger_than_44)

     PassengerId  Survived  Pclass                            Name   Sex  \
0              1         0       3         Braund, Mr. Owen Harris  male   
4              5         0       3        Allen, Mr. William Henry  male   
7              8         0       3  Palsson, Master. Gosta Leonard  male   
12            13         0       3  Saundercock, Mr. William Henry  male   
13            14         0       3     Andersson, Mr. Anders Johan  male   
..           ...       ...     ...                             ...   ...   
883          884         0       2   Banfield, Mr. Frederick James  male   
884          885         0       3          Sutehall, Mr. Henry Jr  male   
886          887         0       2           Montvila, Rev. Juozas  male   
889          890         1       1           Behr, Mr. Karl Howell  male   
890          891         0       3             Dooley, Mr. Patrick  male   

      Age  SibSp  Parch            Ticket    Fare Cabin Embarked FirstName  
0    22.0 

In [11]:
# Число пассажиров в каютах
data['CabinGroup'] = data['Cabin'].str.split(' ').str.len().fillna(0).astype(int)
cabin_counts = data['CabinGroup'].value_counts()
print(cabin_counts)

CabinGroup
0    687
1    180
2     16
3      6
4      2
Name: count, dtype: int64
