In [1]:
import pandas as pd

In [2]:
df=pd.read_csv("../datasets/train.csv")

In [5]:
df.shape

(891, 12)

In [7]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [11]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [13]:
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [15]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## column description
- PassengerId-Unique identifier for each passenger.
- **Pclass**	-Passenger class (1st, 2nd, or 3rd). Lower numbers indicate higher class.
- **Name**	-	Full name of the passenger.
- **Sex**	-	Gender of the passenger (male or female).
- **Age**	-	Age of the passenger (in years). Some values might be missing.
- **SibSp**	-	Number of siblings/spouses aboard the Titanic.
- **Parch** -	Number of parents/children aboard the Titanic.
- **Ticket**	-	Ticket number assigned to the passenger. Some may share ticket numbers (families or groups).
- **Fare**	-	Price of the ticket in British pounds (£).
- **Cabin**	-	Cabin number where the passenger stayed. Many values are missing.
- **Embarked**	-	Port of embarkation (where the passenger boarded the Titanic):
C = Cherbourg, Q = Queenstown, S = Southampton.

In [22]:
#rename column names in understandable form
df.rename(columns={'Sex':'Gender'},inplace=True)

In [24]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Gender', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [26]:
#feature creation
#SibSp+Parch+1(for passenger)=FamilySize

df["FamilySize"]=df["SibSp"]+df["Parch"]+1

In [28]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Gender', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'FamilySize'],
      dtype='object')

**pd.cut() splits a continuous variable (like Fare, which has many different values) into fixed ranges (bins).**

In [30]:
df["FareCategory"] = pd.cut(df["Fare"], bins=[0, 10, 50, 100, 600], labels=["Low", "Medium", "High", "Very High"])

Bin Range (Fare)	Category
0 – 10	            "Low"
10 – 50	           "Medium"
50 – 100	        "High"
100 – 600	       "Very High"

In [32]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Gender', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'FamilySize',
       'FareCategory'],
      dtype='object')

In [34]:
df["FareCategory"]

0         Low
1        High
2         Low
3        High
4         Low
        ...  
886    Medium
887    Medium
888    Medium
889    Medium
890       Low
Name: FareCategory, Length: 891, dtype: category
Categories (4, object): ['Low' < 'Medium' < 'High' < 'Very High']

In [37]:
df.isnull().sum()

PassengerId       0
Survived          0
Pclass            0
Name              0
Gender            0
Age             177
SibSp             0
Parch             0
Ticket            0
Fare              0
Cabin           687
Embarked          2
FamilySize        0
FareCategory     15
dtype: int64

In [39]:
df.dtypes

PassengerId        int64
Survived           int64
Pclass             int64
Name              object
Gender            object
Age              float64
SibSp              int64
Parch              int64
Ticket            object
Fare             float64
Cabin             object
Embarked          object
FamilySize         int64
FareCategory    category
dtype: object

In [41]:
df["Cabin"].unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [45]:
df.fillna({"Cabin":"unknown"}, inplace=True)

In [47]:
df.isnull().sum()

PassengerId       0
Survived          0
Pclass            0
Name              0
Gender            0
Age             177
SibSp             0
Parch             0
Ticket            0
Fare              0
Cabin             0
Embarked          2
FamilySize        0
FareCategory     15
dtype: int64

In [49]:
df["Embarked"].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [51]:
df.dropna(subset=["Embarked"], inplace=True)

In [53]:
df.isnull().sum()

PassengerId       0
Survived          0
Pclass            0
Name              0
Gender            0
Age             177
SibSp             0
Parch             0
Ticket            0
Fare              0
Cabin             0
Embarked          0
FamilySize        0
FareCategory     15
dtype: int64

In [59]:
df.fillna({"Age":df["Age"].mean()}, inplace=True)

In [61]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Gender           0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin            0
Embarked         0
FamilySize       0
FareCategory    15
dtype: int64

In [67]:
df["FareCategory"] = df["FareCategory"].astype(str)
df.fillna({"FareCategory":"Unknown"}, inplace=True)

In [69]:
df.isnull().sum()

PassengerId     0
Survived        0
Pclass          0
Name            0
Gender          0
Age             0
SibSp           0
Parch           0
Ticket          0
Fare            0
Cabin           0
Embarked        0
FamilySize      0
FareCategory    0
dtype: int64

In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   889 non-null    int64  
 1   Survived      889 non-null    int64  
 2   Pclass        889 non-null    int64  
 3   Name          889 non-null    object 
 4   Gender        889 non-null    object 
 5   Age           889 non-null    float64
 6   SibSp         889 non-null    int64  
 7   Parch         889 non-null    int64  
 8   Ticket        889 non-null    object 
 9   Fare          889 non-null    float64
 10  Cabin         889 non-null    object 
 11  Embarked      889 non-null    object 
 12  FamilySize    889 non-null    int64  
 13  FareCategory  889 non-null    object 
dtypes: float64(2), int64(6), object(6)
memory usage: 104.2+ KB
