In [1]:

# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw train and test CSVs (paths are relative to the working directory)
data, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/raw/train.csv', '../data/raw/test.csv')
)
# Preview the top of the training frame
data.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Keep an independent (deep) copy of the Ticket column
ticket = data['Ticket'].copy(deep=True)

In [3]:
# Basic statistics for the 'Ticket' column (held in `ticket`).
# Ticket values are strings (object dtype), so describe() reports
# count / unique / top / freq rather than numeric summaries.
ticket.describe()

count        891
unique       681
top       347082
freq           7
Name: Ticket, dtype: object

In [4]:
# Drop rows whose Fare equals 0, as they may not be useful for analysis;
# .copy() makes the filtered result an independent frame.
data = data.loc[data['Fare'].ne(0)].copy()


In [5]:
# Frequency of each ticket string in the training frame
data.loc[:, 'Ticket'].value_counts()

Ticket
347082      7
1601        7
CA. 2343    7
3101295     6
CA 2144     6
           ..
2693        1
PC 17612    1
349233      1
349236      1
370376      1
Name: count, Length: 671, dtype: int64

In [6]:
# Frequency of each ticket string in the test frame
test.loc[:, 'Ticket'].value_counts()

Ticket
PC 17608    5
CA. 2343    4
113503      4
PC 17483    3
220845      3
           ..
349226      1
2621        1
4133        1
113780      1
2668        1
Name: count, Length: 363, dtype: int64

In [7]:
# Ticket prefix: the ticket text with every digit run and every dot removed,
# then trimmed; tickets that end up empty (purely numeric) become 'NUMBER'.
data['Ticket_prefix'] = (
    data['Ticket']
    .astype(str)
    .str.replace(r'\d+', '', regex=True)
    .str.replace('.', '', regex=False)
    .str.strip()
    .replace('', 'NUMBER')
)

# Ticket number: the run of digits that ends the ticket string, stored as float
# (NaN when the ticket does not end in a digit).
data['Ticket_number'] = (
    data['Ticket'].astype(str).str.extract(r'(\d+)$', expand=False).astype(float)
)

# Quick look at the derived columns next to the raw ticket
data.loc[:, ['Ticket', 'Ticket_prefix', 'Ticket_number']].head(10)


Unnamed: 0,Ticket,Ticket_prefix,Ticket_number
0,A/5 21171,A/,21171.0
1,PC 17599,PC,17599.0
2,STON/O2. 3101282,STON/O,3101282.0
3,113803,NUMBER,113803.0
4,373450,NUMBER,373450.0
5,330877,NUMBER,330877.0
6,17463,NUMBER,17463.0
7,349909,NUMBER,349909.0
8,347742,NUMBER,347742.0
9,237736,NUMBER,237736.0


In [8]:
# ---- 1) CLEAN, GROUP, DISPLAY ----
# Columns used as grouping keys for the cabin summary: rows that share a
# ticket number and an embarkation port fall into the same group.
keys = ['Ticket_number', 'Embarked']

# (a) A display-friendly Cabin_Grouped: show known cabins; if none, show 'Unknown'
def _agg_cabins_for_display(s: pd.Series) -> str:
    """Collapse one group's cabin values into a single display string.

    Missing values and entries that read 'unknown' (case-insensitive,
    ignoring surrounding whitespace) are skipped. The remaining distinct
    values are joined with ', ' in order of first appearance.

    Args:
        s: Cabin values belonging to one group.

    Returns:
        The joined cabin labels, or 'Unknown' when nothing usable remains.
    """
    # Cast to str before joining: the filter already tolerates non-string
    # values via str(v), and str.join would raise TypeError on them otherwise.
    known = [
        str(v)
        for v in s.dropna().unique()
        if str(v).strip().lower() != 'unknown'
    ]
    return ', '.join(known) if known else 'Unknown'

# One row per (Ticket_number, Embarked) group with its cabins folded into a
# single 'Cabin_Grouped' string.
# NOTE(review): groupby's default dropna=True leaves out rows whose key is
# NaN (e.g. a missing Embarked) - confirm that is intended.
cabin_grouped = (
    data.groupby(keys)['Cabin']
    .agg(_agg_cabins_for_display)
    .reset_index(name='Cabin_Grouped')
)

cabin_grouped

Unnamed: 0,Ticket_number,Embarked,Cabin_Grouped
0,3.0,S,E77
1,541.0,C,D
2,693.0,S,Unknown
3,695.0,S,B51 B53 B55
4,751.0,S,Unknown
...,...,...,...
665,3101310.0,S,Unknown
666,3101311.0,S,Unknown
667,3101312.0,S,Unknown
668,3101316.0,S,Unknown


In [9]:
# ---- 1) CLEAN, GROUP, DISPLAY ----
# NOTE(review): this cell repeats the preceding cell verbatim; consider
# removing one of the two copies.
# Columns used as grouping keys for the cabin summary: rows that share a
# ticket number and an embarkation port fall into the same group.
keys = ['Ticket_number', 'Embarked']

# (a) A display-friendly Cabin_Grouped: show known cabins; if none, show 'Unknown'
# NOTE(review): this redefines the helper already defined in the preceding
# cell; the later definition silently shadows the earlier one.
def _agg_cabins_for_display(s: pd.Series) -> str:
    """Collapse one group's cabin values into a single display string.

    Missing values and entries that read 'unknown' (case-insensitive,
    ignoring surrounding whitespace) are skipped. The remaining distinct
    values are joined with ', ' in order of first appearance.

    Args:
        s: Cabin values belonging to one group.

    Returns:
        The joined cabin labels, or 'Unknown' when nothing usable remains.
    """
    # Cast to str before joining: the filter already tolerates non-string
    # values via str(v), and str.join would raise TypeError on them otherwise.
    known = [
        str(v)
        for v in s.dropna().unique()
        if str(v).strip().lower() != 'unknown'
    ]
    return ', '.join(known) if known else 'Unknown'

# One row per (Ticket_number, Embarked) group with its cabins folded into a
# single 'Cabin_Grouped' string.
# NOTE(review): groupby's default dropna=True leaves out rows whose key is
# NaN (e.g. a missing Embarked) - confirm that is intended.
cabin_grouped = (
    data.groupby(keys)['Cabin']
    .agg(_agg_cabins_for_display)
    .reset_index(name='Cabin_Grouped')
)

cabin_grouped

Unnamed: 0,Ticket_number,Embarked,Cabin_Grouped
0,3.0,S,E77
1,541.0,C,D
2,693.0,S,Unknown
3,695.0,S,B51 B53 B55
4,751.0,S,Unknown
...,...,...,...
665,3101310.0,S,Unknown
666,3101311.0,S,Unknown
667,3101312.0,S,Unknown
668,3101316.0,S,Unknown
