# Import library

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import dataset

In [17]:
# Read the Titanic extract from the working directory and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably a row index
# that was written out when the CSV was saved — confirm before dropping it or
# switching to index_col=0.
df = pd.read_csv(filepath_or_buffer='titanic.csv')
df.head(n=5)

Unnamed: 0,Cabin,Ticket,number,Survived
0,,A/5 21171,5,0
1,C85,PC 17599,3,1
2,,STON/O2. 3101282,6,1
3,C123,113803,3,1
4,,373450,A,0


In [18]:
# List the distinct raw values of the mixed column to spot the non-numeric entries.
df.loc[:, 'number'].unique()

array(['5', '3', '6', 'A', '2', '1', '4'], dtype=object)

 # The goal is to separate numbers and letters: [`5`, `3`, `6`, `A`, `2`]

## Step 1 (Construct Numeric Column): to_numeric()
```python
pandas.to_numeric(arg, errors='raise', downcast=None, dtype_backend=<no_default>)
```
> errors : {‘ignore’, ‘raise’, ‘coerce’}, default ‘raise’
> - If ‘raise’, then invalid parsing will raise an exception.
> - If ‘coerce’, then invalid parsing will be set as NaN.
> - If ‘ignore’, then invalid parsing will return the input.

> downcast : str, default None
> - ‘integer’ or ‘signed’: smallest signed int dtype (min.: np.int8)
> - ‘unsigned’: smallest unsigned int dtype (min.: np.uint8)
> - ‘float’: smallest float dtype (min.: np.float32)

In [21]:
# Parse the mixed column as numbers. errors='coerce' turns every entry that
# cannot be parsed (such as 'A') into NaN instead of raising.
# NOTE(review): the recorded preview shows floats (5.0, 3.0, ...), so
# downcast='integer' appears to have no effect once NaN is present — confirm
# whether a nullable integer dtype is wanted here.
df['new_number'] = pd.to_numeric(
    arg=df['number'],
    errors='coerce',
    downcast='integer',
)
df.head()

Unnamed: 0,Cabin,Ticket,number,Survived,new_number
0,,A/5 21171,5,0,5.0
1,C85,PC 17599,3,1,3.0
2,,STON/O2. 3101282,6,1,6.0
3,C123,113803,3,1,3.0
4,,373450,A,0,


## Step 2 (Construct Categorical Column):

```python
numpy.where(condition, [x, y, ]/)
```
```
Parameters:
  condition: array_like, bool
    Where True, yield x, otherwise yield y.

  x, y: array_like
    Values from which to choose. x, y and
    condition need to be broadcastable to some shape.
```

In [24]:
# Rows whose value could not be parsed as a number in the previous step.
is_non_numeric = df['new_number'].isna()

# Keep the original text for those rows only; every numeric row gets NaN.
df['category'] = np.where(is_non_numeric, df['number'], np.nan)
df.head()

Unnamed: 0,Cabin,Ticket,number,Survived,new_number,category
0,,A/5 21171,5,0,5.0,
1,C85,PC 17599,3,1,3.0,
2,,STON/O2. 3101282,6,1,6.0,
3,C123,113803,3,1,3.0,
4,,373450,A,0,,A


# 2. If the goal is to extract numerical parts from strings:
```python
import pandas as pd

data = ['A75', 'B56', 'C89']
df = pd.DataFrame({'mixed_data': data})

# Capture the first run of digits in each string (still text at this point),
# then convert that text to a numeric dtype.
digit_text = df['mixed_data'].str.extract(r'(\d+)', expand=False)
df['numbers'] = pd.to_numeric(digit_text)
```

## Apply on Cabin

In [37]:
cabin = df['Cabin']

# Numeric part: the first run of digits in the cabin code, converted from text
# to a numeric value. Rows without a cabin stay NaN.
cabin_digits = cabin.str.extract(r'(\d+)', expand=False)
df['cabin_number'] = pd.to_numeric(cabin_digits, downcast='integer')

# Letter part: the first character of the cabin code.
df['cabin_cat'] = cabin.str[0]

df.head()

Unnamed: 0,Cabin,Ticket,number,Survived,new_number,category,cabin_number,cabin_cat
0,,A/5 21171,5,0,5.0,,,
1,C85,PC 17599,3,1,3.0,,85.0,C
2,,STON/O2. 3101282,6,1,6.0,,,
3,C123,113803,3,1,3.0,,123.0,C
4,,373450,A,0,,A,,


In [25]:
# Preview the first five rows.
# NOTE(review): this cell's execution count (25) is lower than the preceding
# cell's (37), and its recorded output lacks the cabin_* columns — re-run the
# notebook top to bottom so the displayed output matches the code order.
df.head(n=5)

Unnamed: 0,Cabin,Ticket,number,Survived,new_number,category
0,,A/5 21171,5,0,5.0,
1,C85,PC 17599,3,1,3.0,
2,,STON/O2. 3101282,6,1,6.0,
3,C123,113803,3,1,3.0,
4,,373450,A,0,,A


## Apply on `Ticket`
When the data is space-separated
> using `split()`

In [41]:
# Split every ticket on whitespace once and reuse the tokens for both columns.
# The vectorised .str accessor propagates missing values as NaN, whereas
# calling x.split() inside apply() raises on a missing (float NaN) ticket and
# indexing the result raises on an empty string.
ticket_tokens = df['Ticket'].str.split()

# Numeric part: the last token. errors='coerce' turns a non-numeric last token
# into NaN instead of raising.
df['ticket_num'] = pd.to_numeric(ticket_tokens.str[-1], errors='coerce', downcast='integer')

# Category part: the first token (index 0). When the ticket consists only of a
# number, the first token is that number itself, so it is blanked to NaN.
first_token = ticket_tokens.str[0]
is_plain_number = first_token.str.isdigit().eq(True)  # missing tokens compare as False
df['ticket_cat'] = first_token.mask(is_plain_number)

df.head(5)

Unnamed: 0,Cabin,Ticket,number,Survived,new_number,category,cabin_number,cabin_cat,ticket_num,ticket_cat
0,,A/5 21171,5,0,5.0,,,,21171.0,A/5
1,C85,PC 17599,3,1,3.0,,85.0,C,17599.0,PC
2,,STON/O2. 3101282,6,1,6.0,,,,3101282.0,STON/O2.
3,C123,113803,3,1,3.0,,123.0,C,113803.0,
4,,373450,A,0,,A,,,373450.0,
