In [1]:
import pandas as pd
import pandas_profiling as pp

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from src.util import set_context, raw_path, comp_path

In [4]:
set_context("titanic")

Files in data directory:
______

titanic/
    clean.pkl
    raw/
        gender_submission.csv
        test.csv
        train.csv
______



In [5]:
tr = pd.read_csv(raw_path("train.csv"))
te = pd.read_csv(raw_path("test.csv"))
tr.shape, te.shape

((891, 12), (418, 11))

In [6]:
col_diff = set(tr.columns).difference(te.columns)
assert len(col_diff) == 1
target_col = col_diff.pop()
target_col

'Survived'

In [7]:
tr["_test"] = False
te["_test"] = True

In [8]:
df = pd.concat([tr, te], sort=True)
df.shape

(1309, 13)

In [9]:
df.sample(10).sort_values("_test")

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,_test
888,,,S,23.45,"Johnston, Miss. Catherine Helen ""Carrie""",2,889,3,female,1,0.0,W./C. 6607,False
220,16.0,,S,8.05,"Sunderland, Mr. Victor Francis",0,221,3,male,0,1.0,SOTON/OQ 392089,False
20,35.0,,S,26.0,"Fynney, Mr. Joseph J",0,21,2,male,0,0.0,239865,False
207,26.0,,C,18.7875,"Albimona, Mr. Nassef Cassem",0,208,3,male,0,1.0,2699,False
204,18.0,,S,8.05,"Cohen, Mr. Gurshon ""Gus""",0,205,3,male,0,1.0,A/5 3540,False
589,,,S,8.05,"Murdlin, Mr. Joseph",0,590,3,male,0,0.0,A./5. 3235,False
265,,,S,7.8958,"Lyntakoff, Mr. Stanko",0,1157,3,male,0,,349235,True
163,,,S,7.0,"Pearce, Mr. Ernest",0,1055,3,male,0,,343271,True
235,20.0,,S,7.8542,"Vendel, Mr. Olof Edvin",0,1127,3,male,0,,350416,True
97,29.0,,S,7.925,"Makinen, Mr. Kalle Edvard",0,989,3,male,0,,STON/O 2. 3101268,True


In [10]:
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 13 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
_test          1309 non-null bool
dtypes: bool(1), float64(3), int64(4), object(5)
memory usage: 479.5 KB


In [11]:
obj_cols = df.select_dtypes("object").columns.tolist()
for col in obj_cols:
    print(f"{col}: {df[col].nunique()} unique values")
    df[col] = df[col].astype("category")

Cabin: 186 unique values
Embarked: 3 unique values
Name: 1307 unique values
Sex: 2 unique values
Ticket: 929 unique values


In [12]:
df.describe()

Unnamed: 0,Age,Fare,Parch,PassengerId,Pclass,SibSp,Survived
count,1046.0,1308.0,1309.0,1309.0,1309.0,1309.0,891.0
mean,29.881138,33.295479,0.385027,655.0,2.294882,0.498854,0.383838
std,14.413493,51.758668,0.86556,378.020061,0.837836,1.041658,0.486592
min,0.17,0.0,0.0,1.0,1.0,0.0,0.0
25%,21.0,7.8958,0.0,328.0,2.0,0.0,0.0
50%,28.0,14.4542,0.0,655.0,3.0,0.0,0.0
75%,39.0,31.275,0.0,982.0,3.0,1.0,1.0
max,80.0,512.3292,9.0,1309.0,3.0,8.0,1.0


In [13]:
# pp.ProfileReport(df)

In [14]:
df.to_pickle(comp_path("clean.pkl"))