In [1]:
import pandas as pd
import pandas_profiling as pp

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from src.util import set_context, raw_path, comp_path, reduce_mem_usage

In [4]:
set_context("titanic")

Files in data directory:
______

titanic/
    clean.pkl
    raw/
        gender_submission.csv
        test.csv
        train.csv
______



In [5]:
tr = pd.read_csv(raw_path("train.csv"))
te = pd.read_csv(raw_path("test.csv"))
tr.shape, te.shape

((891, 12), (418, 11))

In [6]:
col_diff = set(tr.columns).difference(te.columns)
assert len(col_diff) == 1
target_col = col_diff.pop()
target_col

'Survived'

In [7]:
tr["_test"] = False
te["_test"] = True

In [8]:
df = pd.concat([tr, te], sort=True)
df.shape

(1309, 13)

In [9]:
df.sample(10).sort_values("_test")

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,_test
642,2.0,,S,27.9,"Skoog, Miss. Margit Elizabeth",2,643,3,female,3,0.0,347088,False
30,40.0,,C,27.7208,"Uruchurtu, Don. Manuel E",0,31,1,male,0,0.0,PC 17601,False
433,17.0,,S,7.125,"Kallio, Mr. Nikolai Erland",0,434,3,male,0,0.0,STON/O 2. 3101274,False
479,2.0,,S,12.2875,"Hirvonen, Miss. Hildur E",1,480,3,female,0,1.0,3101298,False
437,24.0,,S,18.75,"Richards, Mrs. Sidney (Emily Hocking)",3,438,2,female,2,1.0,29106,False
570,62.0,,S,10.5,"Harris, Mr. George",0,571,2,male,0,1.0,S.W./PP 752,False
667,,,S,7.775,"Rommetvedt, Mr. Knud Paust",0,668,3,male,0,0.0,312993,False
154,13.0,,S,31.3875,"Asplund, Master. Filip Oscar",2,1046,3,male,4,,347077,True
362,31.0,,S,21.0,"Ware, Mrs. John James (Florence Louise Long)",0,1254,2,female,0,,CA 31352,True
193,61.0,,Q,12.35,"Lingane, Mr. John",0,1085,2,male,0,,235509,True


In [10]:
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 13 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
_test          1309 non-null bool
dtypes: bool(1), float64(3), int64(4), object(5)
memory usage: 479.5 KB


In [11]:
df = reduce_mem_usage(df)

Mem. usage decreased to  0.07 Mb (42.9% reduction)


In [12]:
obj_cols = df.select_dtypes("object").columns.tolist()
for col in obj_cols:
    print(f"{col}: {df[col].nunique()} unique values")
    df[col] = df[col].astype("category")

Cabin: 186 unique values
Embarked: 3 unique values
Name: 1307 unique values
Sex: 2 unique values
Ticket: 929 unique values


In [13]:
df.describe()

Unnamed: 0,Age,Fare,Parch,PassengerId,Pclass,SibSp,Survived
count,1046.0,1308.0,1309.0,1309.0,1309.0,1309.0,891.0
mean,29.875,33.28125,0.385027,655.0,2.294882,0.498854,0.383789
std,14.414062,inf,0.86556,378.020061,0.837836,1.041658,0.486572
min,0.170044,0.0,0.0,1.0,1.0,0.0,0.0
25%,21.0,7.894531,0.0,328.0,2.0,0.0,0.0
50%,28.0,14.453125,0.0,655.0,3.0,0.0,0.0
75%,39.0,31.28125,0.0,982.0,3.0,1.0,1.0
max,80.0,512.5,9.0,1309.0,3.0,8.0,1.0


In [14]:
# pp.ProfileReport(df)

In [15]:
df.to_pickle(comp_path("clean.pkl"))