In [1]:
import pandas as pd
import pandas_profiling as pp

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from src.util import set_context, raw_path, comp_path, reduce_mem_usage

In [4]:
set_context("titanic")

Files in data directory:
______

titanic/
    raw/
        train.csv
        test.csv
        gender_submission.csv
______



In [5]:
tr = pd.read_csv(raw_path("train.csv"))
te = pd.read_csv(raw_path("test.csv"))
tr.shape, te.shape

((891, 12), (418, 11))

In [6]:
col_diff = set(tr.columns).difference(te.columns)
assert len(col_diff) == 1
target_col = col_diff.pop()
target_col

'Survived'

In [7]:
tr["_test"] = False
te["_test"] = True

In [8]:
df = pd.concat([tr, te], sort=True)
df.shape

(1309, 13)

In [9]:
df.sample(10).sort_values("_test")

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,_test
584,,,C,8.7125,"Paulner, Mr. Uscher",0,585,3,male,0,0.0,3411,False
252,62.0,C87,S,26.55,"Stead, Mr. William Thomas",0,253,1,male,0,0.0,113514,False
888,,,S,23.45,"Johnston, Miss. Catherine Helen ""Carrie""",2,889,3,female,1,0.0,W./C. 6607,False
482,50.0,,S,8.05,"Rouse, Mr. Richard Henry",0,483,3,male,0,0.0,A/5 3594,False
30,40.0,,C,27.7208,"Uruchurtu, Don. Manuel E",0,31,1,male,0,0.0,PC 17601,False
850,4.0,,S,31.275,"Andersson, Master. Sigvard Harald Elias",2,851,3,male,4,0.0,347082,False
275,20.0,,S,26.0,"Bryhl, Miss. Dagmar Jenny Ingeborg",0,1167,2,female,1,,236853,True
300,32.0,,S,7.775,"Olsson, Mr. Oscar Wilhelm",0,1192,3,male,0,,347079,True
293,53.0,A34,S,81.8583,"Dodge, Dr. Washington",1,1185,1,male,1,,33638,True
36,,,S,8.05,"Roth, Miss. Sarah A",0,928,3,female,0,,342712,True


In [10]:
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 13 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
_test          1309 non-null bool
dtypes: bool(1), float64(3), int64(4), object(5)
memory usage: 484.6 KB


In [11]:
df = reduce_mem_usage(df)

Mem. usage decreased to  0.07 Mb (42.9% reduction)


In [12]:
obj_cols = df.select_dtypes("object").columns.tolist()
for col in obj_cols:
    print(f"{col}: {df[col].nunique()} unique values")
    df[col] = df[col].astype("category")

Cabin: 186 unique values
Embarked: 3 unique values
Name: 1307 unique values
Sex: 2 unique values
Ticket: 929 unique values


In [13]:
df.describe()

Unnamed: 0,Age,Fare,Parch,PassengerId,Pclass,SibSp,Survived
count,1046.0,1308.0,1309.0,1309.0,1309.0,1309.0,891.0
mean,29.875,33.28125,0.385027,655.0,2.294882,0.498854,0.383789
std,14.414062,inf,0.86556,378.020061,0.837836,1.041658,0.486572
min,0.170044,0.0,0.0,1.0,1.0,0.0,0.0
25%,21.0,7.894531,0.0,328.0,2.0,0.0,0.0
50%,28.0,14.453125,0.0,655.0,3.0,0.0,0.0
75%,39.0,31.28125,0.0,982.0,3.0,1.0,1.0
max,80.0,512.5,9.0,1309.0,3.0,8.0,1.0


In [14]:
# pp.ProfileReport(df)

In [15]:
df.to_pickle(comp_path("clean.pkl"))