In [1]:
# Enable IPython's autoreload extension. Mode 2 re-imports all modules before
# each cell runs, so edits to imported source files take effect without a
# kernel restart. `%reload_ext` (rather than `%load_ext`) keeps this cell safe
# to re-run when the extension is already loaded.
%reload_ext autoreload
%autoreload 2

# Setup

In [2]:
import logging

import numpy as np
import pandas as pd

# Root logger at INFO: the dascripts.data_processing messages emitted by
# merge() further down are INFO-level and would be hidden otherwise.
logging.basicConfig(level=logging.INFO)

# NOTE(review): wildcard import; per the banner it prints, this brings in
# decorate, hist, scatter, merge and DFEncoder. Consider explicit names.
from dascripts import *


Hello from dascripts! You have imported the following functions:
- Plot: decorate, hist, scatter
- Data processing: merge, DFEncoder
Have fun with your data analysis!



# DFEncoder Testing and Examples

In [3]:
# Load the Titanic passenger table; the path is resolved relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../inputs/titanic.csv")

In [4]:
# Preview the first five raw rows before any encoding is applied.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Build a single DFEncoder that ordinal-encodes Cabin and Embarked and
# one-hot-encodes Pclass and Sex, then fit it on the full passenger frame.
encoders = DFEncoder(
    ord_cols=["Cabin", "Embarked"],
    ohe_cols=["Pclass", "Sex"],
).fit(df)

# Report which columns have a fitted encoder of each kind.
print(
    "Encoder fitted successfully!",
    f"Ordinal encoders: {list(encoders.ord_encoders.keys())}",
    f"One-hot encoders: {list(encoders.ohe_encoders.keys())}",
    sep="\n",
)

Encoder fitted successfully!
Ordinal encoders: ['Cabin', 'Embarked']
One-hot encoders: ['Pclass', 'Sex']


In [6]:
# Apply the fitted encoders; the result is kept under a new name so the raw
# frame stays available for the later cells.
df_encoded = encoders.transform(df)

# Compare shapes before and after encoding.
print(
    f"Original shape: {df.shape}",
    f"Encoded shape: {df_encoded.shape}",
    sep="\n",
)

Original shape: (891, 12)
Encoded shape: (891, 19)


In [7]:
# Print a label, then let the notebook render the first five encoded rows as
# the cell's output (original values are kept in the *_old columns).
print("Encoded DataFrame:")
df_encoded.head(n=5)

Encoded DataFrame:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_old,Embarked_old,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,-1,2,,S,0,0,1,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,81,0,C85,C,1,0,0,1,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,-1,2,,S,0,0,1,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,55,2,C123,S,1,0,0,1,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,-1,2,,S,0,0,1,0,1


# Join

In [8]:
# Inputs for the join demo: `a` is the first five rows of the passenger
# table, `b` is the last five.
a, b = df.head(n=5), df.tail(n=5)

In [9]:
# Display the left-hand join input (first five passengers).
a

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
# Display the right-hand join input (last five passengers).
b

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [11]:
# Inner-join the two five-row frames on PassengerId. `a` holds ids 1-5 and `b`
# holds ids 887-891, so no key matches and the merged frame has zero rows;
# what is left to inspect is the INFO log merge emits (input shapes, columns,
# operation, suffixes, merged shape).
# NOTE(review): only left_on is given; judging by the single un-suffixed
# PassengerId column in the logged result, merge appears to reuse the key for
# the right side as well - confirm against dascripts.data_processing.
merge(a, b, how="inner", left_on=["PassengerId"])

INFO:dascripts.data_processing:Left: (5, 12)
INFO:dascripts.data_processing:Right: (5, 12)
INFO:dascripts.data_processing:Left columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
INFO:dascripts.data_processing:Right columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
INFO:dascripts.data_processing:Merge operation: INNER
INFO:dascripts.data_processing:Merge suffixes: ('_left', '_right')
INFO:dascripts.data_processing:Merged dataframe: (0, 23)
INFO:dascripts.data_processing:Merged columns: ['PassengerId', 'Survived_left', 'Pclass_left', 'Name_left', 'Sex_left', 'Age_left', 'SibSp_left', 'Parch_left', 'Ticket_left', 'Fare_left', 'Cabin_left', 'Embarked_left', 'Survived_right', 'Pclass_right', 'Name_right', 'Sex_right', 'Age_right', 'SibSp_right', 'Parch_right', 'Ticket_right', 'Fare_right', 'Cabin_right', 'Embarked_right']
INFO:dascripts.data_

Unnamed: 0,PassengerId,Survived_left,Pclass_left,Name_left,Sex_left,Age_left,SibSp_left,Parch_left,Ticket_left,Fare_left,...,Pclass_right,Name_right,Sex_right,Age_right,SibSp_right,Parch_right,Ticket_right,Fare_right,Cabin_right,Embarked_right
