In [34]:
import pandas as pd
import numpy as np
# Load both CSVs with the same reader call so the two frames are parsed identically.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))



Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [85]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler


def clean_data(data, transformer=None):
    """Convert a raw passenger frame into a fully numeric feature frame.

    The 'PassengerId', 'Name' and 'Ticket' columns are dropped (they must be
    present in ``data``). If a 'Survived' column exists it is split off as the
    target. Numeric columns are median-imputed and standardized; object
    columns are imputed with the constant 'missing' and one-hot encoded.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw input frame.
    transformer : ColumnTransformer or None, default None
        When None, a new transformer is built from the columns of ``data`` and
        fitted on it. Otherwise the given, already fitted transformer is only
        applied, so that e.g. test data is processed with the statistics and
        categories learned on the training data. It must contain a 'num'
        entry and a 'cat' pipeline with an 'onehot' step, as built here.

    Returns
    -------
    X_processed_df : pandas.DataFrame
        Dense feature frame sharing the row index of ``data``. Numeric columns
        keep their original names; one-hot columns use the encoder's default
        output names.
    y : pandas.Series or None
        The 'Survived' column, or None when ``data`` does not have one.
    transformer : ColumnTransformer
        The fitted transformer (newly created or the one passed in).
    """
    # Take only helpful data
    data_hlpfl = data.drop(columns=['PassengerId', 'Name', 'Ticket'])

    if 'Survived' in data.columns:
        y = data_hlpfl['Survived']
        X = data_hlpfl.drop('Survived', axis=1)
    else:
        y = None
        X = data_hlpfl

    if transformer is None:
        # The column split and pipelines are only needed when fitting; a
        # supplied transformer already carries its own column selection.
        numeric_features = X.select_dtypes(include=[np.number]).columns
        categorical_features = X.select_dtypes(include=[object]).columns

        numeric_transformer = Pipeline(
            steps=[
                ("impute", SimpleImputer(strategy="median")),
                ("standardize", StandardScaler()),
            ]
        )
        categorical_transformer = Pipeline(
            steps=[
                ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                ('onehot', OneHotEncoder(handle_unknown='ignore')),
            ]
        )
        transformer = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features),
            ]
        )
        X_processed = transformer.fit_transform(X)
    else:
        X_processed = transformer.transform(X)

    # Name the columns after what the *fitted* transformer emits, not after
    # the dtypes of the frame currently being transformed, so the labels
    # always match the produced matrix.
    fitted_columns = {name: columns for name, _, columns in transformer.transformers_}
    onehot = transformer.named_transformers_['cat'].named_steps['onehot']
    feature_names = list(fitted_columns['num']) + onehot.get_feature_names_out().tolist()

    # ColumnTransformer returns a sparse matrix or a dense array depending on
    # the density of the result; only the sparse case needs converting.
    if hasattr(X_processed, 'toarray'):
        X_processed = X_processed.toarray()

    # Reuse the input index so the features stay row-aligned with ``y``.
    X_processed_df = pd.DataFrame(X_processed, columns=feature_names, index=X.index)

    return X_processed_df, y, transformer
    

from sklearn.linear_model import LogisticRegression

# Fit the preprocessing on the training frame, then reuse the fitted
# transformer on the test frame so both share the same feature columns.
X_train, y_train, transformer = clean_data(train_data)
# y_test is None whenever test_data carries no 'Survived' column.
X_test, y_test, transformer = clean_data(test_data, transformer=transformer)

# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
X_train.info()
print('--------------------------------')
X_test.info()

model = LogisticRegression(random_state=0)
model.fit(X_train, y_train)

prediction = model.predict(X_test)

# Pair each prediction with its passenger id and write the result to disk.
combined_df = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': prediction})
combined_df.to_csv("predictions.csv", index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Columns: 159 entries, Pclass to x2_missing
dtypes: float64(159)
memory usage: 1.1 MB
None
--------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Columns: 159 entries, Pclass to x2_missing
dtypes: float64(159)
memory usage: 519.4 KB
None
