In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.metrics import mean_squared_error
warnings.filterwarnings('ignore')

### Leer dataset

data = pd.read_csv('Titanic.csv')

data



Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [67]:
# Separate features (X) and target variable (y)
data.dropna(inplace=True)
X = data.drop(['PassengerId', 'Pclass', 'Name', 'Survived', 'Fare','SibSp', 'Parch', 'Ticket', 'Cabin','Embarked' ], axis=1)
y = data['Survived']

In [68]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 183 entries, 1 to 889
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Sex     183 non-null    object 
 1   Age     183 non-null    float64
dtypes: float64(1), object(1)
memory usage: 4.3+ KB


In [69]:
### Split de los datos

from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Enfoque con ONE-HOT ENCODER

In [70]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Define the columns transformer for one-hot encoding
categorical_cols = ['Sex'] # is a list containing the names of columns that are categorical and need one-hot encoding.
numeric_cols = ['Age'] # is a list containing the names of numerical columns.

preprocessor_onehot = ColumnTransformer( #  is a class that allows you to selectively apply transformers to different subsets of the columns in your data.
    transformers=[
        ('num', 'passthrough', numeric_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ])
# "transformers" is a list of tuples where each tuple contains:
#       A name for the transformation ('num' and 'cat' in this case).
#       The transformer to be applied.
#       The subset of columns to which the transformer should be applied.
# For numerical columns ('num'), the transformer is set to 'passthrough', which means those columns will be left unchanged.
# For categorical columns ('cat'), the transformer is set to OneHotEncoder(), which will perform one-hot encoding on those columns.

In [71]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# A pipeline is used to streamline a lot of routine processes,
# allowing you to specify a series of data transformations and a final estimator (model) in a single object

# Create a pipeline with preprocessing and linear regression

model_onehot = Pipeline([
    ('preprocessor', preprocessor_onehot),
    ('regressor', LogisticRegression())
])

# The first element of each tuple is a string identifier for the step,
# and the second element is the object that performs the transformation or modeling.


# When you later call model.fit(X_train, y_train), it will apply the preprocessing steps specified in 'preprocessor' to X_train,
# and then fit the linear regression model (LinearRegression()) to the transformed features.

# When you call model.predict(X_test), it will first transform X_test using the same preprocessing steps,
# and then use the linear regression model to make predictions on the transformed features.

In [72]:
# Fit the model
model_onehot.fit(X_train, y_train)
preds_onehot = model_onehot.predict(X_test)

# Evaluate the model on the test set
score_onehot = model_onehot.score(X_test, y_test)


print(mean_squared_error(y_test, preds_onehot))
print(f'R-squared score on test set: {score_onehot:.4f}')

0.21621621621621623
R-squared score on test set: 0.7838


### Enfoque con LABEL ENCODER

In [73]:
# Define a mapping dictionary
gender_mapping = {'male': 0, 'female': 1}

# Apply the mapping to the 'gender' column
X['Sex_encoded'] = X['Sex'].map(gender_mapping)

X

Unnamed: 0,Sex,Age,Sex_encoded
1,female,38.0,1
3,female,35.0,1
6,male,54.0,0
10,female,4.0,1
11,female,58.0,1
...,...,...,...
871,female,47.0,1
872,male,33.0,0
879,female,56.0,1
887,female,19.0,1


In [74]:
X.drop(['Sex'], axis=1, inplace=True)

In [75]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train

Unnamed: 0,Age,Sex_encoded
331,45.5,0
336,29.0,0
193,3.0,0
75,25.0,0
248,37.0,0
...,...,...
520,30.0,1
92,46.0,0
460,48.0,0
872,33.0,0


In [76]:
# Fit the model
model_labelEncoder.fit(X_train, y_train)
preds_labelEncoder = model_labelEncoder.predict(X_test)

# Evaluate the model on the test set
score_labelencoder = model_labelEncoder.score(X_test, y_test)


print(mean_squared_error(y_test, preds_labelEncoder))
print(f'R-squared score on test set: {score_labelencoder:.4f}')

ValueError: A given column is not a column of the dataframe