In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [27]:
# Read the Titanic passenger CSV from its GitHub-hosted URL into a DataFrame.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
titanic_data = pd.read_csv(filepath_or_buffer=url)

In [28]:
titanic_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [29]:
type(titanic_data)

pandas.core.frame.DataFrame

*   Place these four columns in a variable X - **'Pclass', 'Sex', 'Age', 'Fare'**
*   Place this column in a variable y - **'Survived'**

In [30]:
# Columns used as model inputs (the next cell repeats this assignment).
features = [
    'Pclass',
    'Sex',
    'Age',
    'Fare',
]

In [31]:
# Name the label column and the input columns, then slice both out of the frame.
target = 'Survived'
features = ['Pclass', 'Sex', 'Age', 'Fare']
y = titanic_data[target]    # single label -> Series
X = titanic_data[features]  # list of labels -> DataFrame

In [32]:
# The two forms below are equivalent

# features = ['Pclass', 'Sex', 'Age', 'Fare']
# X = titanic_data[features]

# X = titanic_data[['Pclass', 'Sex', 'Age', 'Fare']]

In [33]:
# The two forms below are equivalent

# target = 'Survived'
# y = titanic_data[target]

# y = titanic_data['Survived']

In [34]:
# Double brackets select a one-column DataFrame rather than a Series.
# NOTE(review): this rebinds y, so every later cell (including the train/test
# split and model fit) sees the DataFrame form — confirm this is intended.
y = titanic_data[['Survived']]
y

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
886,0
887,1
888,0
889,1


In [35]:
type (y)

pandas.core.frame.DataFrame

In [36]:
X

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,22.0,7.2500
1,1,female,38.0,71.2833
2,3,female,26.0,7.9250
3,1,female,35.0,53.1000
4,3,male,35.0,8.0500
...,...,...,...,...
886,2,male,27.0,13.0000
887,1,female,19.0,30.0000
888,3,female,,23.4500
889,1,male,26.0,30.0000


In [37]:
type(X)

pandas.core.frame.DataFrame

In [38]:
type(X['Age'])

pandas.core.series.Series

In [39]:
X['Age']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

In [40]:
y

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
886,0
887,1
888,0
889,1


In [41]:
type(y)

pandas.core.frame.DataFrame

In [42]:
# Group the input columns by the preprocessing each group will receive.
categorical_features = ['Pclass', 'Sex']
numeric_features = ['Age', 'Fare']

In [43]:
numeric_features

['Age', 'Fare']

In [44]:
categorical_features

['Pclass', 'Sex']

In [45]:
X[categorical_features]

Unnamed: 0,Pclass,Sex
0,3,male
1,1,female
2,3,female
3,1,female
4,3,male
...,...,...
886,2,male
887,1,female
888,3,female
889,1,male


In [46]:
X[categorical_features].nunique()

Pclass    3
Sex       2
dtype: int64

In [47]:
# Column-wise preprocessing:
#   'num' - fill missing numeric values with the column mean
#   'cat' - expand each categorical column into one-hot indicator columns
preprocessor = ColumnTransformer([
    ('num', SimpleImputer(strategy='mean'), numeric_features),
    ('cat', OneHotEncoder(), categorical_features),
])

In [48]:
# Fit the preprocessor and build the numeric feature matrix.
# The input is re-selected from titanic_data instead of reading X, so this cell
# can be re-run safely: once X has been replaced by the transformed array it no
# longer carries the column names that the transformer selects by.
# NOTE(review): the imputer mean and encoder categories are learned from all
# rows, before the train/test split below — consider fitting on the training
# rows only to avoid leaking test-set statistics.
X = preprocessor.fit_transform(titanic_data[features])
X

array([[22.        ,  7.25      ,  0.        , ...,  1.        ,
         0.        ,  1.        ],
       [38.        , 71.2833    ,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [26.        ,  7.925     ,  0.        , ...,  1.        ,
         1.        ,  0.        ],
       ...,
       [29.69911765, 23.45      ,  0.        , ...,  1.        ,
         1.        ,  0.        ],
       [26.        , 30.        ,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       [32.        ,  7.75      ,  0.        , ...,  1.        ,
         0.        ,  1.        ]])

In [49]:
X[0]

array([22.  ,  7.25,  0.  ,  0.  ,  1.  ,  0.  ,  1.  ])

In [50]:
X[1]

array([38.    , 71.2833,  1.    ,  0.    ,  0.    ,  1.    ,  0.    ])

In [None]:
X.shape

(891, 7)

In [None]:
X[0]

array([22.  ,  7.25,  0.  ,  0.  ,  1.  ,  0.  ,  1.  ])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Build a seeded decision tree (no depth limit is passed) and fit it on the
# training rows.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X=X_train, y=y_train)

In [None]:
# Make predictions on the training set (the same rows the tree was fitted on).
# Training-specific names are used so these values are not confused with, or
# silently overwritten by, the test-set evaluation in the next cell.
y_train_pred = clf.predict(X_train)

# Accuracy on the training rows: a baseline to compare with the test accuracy
# when judging overfitting.
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Training Accuracy:", train_accuracy)

Training Accuracy: 0.9789325842696629


In [None]:
# Predict labels for the held-out rows.
y_pred = clf.predict(X_test)

# Share of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.7486033519553073
