In [1]:
import pandas as pd
import os

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
data_folder = "Data"

In [3]:
train_path = os.path.join(data_folder, "train.csv")
train_data = pd.read_csv(train_path)
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [14]:
test_path = os.path.join(data_folder, "test.csv")
test_data = pd.read_csv(test_path)
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
gender_path = os.path.join(data_folder, "gender_submission.csv")
gender_submission = pd.read_csv(gender_path)
gender_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


### GridSearchCV

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Define the target and features as per the original code
y = train_data["Survived"]
features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])

# Splitting the training data into training and validation sets
train_features, val_features, train_labels, val_labels = train_test_split(
    X, y, test_size=0.2, random_state=1)

# Initialize and train the model
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(train_features, train_labels)

# Make predictions on the validation set
val_predictions = model.predict(val_features)

# Calculate and print accuracy
accuracy = accuracy_score(val_labels, val_predictions)
accuracy



0.7597765363128491

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

def train_and_evaluate(X_train, y_train):
    # Example parameter grid
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [5, 8, 15],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 5]
    }

    # Create a RandomForestClassifier
    rf = RandomForestClassifier()

    # Setup GridSearchCV
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                            cv=5, n_jobs=-1, scoring='accuracy', verbose=2)

    # Fit grid_search to the data
    grid_search.fit(X_train, y_train)

    # Get the best parameters and score
    best_parameters = grid_search.best_params_
    best_score = grid_search.best_score_

    return best_parameters, best_score


In [8]:
best_parameters, best_score = train_and_evaluate(X, y)

print("Best Parameters:", best_parameters)
print("Best Score:", best_score)


Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Parameters: {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}
Best Score: 0.8036281463812692


### TPOT

In [9]:
import pandas as pd
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split




In [23]:
train_data = pd.read_csv(train_path)

train_data.fillna(method='ffill', inplace=True)

# X = train_data.drop('Survived', axis=1)
features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
y = train_data['Survived']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, random_state=42)
tpot.fit(X_train, y_train)

tpot.score(X_valid, y_valid)

  train_data.fillna(method='ffill', inplace=True)


                                                                            
Generation 1 - Current best internal CV score: -inf
                                                                            

RuntimeError: There was an error in the TPOT optimization process. This could be because the data was not formatted properly, or because data for a regression problem was provided to the TPOTClassifier object. Please make sure you passed the data to TPOT correctly. If you enabled PyTorch estimators, please check the data requirements in the online documentation: https://epistasislab.github.io/tpot/using/

In [22]:
print(X_train.dtypes)
print(X_valid.dtypes)


Pclass        int64
SibSp         int64
Parch         int64
Sex_female     bool
Sex_male       bool
dtype: object
Pclass        int64
SibSp         int64
Parch         int64
Sex_female     bool
Sex_male       bool
dtype: object


## Test

In [None]:
X_test = pd.get_dummies(test_data[features])
y_pred = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': y_pred})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")