In [1]:
import pandas as pd

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
test_ids = test['PassengerId']

In [3]:
print(train.shape)
print(test.shape)

(8693, 14)
(4277, 13)


In [4]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [5]:
train.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

In [6]:
def clean(df: pd.DataFrame):
    df = df.drop(['PassengerId', 'Cabin', 'Name'], axis=1)
    cats = ['HomePlanet', 'Destination']
    to_num = ['CryoSleep', 'VIP']
    for cat in cats:
        df[cat] = le.fit_transform(df[cat])
        print(le.classes_)
    for col in to_num:
        df[col] = df[col].astype(bool).astype(int)
    df = df.fillna(df.median())
    return df

In [7]:
train = clean(train)
test = clean(test)

['Earth' 'Europa' 'Mars' nan]
['55 Cancri e' 'PSO J318.5-22' 'TRAPPIST-1e' nan]
['Earth' 'Europa' 'Mars' nan]
['55 Cancri e' 'PSO J318.5-22' 'TRAPPIST-1e' nan]


# Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [9]:
X = train.drop('Transported', axis=1)
y = train['Transported']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
clf = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)
predictions = clf.predict(X_val)
from sklearn.metrics import accuracy_score
accuracy_score(y_val, predictions)

0.772857964347326

In [11]:
submission = clf.predict(test)
df = pd.DataFrame({'PassengerId': test_ids.values,
                    'Transported': submission,})
df['Transported'] = df['Transported'].astype(bool)
df.head(5)

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False


In [12]:
df.to_csv('PabloSegovia_SpaceshipTitanic.csv', index=False)

For LogisticRegression the Kaggle score was: 0.78700

# Support Vector Machines

In [13]:
from sklearn import svm

In [15]:
# Usage of SVClassifier
svc = svm.SVC()
svc.fit(X_train, y_train)

SVC()

In [16]:
predictions = svc.predict(X_val)

In [17]:
accuracy_score(y_val, predictions)

0.7734330074755607

In [18]:
submission = svc.predict(test)
df = pd.DataFrame({'PassengerId': test_ids.values,
                    'Transported': submission,})
df['Transported'] = df['Transported'].astype(bool)
df.head(5)

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [19]:
df.to_csv('PabloSegovia_SpaceshipTitanic_SVC.csv', index=False)

For SVC the Kaggle score was: 0.79003

### Nu

In [23]:
def df_to_submission(clsf):
    clsf.fit(X_train, y_train)
    predictions = clsf.predict(X_val)
    print('Accuracy Score: ', accuracy_score(y_val, predictions))
    submission = clsf.predict(test)
    df = pd.DataFrame({'PassengerId': test_ids.values,
                    'Transported': submission,})
    df['Transported'] = df['Transported'].astype(bool)
    return df

# LinearSVC

In [22]:
from sklearn.svm import LinearSVC
lsvc = LinearSVC(random_state=42, tol=1e-05)

In [24]:
df = df_to_submission(lsvc)

Accuracy Score:  0.730879815986199




Worse than the last two attempts. No submission for it.

# K-Nearest neighbors

In [25]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)

In [26]:
df = df_to_submission(knn)

Accuracy Score:  0.7751581368602645


In [27]:
df.to_csv('PabloSegovia_SpaceshipTitanic_KNN.csv', index=False)

Kaggle message: Your submission scored 0.78092, which is not an improvement of your previous score. Keep trying!

May be the n_neighbors has a change in the accuracy score:

In [29]:
for i in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    predictions = knn.predict(X_val)
    print(f'Accuracy Score for {i}_neighbors: ', accuracy_score(y_val, predictions))

Accuracy Score for 1_neighbors:  0.7119033927544566
Accuracy Score for 2_neighbors:  0.7251293847038528
Accuracy Score for 3_neighbors:  0.7521564117308798
Accuracy Score for 4_neighbors:  0.7487061529614721
Accuracy Score for 5_neighbors:  0.7751581368602645
Accuracy Score for 6_neighbors:  0.7722829212190915
Accuracy Score for 7_neighbors:  0.7757331799884991
Accuracy Score for 8_neighbors:  0.7694077055779184
Accuracy Score for 9_neighbors:  0.78205865439908
Accuracy Score for 10_neighbors:  0.7791834387579069
Accuracy Score for 11_neighbors:  0.7780333525014376
Accuracy Score for 12_neighbors:  0.7711328349626222
Accuracy Score for 13_neighbors:  0.7751581368602645
Accuracy Score for 14_neighbors:  0.7751581368602645
Accuracy Score for 15_neighbors:  0.777458309373203
Accuracy Score for 16_neighbors:  0.7757331799884991
Accuracy Score for 17_neighbors:  0.7780333525014376
Accuracy Score for 18_neighbors:  0.7734330074755607
Accuracy Score for 19_neighbors:  0.7763082231167338


In [30]:
# Best accuracy score was achieved using n_neighbors = 9
knn = KNeighborsClassifier(n_neighbors=9)
df = df_to_submission(knn)

Accuracy Score:  0.78205865439908


In [31]:
df.to_csv('PabloSegovia_SpaceshipTitanic_KNN.csv', index=False)

kaggle score: Your submission scored 0.78489, which is not an improvement of your previous score. Keep trying!

# Decision Tree Classifier

In [32]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()

In [33]:
df = df_to_submission(dtc)

Accuracy Score:  0.7343300747556066


Worse accuracy_score