<a href="https://www.kaggle.com/code/joaopedromorais1989/spaceship-titanic?scriptVersionId=168155252" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

import warnings 
warnings.filterwarnings("ignore")
scale = StandardScaler()

def economic_status(person):
    # Creating new column
    peopleChoices = []
    peopleChoices.append(person.VIP == 0.0)
    peopleChoices.append(person.RoomService == 0.0)
    peopleChoices.append(person.FoodCourt == 0.0)
    peopleChoices.append(person.ShoppingMall == 0.0)
    peopleChoices.append(person.Spa == 0.0)
    peopleChoices.append(person.VRDeck == 0.0)
    
    count = peopleChoices.count(True)
    if count >=5:
        return 1
    elif count >= 3 and count < 5:
        return 2
    elif count >= 1 and count < 3:
        return 3
    else:
        return 0

def dropColumns(df):
    # Dropping Columns
    #df.drop("Age",axis=1,inplace=True)
    df.drop("PassengerId",axis=1,inplace=True)
    df.drop(["Name"],inplace=True,axis=1)
    df.drop("Cabin",axis=1,inplace=True)
    return df

def scaleData(df,isTrain):
    # Scaling data
    arrayOfColumns = ['RoomService', 'FoodCourt',"ShoppingMall","VRDeck","Age","Spa"]
    if(isTrain):
        df[arrayOfColumns] = scale.fit_transform(df[arrayOfColumns])
    else:
        df[arrayOfColumns] = scale.transform(df[arrayOfColumns])
    return df
    
def handleNa(df):
    # Filling and Droping NA
    df.Age.fillna(df.Age.mean(),inplace=True)
    df.Spa.fillna(df.Spa.mean(),inplace=True)
    df.RoomService.fillna(df.RoomService.mean(),inplace=True)
    df.FoodCourt.fillna(df.FoodCourt.mean(),inplace=True)
    df.ShoppingMall.fillna(df.ShoppingMall.mean(),inplace=True)
    df.VRDeck.fillna(df.VRDeck.mean(),inplace=True)
    df.VIP.fillna(df.VIP.mode().iat[0],inplace=True)
    df.Destination.fillna(df.Destination.mode().iat[0],inplace=True)
    df.HomePlanet.fillna(df.HomePlanet.mode().iat[0],inplace=True)
    df.CryoSleep.fillna(False,inplace=True)
    df.Cabin.fillna(df.Cabin.mode().iat[0],inplace=True)
    return df


def prepare(df,isTrain):
    df = handleNa(df)
    df = scaleData(df,isTrain)
    df["Economic_Class"] = df.apply(economic_status,axis=1)
    df["Deck"] = df["Cabin"].transform(lambda x: x.split("/")[0])
    df["Side"] = df["Cabin"].transform(lambda x: x.split("/")[2])
    df = dropColumns(df)
    df = pd.get_dummies(df, columns = ['HomePlanet', 'Destination',"Deck","Side"],  dtype=int, drop_first=True)
    return df

def saveFile(passengerIds,scores):
    # Saving the file
    passengerId = []
    survived = []
    i = 0
    for sc in scores:
        passengerId.append(passengerIds.iloc[i])
        survived.append(sc)
        i +=1
    list_of_tuples = list(zip(passengerId, survived)) 
    df = pd.DataFrame(list_of_tuples, columns=['PassengerId', 'Transported']) 
    print(df.describe())
    df.to_csv("submission.csv", index = False)
    print("Finished")
    
def gridSearch(x,y):
    # GRID SEARCH
    pipe = Pipeline([('clf', svm.SVC())])   
    search_space = [{
                        'clf': [LogisticRegression()],
                        'clf__penalty': ['l2'],
                        'clf__C': np.logspace(0, 4, 10)
                    },
                    {
                         'clf': [DecisionTreeClassifier()],
                        'clf__max_depth': range(3,60,2)
                    }]

    search = GridSearchCV(pipe, search_space, cv=10,verbose=3,error_score='raise')
    search.fit(train,train_y)
    print(pd.concat([pd.DataFrame(search.cv_results_["params"]),pd.DataFrame(search.cv_results_["mean_test_score"], columns=["Accuracy"])],axis=1))
    
train = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
test = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")

train = prepare(train,True)
train_y = train.Transported
train.drop("Transported",axis=1,inplace=True)
train.head()

testPassengerIds = test.PassengerId
test = prepare(test,False)
#gridSearch(train,train_y)

clf = svm.SVC(C=7.5)
clf.fit(train,train_y)
saveFile(testPassengerIds,clf.predict(test))
 

 

 



       PassengerId Transported
count         4277        4277
unique        4277           2
top        0013_01        True
freq             1        2189
Finished
