In [1]:
# Import Modules
import pandas as pd
from pathlib import Path
from sklearn.neighbors import (NeighborhoodComponentsAnalysis,KNeighborsClassifier)
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline

In [2]:
# Dependencies for downloading the raw tables from BigQuery.
# Prerequisite: the BigQuery client must be installed in the kernel's environment, e.g.
#   pip install --upgrade google-cloud-bigquery
# (when it is missing, this cell fails with "ModuleNotFoundError: No module named 'google'").
# pandas is already imported in the first cell.
from google.cloud import bigquery
from google.oauth2 import service_account
# Load service-account credentials from a JSON key file.
# The path is relative, so it only resolves when the kernel's working
# directory is the folder that contains `Documents/`.
# NOTE(review): the key file is a secret -- confirm it is excluded from version control.
credentials = service_account.Credentials.from_service_account_file(
'Documents/spaceship_titanic/spaceship-titanic-387720-729aac731f9f.json')

ModuleNotFoundError: No module named 'google'

In [None]:
# Create the BigQuery client for this project, authenticated with the
# service-account `credentials` loaded in the previous cell.
project_id = 'spaceship-titanic-387720'
client = bigquery.Client(credentials= credentials,project=project_id)

In [None]:
# SQL for the three source tables of the Starship_Titanic dataset.
amenities_sql = """
   SELECT *
   FROM Starship_Titanic.Amenities"""

passenger_sql = """
   SELECT *
   FROM Starship_Titanic.PassengerInfo"""

planet_sql = """
   SELECT *
   FROM Starship_Titanic.Planet"""

# Submit all three queries first, in the same order as before ...
query_Amenities, query_PassengerInfo, query_PlanetInfo = (
    client.query(sql) for sql in (amenities_sql, passenger_sql, planet_sql)
)

# ... then collect each job's result ...
results_Amenities, results_PassengerInfo, results_PlanetInfo = (
    job.result()
    for job in (query_Amenities, query_PassengerInfo, query_PlanetInfo)
)

# ... and wrap each result in a DataFrame. The following cells index these
# frames as `frame[0]` and unpack each element by position.
Amenities_df, PassengerInfo_df, PlanetInfo_df = (
    pd.DataFrame(rows)
    for rows in (results_Amenities, results_PassengerInfo, results_PlanetInfo)
)

In [None]:
# Print the element at row label 0, position 0 of each raw frame to inspect
# what a downloaded record looks like.
for raw_frame in (Amenities_df, PassengerInfo_df, PlanetInfo_df):
    print(raw_frame.loc[0][0])

In [None]:
# Field names, listed in the positional order used to unpack each record.
amenity_columns = [
    "PassengerId",
    "Cabin",
    "VIP",
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
]

# Every element of column 0 is indexed by position; each position becomes
# its own (temporarily integer-named) column.
Amenities_sorted = pd.DataFrame()
for position in range(len(amenity_columns)):
    Amenities_sorted[position] = Amenities_df[0].apply(lambda record: record[position])

# Replace the integer column labels with the real field names.
Amenities_sorted = Amenities_sorted.rename(columns=dict(enumerate(amenity_columns)))

In [None]:
# Display the unpacked amenities table to verify the column mapping.
Amenities_sorted

In [None]:
# Field names, listed in the positional order used to unpack each record.
passenger_columns = [
    "PassengerId",
    "Name",
    "HomePlanet",
    "Cabin",
    "Age",
    "Transported",
]

# Every element of column 0 is indexed by position; each position becomes
# its own (temporarily integer-named) column.
PassengerInfo_sorted = pd.DataFrame()
for position in range(len(passenger_columns)):
    PassengerInfo_sorted[position] = PassengerInfo_df[0].apply(lambda record: record[position])

# Replace the integer column labels with the real field names.
PassengerInfo_sorted = PassengerInfo_sorted.rename(columns=dict(enumerate(passenger_columns)))

In [None]:
# Display the unpacked passenger table to verify the column mapping.
PassengerInfo_sorted

In [None]:
# Field names, listed in the positional order used to unpack each record.
planet_columns = [
    "PassengerId",
    "HomePlanet",
    "Destination",
    "CryoSleep",
]

# Every element of column 0 is indexed by position; each position becomes
# its own (temporarily integer-named) column.
PlanetInfo_sorted = pd.DataFrame()
for position in range(len(planet_columns)):
    PlanetInfo_sorted[position] = PlanetInfo_df[0].apply(lambda record: record[position])

# Replace the integer column labels with the real field names.
PlanetInfo_sorted = PlanetInfo_sorted.rename(columns=dict(enumerate(planet_columns)))

In [None]:
# Display the planet table that the previous cell just built, to verify its
# column mapping (this cell used to re-display the passenger table instead).
PlanetInfo_sorted

In [None]:
# Merging tables
# Inner-join the three tables on PassengerId. Columns present on both sides
# of a join (Cabin, HomePlanet) come out with pandas' default _x/_y suffixes.
merge_df = Amenities_sorted.merge(PassengerInfo_sorted, how='inner', on='PassengerId')
df = merge_df.merge(PlanetInfo_sorted, how='inner', on='PassengerId')

In [1]:
# Preview the merged frame. `df` only exists once the merge cell above has
# been run in the same kernel session; otherwise this raises NameError.
df.head()

NameError: name 'df' is not defined

In [None]:
# Cleaning up Column Duplications
# Discard the right-hand copies of the columns that were duplicated by the merges.
duplicated_columns = ['Cabin_y', 'HomePlanet_y']
df = df.drop(columns=duplicated_columns)

In [None]:
# Renaming Columns
# Strip the merge suffix from the surviving copies.
suffix_renames = {'Cabin_x': "Cabin", 'HomePlanet_x': "HomePlanet"}
df = df.rename(columns=suffix_renames)
df

In [None]:
# Class balance of the target column.
df["Transported"].value_counts()

In [None]:
# Show the non-missing target values. The result is only displayed, not
# assigned, so `df` itself is unchanged by this cell.
df["Transported"].dropna()

In [None]:
# Check the dataframe: print its (rows, columns) shape, then show the first five rows.
print(df.shape)
df.head(5)

In [None]:
# Check columns: list the current column names.
columns = list(df.columns)
columns

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Data Munging

# Make plans
'''
1) Check for missing values in all columns
1.5) Consider Imputation
2) remove unnecessary columns such as PassengerId and Name
3) Convert categorical variables into indicator variables for HomePlanet, CryoSleep, Deck, Side, Destination, VIP, Transported
4) Split up Cabin column into three different features
'''

In [None]:
# Split Cabin column in three different Columns
# Cabin is split on "/" into three parts, stored as Deck, RoomNum and Side;
# the original column is then removed.
cabin_parts = df['Cabin'].str.split("/", expand = True)
df[['Deck', 'RoomNum', 'Side']] = cabin_parts
df = df.drop(columns='Cabin')

# Reorder the columns: identifiers and categorical features first, then the
# numeric features, with the target column last.
ordered_columns = [
    'PassengerId',
    'HomePlanet',
    'CryoSleep',
    'Deck',
    'RoomNum',
    'Side',
    'Destination',
    'Age',
    'VIP',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
    'Name',
    'Transported',
]
df = df.loc[:, ordered_columns]

# Check to see everything is going to plan
print(df.head())

In [None]:
# Drop unnecessary columns
# PassengerId and Name are removed before modelling.
df = df.drop(columns=['PassengerId', 'Name'])

In [None]:
# Check drops: list the remaining column names.
columns = list(df.columns)
columns

In [None]:
# Examine missing values: print the fraction of missing entries per column.
print("Missing values distribution: ")
print(df.isnull().mean())

In [None]:
# Check the datatype of each column (before the target is converted).
print("Column datatypes: ")
print(df.dtypes)

In [None]:
def _as_nullable_bool(value):
    """Map one raw target value to True, False or pd.NA.

    Missing values stay missing, the strings 'True'/'False' (any case) are
    parsed, and any other string raises ValueError instead of being coerced.
    """
    if pd.isna(value):
        return pd.NA
    if isinstance(value, str):
        text = value.strip().lower()
        if text == 'true':
            return True
        if text == 'false':
            return False
        raise ValueError(f"Unrecognised Transported value: {value!r}")
    return bool(value)


# Convert the target to pandas' nullable boolean dtype.
# A plain `astype('bool')` is unsafe here: a missing value is coerced to a
# concrete True/False (so it disappears from isnull()/dropna()), and every
# non-empty string -- including 'False' -- becomes True.
df['Transported'] = df['Transported'].map(_as_nullable_bool).astype('boolean')

In [None]:
# Exploring the data
# Print the distinct values of every categorical column, one line per column.
categorical_columns = (
    "HomePlanet",
    "CryoSleep",
    "Deck",
    "Side",
    "Destination",
    "VIP",
    "Transported",
)
for column_name in categorical_columns:
    print(column_name, df[column_name].unique())

In [None]:
# Check the datatype of each column again (repeats the earlier dtype check,
# this time after the Transported conversion cell).
print("Column datatypes: ")
print(df.dtypes)

In [None]:
# Exploring the data
# Print the distinct values of every categorical column, one line per column.
categorical_columns = (
    "HomePlanet",
    "CryoSleep",
    "Deck",
    "Side",
    "Destination",
    "VIP",
    "Transported",
)
for column_name in categorical_columns:
    print(column_name, df[column_name].unique())

In [None]:
# Number of missing values in each column after the transformations above.
df.isnull().sum()

In [None]:
# (rows, columns) of the frame before rows with missing values are dropped.
df.shape

In [None]:
# Class balance of the target column.
df["Transported"].value_counts()

In [None]:
# NaN values

# Drop every row that has a missing value in any column. The result is stored
# under a new name, so `df` itself still contains the incomplete rows.
df_clean = df.dropna()
df_clean

In [None]:
# Persist the cleaned data as `titanic.csv` inside the archive `titanic.zip`.
# Export `df_clean` (rows with missing values removed), not `df`: the KNN
# model trained from this file cannot handle NaN inputs.
compression_opts = dict(method='zip',
                        archive_name='titanic.csv')
df_clean.to_csv('titanic.zip', index=False,
                compression=compression_opts)

In [None]:
#KNeighbors Model 

In [None]:
 # Read the CSV file into a Pandas DataFrame
# Load the archive written by the export cell (`titanic.zip` in the working
# directory) so the notebook reproduces under Restart & Run All. pandas
# infers zip compression from the file extension and reads the single CSV
# inside the archive.
df = pd.read_csv(
    Path('titanic.zip')
)
df.head()

In [None]:
 # Split target column from dataset
# y holds the label to predict; X holds every remaining column.
target_column = 'Transported'
y = df[target_column]
X = df.drop(columns=target_column)

In [None]:
# Preview the first five rows of the feature matrix.
X[:5]

In [None]:
# Show the first five entries of the target.
y[:5]

In [None]:
# Encode the categorical variables using get_dummies.
# The encoding is applied to the full feature matrix before the train/test
# split, so both partitions end up with the same set of indicator columns.
X = pd.get_dummies(X)

In [None]:
# Preview the encoded feature matrix.
X.head()

In [None]:
 # Split the dataset into training and test partitions.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# (rows, columns) of the training features.
X_train.shape

In [None]:
# (rows, columns) of the test features.
X_test.shape

In [None]:
# Standardise the features. The scaler is fitted on the training partition
# only and then applied unchanged to both partitions, so no information from
# the test set enters the scaling parameters.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [None]:
# Implementation of KNN
# Baseline model with k=5. KNN is distance-based, so it is fitted on the
# standardised training features from the scaling cell rather than on the
# raw ones, where large-valued columns would dominate the distance.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

In [None]:
# Select the number of neighbours k by 10-fold cross-validation.
# Only the training partition is used, so the held-out test set stays
# untouched for the final evaluation.

from sklearn.model_selection import cross_val_score

neighbors = []   # candidate k values, in the order they were tried
cv_scores = []   # mean cross-validated accuracy for the k at the same position

# Try every odd k from 1 to 99.
for k in range(1, 101, 2):
    neighbors.append(k)
    # Scale inside a Pipeline so that, within each fold, the scaler is fitted
    # on that fold's training part only. This matches how the final model is
    # trained (on standardised features) without leaking validation data.
    knn = Pipeline([
        ('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier(n_neighbors=k)),
    ])
    # cv=10 -> 10 folds; scoring='accuracy' -> evaluation metric
    scores = cross_val_score(
        knn, X_train, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

# Report the per-fold accuracies for the best k found above, rather than for
# whichever k the loop happened to end on.
best_k = neighbors[cv_scores.index(max(cv_scores))]
knn = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=best_k)),
])
scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
print(scores)


In [None]:
# Misclassification error versus k (error = 1 - mean CV accuracy)
MSE = [1 - accuracy for accuracy in cv_scores]

# Determining the best k value: the first k with the smallest error
best_position = min(range(len(MSE)), key=MSE.__getitem__)
optimal_k = neighbors[best_position]
print("The optimal score of K neighbors = %d " %optimal_k)

# Import numpy and matplotlib
import numpy as np
import matplotlib.pyplot as plt

# Plot misclassification error versus k
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(neighbors, MSE)
ax.set_xlabel("Number of K neighbors")
ax.set_ylabel("Misclassification Error")
plt.show()

In [None]:
# Instantiate the model with the k selected by cross-validation in the
# previous cell, instead of a hard-coded value that goes stale whenever the
# data or the search changes.
model = KNeighborsClassifier(n_neighbors=optimal_k)

In [None]:
# Train the model on the standardised training features.
model.fit(X_train_scaled, y_train)

In [None]:
# Create predictions for the standardised test features.
y_pred = model.predict(X_test_scaled)

# Review the predictions
y_pred

In [None]:
# Print confusion matrix
# scikit-learn expects (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. Passing them swapped transposes the matrix.
confusion_matrix(y_test, y_pred)

In [None]:
# Print classification report
# scikit-learn expects (y_true, y_pred); with the arguments swapped,
# precision and recall are exchanged for every class.
print(classification_report(y_test, y_pred))