# **Setting Up the Environment**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#For Data Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split #Splits the data into training and testing sets
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,PolynomialFeatures

#Libraires for Classification Models
from sklearn.linear_model import LogisticRegression #Logistic Regression
from sklearn.metrics import classification_report,plot_confusion_matrix #Metrics for analyzing the model performance
from sklearn.metrics import ConfusionMatrixDisplay #Confusion-matrix plotting API that supersedes plot_confusion_matrix

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# **Importing Training Data**

In [None]:
# Read the training split into a DataFrame and preview its first five rows.
train_data = pd.read_csv(filepath_or_buffer="../input/spaceship-titanic/train.csv")
train_data.head(n=5)

In [None]:
# Print a summary of the training data: column names, non-null counts and dtypes.
train_data.info()

In [None]:
# PassengerId and Name are not expected to have much impact on the prediction,
# so both columns are removed from the training data.
train_data = train_data.drop(columns=['PassengerId', 'Name'])

In [None]:
# Percentage of missing (null) entries in each column of the training data.
Miss_Percent = train_data.isna().sum().div(len(train_data)).mul(100)
Miss_Percent

In [None]:
# Replace the missing entries of every column with that column's most frequent value.
train_data = train_data.apply(
    lambda column: column.fillna(column.value_counts().index[0])
)

# **Exploratory Data Analysis**

In [None]:
# Summary statistics of the numeric columns, one row per column.
train_data.describe().transpose()

**Age Distribution by Home Planet for Passengers Who Were and Were Not Transported**

In [None]:
# Age histograms in a grid: one column per HomePlanet, one row per Transported value.
sns.displot(
    data=train_data,
    x="Age",
    row="Transported",
    col="HomePlanet",
    binwidth=3,
    height=3,
    facet_kws={"margin_titles": True},
)

**Overall Distribution of Features in the Training data**

In [None]:
# Count plot of every categorical feature, with bars split by the Transported target.
# The list gets its own name so the loop variable no longer rebinds the list
# it is iterating over, and enumerate replaces the hand-maintained counter.
features = ['HomePlanet','CryoSleep','Destination','VIP','Transported']
plt.figure(figsize = (15,25))
for i, feature in enumerate(features, start=1):
    # Subplot positions are 1-based, hence start=1 above.
    plt.subplot(6,3,i)
    sns.countplot(x = feature , hue='Transported', data =train_data)

In [None]:
# Columns still stored as generic Python objects (the categorical features).
objList = train_data.select_dtypes(include="object").columns
print(objList)

# Convert each object column to integer codes with a label encoder.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for feat in objList:
    # Values are cast to str first so every entry of a column has the same type.
    as_text = train_data[feat].astype(str)
    train_data[feat] = le.fit_transform(as_text)

train_data.head()

In [None]:
# Separate the features from the target column.
X=train_data.drop('Transported',axis=1)
y=train_data['Transported']

# Hold out 20% of the rows for validation. A fixed random_state makes the split,
# and therefore every metric computed from it, reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2,random_state=42)

print("X_train shape: ",X_train.shape)
print("X_val shape: ",X_val.shape)
print("y_train shape: ",y_train.shape)
print("y_val shape: ",y_val.shape)

**Logistic Regression**

In [None]:
# Pipeline steps: standardise the features, expand them with polynomial terms
# (no bias column), then fit a logistic regression with the liblinear solver.
Input=[('scale',StandardScaler()),('polynomial', PolynomialFeatures(include_bias=False)),('model',LogisticRegression(solver='liblinear'))]

pipe=Pipeline(Input)
pipe.fit(X_train,y_train)

# Evaluate on the held-out validation split.
pipe_pred_LR = pipe.predict(X_val)
pipe_rep_LR = classification_report(y_val, pipe_pred_LR)
print('\t\t\tClassification report:\n\n', pipe_rep_LR, '\n')

# plot_confusion_matrix is deprecated since scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
# NOTE: the notebook's import cell still imports plot_confusion_matrix, and that
# import has to be dropped before running on scikit-learn >= 1.2.
ConfusionMatrixDisplay.from_estimator(pipe, X_val, y_val)

# **Test Data**

In [None]:
# Read the test split into a DataFrame and preview its first five rows.
test_data = pd.read_csv(filepath_or_buffer="../input/spaceship-titanic/test.csv")
test_data.head(n=5)

Dropping the `PassengerId` and `Name` columns because they are not relevant to the prediction.

In [None]:
# Remove PassengerId and Name from the test data, mirroring the columns dropped
# from the training data, then print a summary of what remains.
test_data = test_data.drop(columns=['PassengerId', 'Name'])
test_data.info()

**Encoding the Categorical Features of the data**

In [None]:
objList = test_data.select_dtypes(include = "object").columns
print (objList)

# The test columns must receive the SAME integer codes the model saw during
# training. Refitting the encoder on the test data (as was done before) gives
# each label a code based on the test set's own label list, so identical
# labels can map to different numbers in train and test.
# `train_data` has already been overwritten with integer codes, so the original
# training labels are re-read from disk to rebuild the label -> code mapping.
train_labels = pd.read_csv("../input/spaceship-titanic/train.csv", usecols=list(objList))

for feat in objList:
    # The training encoder numbered the sorted string labels 0..n-1;
    # reproduce exactly that numbering from the raw training column.
    known_labels = sorted(train_labels[feat].dropna().astype(str).unique())
    code_of = {label: code for code, label in enumerate(known_labels)}
    # Missing values are left as NaN (na_action="ignore") so that the imputation
    # step that follows can fill them, instead of turning them into a spurious
    # 'nan' category. Labels never seen in training get the sentinel code -1.
    test_data[feat] = test_data[feat].map(
        lambda value: code_of.get(str(value), -1), na_action="ignore"
    )

test_data.head()

In [None]:
# Replace the missing entries of every test column with that column's most frequent value.
test_data = test_data.apply(
    lambda column: column.fillna(column.value_counts().index[0])
)

**Using the Model on Test Data**

In [None]:
# Predict the target for every test row with the fitted pipeline.
test_pred_LR = pipe.predict(test_data)

**Storing the Output in the Sample Submission**

In [None]:
# Read the sample submission file and overwrite its Transported column with the
# logistic-regression predictions for the test set, then show the result.
sample_submission = pd.read_csv("../input/spaceship-titanic/sample_submission.csv")
sample_submission["Transported"] = test_pred_LR
LR_results = sample_submission
print(LR_results)

In [None]:
# Save the predictions as the submission file, leaving out the DataFrame index.
LR_results.to_csv(path_or_buf="submission.csv", index=False)
print("Completed")

**Completed**

**Thank you for your feedback, and please upvote this notebook if it helped you learn something new.**