### Data Collection

In [1]:
import pandas as pd

In [3]:
# Load the passenger table and preview its last rows.
# NOTE(review): the preview and df.shape below show only ID, Name and Nationality
# columns, yet later cells read 'Sex', 'Pclass', 'Age', 'Survived' and 'Embarked'
# (the first plot already fails with "Could not interpret input 'Sex'").
# Confirm which CSV this notebook is meant to load.
df=pd.read_csv('nationality.csv')
df.tail()

Unnamed: 0,ID,Name,Nationality
886,887,"Montvila, Rev. Juozas",CelticEnglish
887,888,"Graham, Miss. Margaret Edith",CelticEnglish
888,889,"Johnston, Miss. Catherine Helen ""Carrie""",CelticEnglish
889,890,"Behr, Mr. Karl Howell","European,German"
890,891,"Dooley, Mr. Patrick",CelticEnglish


In [4]:
df.shape

(891, 3)

In [5]:
df.size

2673

### Data Visualization and Analysis

In [6]:
#Linear Algebra Library
import numpy as np
#Data Analysis and Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style

In [7]:
# Gender plot: number of passengers per sex.
# x is passed by keyword because seaborn >= 0.12 accepts only `data` positionally.
# NOTE(review): this needs a 'Sex' column in df; the recorded ValueError indicates
# the frame loaded above does not have one.
sns.catplot(x='Sex',data=df,kind='count')



ValueError: Could not interpret input 'Sex'

In [None]:
# Class plot: number of passengers per ticket class.
# x is passed by keyword because seaborn >= 0.12 accepts only `data` positionally.
sns.catplot(x='Pclass',data=df,kind='count')

In [None]:
# The majority of passengers are in class 3; now look at the gender split within each class.
sns.countplot(x='Pclass',hue='Sex',data=df)

In [None]:
# Distribution of passenger ages as a 70-bin histogram on a 10x5 figure.
df['Age'].plot.hist(bins=70,figsize=(10,5))

In [None]:
# Age distribution within each passenger class, drawn as overlaid KDE curves.
fig=sns.FacetGrid(df,hue='Pclass',aspect=5)
# fill=True replaces the deprecated shade=True keyword of kdeplot.
fig.map(sns.kdeplot,'Age',fill=True)
# Clip the x axis to the observed age range (a KDE otherwise extends below 0).
oldest=df['Age'].max()
fig.set(xlim=(0,oldest))
fig.add_legend()
# Most passengers in all three classes are between 20 and 40 years old.

In [None]:
# Age spread within each class (a box plot shows the median and quartiles, not the mean).
sns.boxplot(x="Pclass",y="Age",data=df)

In [None]:
#Survivors and Non Survivors count
sns.countplot(x='Survived',data=df)

In [None]:
import matplotlib.pyplot as plt

In [None]:
df['Nationality']

In [None]:
#Relationship between Survival and Gender of passengers
sns.countplot(x='Survived',hue='Sex',data=df)

In [None]:
# Relationship between a passenger's survival and the class they travelled in.
sns.countplot(x='Survived',hue='Pclass',data=df)

In [None]:
# Survival rate per class, split by sex.
# sns.factorplot was renamed to catplot and then removed from seaborn;
# kind='point' reproduces factorplot's default plot type.
sns.catplot(x='Pclass',y='Survived',hue='Sex',data=df,kind='point')

In [None]:
# Number of passengers per nationality.
# DataFrame.count() expects an axis, not a column name, so select the column first.
df['Nationality'].value_counts()

In [None]:
# Relationship between survival, age and class of the passenger.
sns.lmplot(x='Age',y='Survived',hue='Pclass',data=df)
# The graph shows that the older the passenger, the lower the chance of survival.

In [None]:
sns.lmplot(x='Age',y='Survived',hue='Sex',data=df)
# The graph shows that men had lower survival chances than women, and that older men and women both had lower survival chances.

In [None]:
sns.lmplot(x='Age',y='Survived',hue='Embarked',data=df)
# The survival rate is higher for passengers who boarded at Cherbourg than at Southampton.

In [None]:
# Age, Sex, Pclass and Embarked are the important features for making predictions.
# PassengerId, Name, Ticket, SibSp, Parch, Cabin and Fare are not important for predictions.

In [None]:
df.head()

### Data Wrangling

In [None]:
# Check for null values.
# Only the last expression of a cell is displayed, so the per-column counts from
# .sum() are what appears; the .any() result is computed but not shown.
df.isnull().any()
df.isnull().sum()
# Columns 'Age', 'Cabin' and 'Embarked' have null values.

In [None]:
sns.heatmap(df.isnull(),yticklabels=False,cmap='viridis')

In [None]:
# Treat an age of 0 as missing by turning it into NaN (Not a Number).
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df['Age']=df['Age'].replace(0,np.nan)

In [None]:
# Replace missing ages with the mean age, truncated to a whole number.
mean=int(df['Age'].mean(skipna=True))
# mean comes out 29
# fillna is the direct way to substitute NaN and avoids the np.NaN alias removed in NumPy 2.0.
df['Age']=df['Age'].fillna(mean)
df.Age

In [None]:
# Drop the columns that are not used for prediction:
# PassengerId, Name, SibSp, Parch, Ticket, Fare and Cabin.
# df is rebound to the reduced frame, so re-running this cell on the reduced frame
# raises because the columns are already gone.
df=df.drop(['PassengerId','Name','SibSp','Parch','Ticket','Fare','Cabin'],axis='columns')

In [None]:
df.head()

In [None]:
# Convert the string-valued 'Sex' column into numeric indicator (dummy) columns.
# drop_first=True omits the first category, so k categories yield k-1 columns.
sex=pd.get_dummies(df['Sex'],drop_first=True)
sex.head()

In [None]:
# we drop the 'Sex' column since we already have created its dummies
df=df.drop(['Sex'],axis='columns')

In [None]:
# Now concatenate the dummy column(s) with the data frame, side by side.
df=pd.concat([df,sex],axis='columns')

In [None]:
df.head()

In [None]:
# Create dummy (indicator) columns for the Embarked string values.
# drop_first=True omits the first category, so a row with 0 in every remaining
# column either belongs to the omitted port or had a missing Embarked value.
# NOTE(review): the original note read "0-Q,1-Southampton"; presumably the kept
# columns are Q and S — verify against embarked.head().
embarked=pd.get_dummies(df['Embarked'],drop_first=True)
embarked.head()

In [None]:
df=df.drop(['Embarked'],axis='columns')

In [None]:
# Now concatenate the Embarked dummy columns into the data frame.
df=pd.concat([df,embarked],axis='columns')

In [None]:
df.head()

In [None]:
# Feature matrix: every column except the 'Survived' label.
# NOTE(review): the name `input` shadows Python's built-in input() for the rest of
# the kernel session; later cells depend on this name, so it is left unchanged here.
input=df.drop(['Survived'],axis='columns')

In [None]:
# 'target' is the output, i.e. the response (dependent) variable the models predict.
target=df.Survived

In [None]:
target.head()

In [None]:
input.head()

### Creating Model and train-test Split using Logistic Regression

### Importing required libraries

# Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Hold out 20% of the rows for testing; random_state=1 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(input,target,test_size=0.2,random_state=1)

In [None]:
# Create a LogisticRegression estimator with default hyperparameters.
model=LogisticRegression()
# Fit the model on the training split.
model.fit(X_train,y_train)

In [None]:
# Score the model on the held-out test split
# (for a scikit-learn classifier, score() is the mean accuracy).
model.score(X_test,y_test)

In [None]:
# Predicting the target from Testing Data
y_pred=model.predict(X_test)
y_pred

In [None]:
# Check the accuracy of the prediction.
# The imported name was duplicated ("accuracy_scoreaccuracy_score"), which raises ImportError.
from sklearn.metrics import accuracy_score

In [None]:
accuracy=accuracy_score(y_test,y_pred)
accuracy

In [None]:
# creating confusion matrix
from sklearn.metrics import confusion_matrix

In [None]:
cm=confusion_matrix(y_test,y_pred)
cm

In [None]:
# classification report
from sklearn.metrics import classification_report

In [None]:
# classification_report returns one formatted multi-line string; print it so the
# table renders with real line breaks instead of an escaped '\n' repr.
print(classification_report(y_test,y_pred))

# Support Vector Machine

In [None]:
#Support vector machine
from sklearn import svm

In [None]:
model=svm.SVC()

In [None]:
model.fit(X_train,y_train)

In [None]:
model.score(X_test,y_test)

In [None]:
y_pred=model.predict(X_test)

In [None]:
y_pred

In [None]:
accuracysvm=accuracy_score(y_test,y_pred)

In [None]:
accuracysvm

In [None]:
confusion_matrix(y_test,y_pred)

In [None]:
# classification_report returns one formatted multi-line string; print it so the
# table renders with real line breaks instead of an escaped '\n' repr.
print(classification_report(y_test,y_pred))

# Naive Bayes

In [None]:
### Naive Bayes
from sklearn.naive_bayes import GaussianNB

In [None]:
model=GaussianNB()

In [None]:
model.fit(X_train,y_train)

In [None]:
y_pred=model.predict(X_test)

In [None]:
y_pred

In [None]:
model.score(X_test,y_test)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracyNB=accuracy_score(y_test,y_pred)

In [None]:
accuracyNB

In [None]:
confusion_matrix(y_test,y_pred)

# Decision Tree

In [None]:
### Decision tree
from sklearn import tree
model=tree.DecisionTreeClassifier()

In [None]:
model.fit(X_train,y_train)

In [None]:
model.score(X_test,y_test)

In [None]:
y_pred=model.predict(X_test)

In [None]:
y_pred

In [None]:
accuracyDT=accuracy_score(y_test,y_pred)
accuracyDT

In [None]:
confusion_matrix(y_test,y_pred)

# KNN 

In [None]:
### KNN
from sklearn.neighbors import KNeighborsClassifier
model=KNeighborsClassifier()

In [None]:
model.fit(X_train,y_train)

In [None]:
model.score(X_test,y_test)

In [None]:
y_pred=model.predict(X_test)

In [None]:
y_pred

In [None]:
accuracyKNN=accuracy_score(y_test,y_pred)
accuracyKNN

In [None]:
confusion_matrix(y_test,y_pred)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Bar chart comparing the test accuracy (in percent) of the five classifiers.
# NOTE(review): the percentages are typed in by hand rather than read from the
# accuracy variables computed earlier — re-check them after any re-run.
Algorithms=[
    'LogReg',
    'SVM',
    'NaiveBayes',
    'DecisionTree',
    'KNN',
]
Accuracy=[78,59,76,76,73]

plt.bar(Algorithms,height=Accuracy,color='red')
# Axis labels are independent of each other; the title stays last so the cell's
# displayed value is unchanged.
plt.ylabel('Accuracy in %')
plt.xlabel('Algorithms')
plt.title('Algorithms Accuracy Bar')