In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
df = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')
result_test_data = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
df.shape

(891, 12)

In [6]:
# as we see there are almost all rows empty in cabin column so we will drop that 
# also there is no use of column like Name,Passengerid,'Ticket','Fare'
# we will also drop them also we will create a function of all the operations that we perform for cleaning the 
# data set so that same process can be peformed in test dataset just by passing that in the define function

In [7]:
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [8]:
def cleaning_n_get_required_data(df):
    # dropping all unnecessary columns 
    df.drop(columns = ['PassengerId','Name','Ticket','Fare','Cabin'],inplace = True)
    # treating NaN values in Age column
    from sklearn.impute import KNNImputer
    nullhandler = KNNImputer()
    # here nullhandler is used because we do not want our distribution to change
    # and while handling null's do not want ot pollute our data.
    df.Age = pd.DataFrame(nullhandler.fit_transform(df.loc[:, df.columns.isin(['Age'])]))[0]
    # treating NaN values in Embarked column
    df.Embarked.fillna(df.Embarked.mode()[0],inplace=True)
    # encoding the non-numerical columns to make them numeric
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder() 
    # using LabelEncoder for Sex column
    df.Sex = le.fit_transform(df.Sex)
    # OneHotEncoding for Embarked Column as there are more than 2 values in embarked
    df = pd.get_dummies(df,columns= ['Embarked'])
    # now returning the output that we will use for model training
    return df
    # we do not treat any outliers here because we want our model to train on each and every data point
       
    
    

In [9]:
train_df = cleaning_n_get_required_data(df)

In [10]:
# so we have now obtained our train data ready with us
train_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,22.000000,1,0,0,0,1
1,1,1,0,38.000000,1,0,1,0,0
2,1,3,0,26.000000,0,0,0,0,1
3,1,1,0,35.000000,1,0,0,0,1
4,0,3,1,35.000000,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.000000,0,0,0,0,1
887,1,1,0,19.000000,0,0,0,0,1
888,0,3,0,29.699118,1,2,0,0,1
889,1,1,1,26.000000,0,0,1,0,0


In [11]:
# now let's get our test data ready
# for that we need to join the 2 tables that we have to make test_df
test_df = pd.merge(test,result_test_data,on = 'PassengerId')

In [12]:
# taking a look at out test_df
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1


In [13]:
# passing out test data as well to our cleaning function
test_df = cleaning_n_get_required_data(test_df)

In [14]:
test_df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Survived,Embarked_C,Embarked_Q,Embarked_S
0,3,1,34.50000,0,0,0,0,1,0
1,3,0,47.00000,1,0,1,0,0,1
2,2,1,62.00000,0,0,0,0,1,0
3,3,1,27.00000,0,0,0,0,0,1
4,3,0,22.00000,1,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...
413,3,1,30.27259,0,0,0,0,0,1
414,1,0,39.00000,0,0,1,1,0,0
415,3,1,38.50000,0,0,0,0,0,1
416,3,1,30.27259,0,0,0,0,0,1


In [15]:
x_train = train_df.iloc[:,1:]
y_train = train_df.Survived

In [16]:
x_test = test_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked_C',
       'Embarked_Q', 'Embarked_S']]
y_test = test_df.Survived

In [17]:
# using DecisionTree Model for our data
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()

In [30]:
from sklearn.linear_model import LogisticRegression
logr = LogisticRegression()

logr.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [36]:
# using our model to predict the output
y_predict = logr.predict(x_test)
accuracy_score(y_test,y_predict)

0.9473684210526315

In [33]:
y_train_pred = logr.predict(x_train)

In [34]:
# and our training accuracy is 0.93 which is very good
accuracy_score(y_train,y_train_pred)

0.8024691358024691

In [35]:
result_test_data.shape

(418, 2)

In [37]:
predicted_data = pd.concat([result_test_data.PassengerId,pd.DataFrame(y_predict,columns = ['Survived'])],axis=1)

In [38]:
predicted_data.to_csv('titanic_data_model.csv',index = False)

In [39]:
pd.DataFrame(y_predict,columns=['Survived'])

Unnamed: 0,Survived
0,0
1,0
2,0
3,0
4,1
...,...
413,0
414,1
415,0
416,0
