In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file, one per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    # Directories without files print nothing, exactly as a per-file loop would.
    if filenames:
        print('\n'.join(os.path.join(dirname, filename) for filename in filenames))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# Read the labelled training split and the unlabelled test split in one pass.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/titanic/train.csv', '../input/titanic/test.csv')
)
#train_data.shape
#train_data.describe()
#train_data.dtypes

In [None]:
# Column labels first, then the per-column dtype / non-null summary.
print(train_data.columns)
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called bare: wrapping it in print() appends a stray "None" line to the output.
train_data.info()

In [None]:
# Grid of pairwise relationships between the columns seaborn can plot.
sns.pairplot(data = train_data)

In [None]:
# Individual ages per survival outcome, with the two sexes dodged side by side.
sns.stripplot(
    data = train_data,
    x = 'Survived',
    y = 'Age',
    hue = 'Sex',
    dodge = True,
    jitter = True,
)

In [None]:
# Age distribution per survival outcome, one box per passenger class.
sns.boxplot(
    data = train_data,
    x = 'Survived',
    y = 'Age',
    hue = 'Pclass',
)

In [None]:
# Passenger counts per survival outcome, split by sex.
sns.countplot(
    data = train_data,
    x = 'Survived',
    hue = 'Sex',
)

In [None]:
# Columns stored with the generic object dtype are treated as categorical.
cat_columns = [col for col in train_data.columns if train_data[col].dtype == 'object']
print(cat_columns)
# info() prints its own report and returns None; print(info()) would add a "None" line.
train_data.info()
# One-hot encode every object column of the full frame; numeric columns pass through.
temp_data = pd.get_dummies(train_data)
# Numeric-only view of the training data (drop returns a new frame, train_data is untouched).
numerical_data = train_data.drop(columns = cat_columns)
# Annotated correlation matrix of the one-hot encoded frame.
sns.heatmap(temp_data.corr(), annot = True, cmap = 'coolwarm')

In [4]:
# Split the training frame into its object-dtype (categorical) and remaining columns.
cat_columns = [col for col in train_data.columns if train_data[col].dtype == 'object']
cat_frame = train_data[cat_columns]
# info() prints its own report and returns None; print(info()) would add a "None" line.
cat_frame.info()
# One indicator column per distinct value of every categorical column.
dumb_frame = pd.get_dummies(cat_frame)
numerical_data = train_data.drop(columns = cat_columns)
# Both frames share train_data's row index, so they must be joined column-wise.
# The default axis=0 would stack them vertically (twice the rows, NaN wherever a
# column exists in only one of the two frames), which breaks the correlation plot.
temp_frame = pd.concat([dumb_frame, numerical_data], axis = 1)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Name      891 non-null    object
 1   Sex       891 non-null    object
 2   Ticket    891 non-null    object
 3   Cabin     204 non-null    object
 4   Embarked  889 non-null    object
dtypes: object(5)
memory usage: 34.9+ KB
None


In [None]:
# Annotated correlation matrix of the combined dummy + numeric frame.
sns.heatmap(
    temp_frame.corr(),
    annot = True,
    cmap = 'coolwarm',
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Columns: 1724 entries, Name_Abbing, Mr. Anthony to Embarked_S
dtypes: bool(1724)
memory usage: 1.5 MB
None


In [None]:
# Disabled helper: the whole cell is a single string literal, so null_entries is
# never defined and nothing here executes.
# NOTE(review): as written the helper would return the boolean isnull() mask
# filtered to its True rows, not the matching train_data rows — confirm intent
# before re-enabling.
'''#returns a dataframe with only entries that have a null values in a particular field
def null_entries(field):
    null_values = train_data[[field]].isnull()
    null_values = null_values[null_values[field] == True]
    return null_values
'''

In [None]:
# Fare against age, coloured by survival outcome.
sns.scatterplot(
    x = train_data['Age'],
    y = train_data['Fare'],
    hue = train_data['Survived'],
)

In [None]:
# Fare against age with one fitted regression line per survival outcome.
sns.lmplot(
    data = train_data,
    x = 'Age',
    y = 'Fare',
    hue = 'Survived',
)

In [None]:
import seaborn as sns
# Age per survival outcome, one bar per passenger class.
sns.barplot(
    x = train_data['Survived'],
    y = train_data['Age'],
    hue = train_data['Pclass'],
)


In [None]:
import seaborn as sns
# Survival rate per sex; the errorbar setting is kept exactly as ('ci', 0).
sns.barplot(
    data = train_data,
    x = "Sex",
    y = "Survived",
    errorbar = ('ci', 0),
)

In [None]:
#model imports
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, validation_curve, learning_curve
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn import svm
import matplotlib.pyplot as plt

In [None]:
def valid_curve(model, x, y, folds, pname, plist):
    """Plot mean training and validation accuracy across values of one hyper-parameter.

    Args:
        model: estimator handed to scikit-learn's ``validation_curve``.
        x: feature matrix.
        y: target values.
        folds: value forwarded as ``cv``.
        pname: name of the estimator parameter to vary (also used as x-axis label).
        plist: parameter values to evaluate (also used as x-axis positions).

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    fold_train, fold_valid = validation_curve(
        model, x, y,
        param_name = pname,
        param_range = plist,
        cv = folds,
        scoring = 'accuracy',
    )

    # Each score array has one row per parameter value; average over the folds.
    curves = (
        ("Training Accuracy", 'b', fold_train),
        ("Validation Accuracy", 'g', fold_valid),
    )
    for label, colour, fold_scores in curves:
        plt.plot(plist, np.mean(fold_scores, axis = 1), label = label, color = colour)

    plt.legend(loc = 'best')
    plt.title('Validation Curve')
    plt.xlabel(pname)
    plt.ylabel('Accuracy')
    plt.tight_layout()
    plt.show()
    
def learn_curve(model, x, y, folds=5, size_list=[100,200,300,400,500,600]):
    """Plot mean training and validation accuracy against the training-set size.

    Args:
        model: estimator handed to scikit-learn's ``learning_curve``.
        x: feature matrix.
        y: target values.
        folds: value forwarded as ``cv``. It was previously accepted but ignored,
            so every call silently used scikit-learn's default splitter.
        size_list: training-set sizes to evaluate, forwarded as ``train_sizes``.
            The list is only read, never mutated. Each size must fit inside the
            training part of a fold, otherwise ``learning_curve`` raises.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    tsizes, train_score, valid_score = learning_curve(
        model, x, y,
        train_sizes = size_list,
        cv = folds,
        # Explicit so the y-axis label is always true, matching valid_curve.
        scoring = 'accuracy',
    )
    # Score arrays have one row per training size; average over the folds.
    train_means = np.mean(train_score, axis = 1)
    valid_means = np.mean(valid_score, axis = 1)

    plt.plot(tsizes, train_means, label = "Training Accuracy", color = 'b')
    plt.plot(tsizes, valid_means, label = "Validation Accuracy", color = 'g')
    plt.legend(loc = 'best')
    plt.title('Learning Curve as accuracy over training sample size')
    plt.xlabel('Training samples')
    plt.ylabel('Accuracy')
    plt.tight_layout()
    plt.show()

In [None]:
#data splitting and processing
# Target column and feature frame (drop returns a new frame, train_data is untouched).
y = train_data['Survived']
x = train_data.drop(columns = ['Survived'])
# Hold out 30% of the rows for validation. The fixed random_state makes the split
# identical on every run, so results survive a kernel restart.
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size = 0.3, random_state = 42)

In [None]:
# Feature frame without the per-passenger identifier columns.
x_without = x.drop(columns = ['Name', 'PassengerId'])
#cat_columns = [c for c in x.columns if x.dtypes[c] == object]
#print(cat_columns)

In [None]:
#defining pipelines
# Building blocks. ColumnTransformer clones its transformers when it is fitted,
# so the same unfitted instances can safely be listed in several places.
cat_encoding = OneHotEncoder(handle_unknown = 'ignore')
age_imputer = SimpleImputer(strategy = 'mean')
cabin_imputer = SimpleImputer(strategy = 'constant', fill_value = 'NaN')
embarked_imputer = SimpleImputer(strategy = 'constant', fill_value = 'None')
# Median fill for the remaining numeric features.
# NOTE(review): the Titanic test split is commonly reported to have a missing Fare — confirm.
numeric_imputer = SimpleImputer(strategy = 'median')

# Categorical columns with missing values: fill with a placeholder, then one-hot encode.
cabin_pipeline = Pipeline(steps=[('cabimp', cabin_imputer),('cabhot', cat_encoding)])
embarked_pipeline = Pipeline(steps=[('embimp', embarked_imputer),('embhot', cat_encoding)])

cat_columns = ['Name','Sex','Ticket']
cat_columns2 = ['Sex','Ticket']
# ColumnTransformer drops every column it is not told about (remainder='drop').
# Without this list the models never saw Pclass, SibSp, Parch or Fare.
# PassengerId is deliberately left out so it is still dropped.
numeric_columns = ['Pclass','SibSp','Parch','Fare']


def _build_transformer(onehot_columns):
    """Return the shared preprocessing transformer, one-hot encoding `onehot_columns`."""
    return ColumnTransformer(transformers = [
        ('age', age_imputer, ['Age']),
        ('numeric', numeric_imputer, numeric_columns),
        ('cabin', cabin_pipeline, ['Cabin']),
        ('embarked', embarked_pipeline, ['Embarked']),
        ('onehot', cat_encoding, onehot_columns),
    ])


# `trans` expects the Name column; `trans_without` is for frames where it was dropped.
trans = _build_transformer(cat_columns)
#cat_columns = [c for c in x.columns if x.dtypes[c] == object]
trans_without = _build_transformer(cat_columns2)

In [None]:
# Disabled scratch cell: the body is a string literal, so nothing executes.
# It would print the result of sklearn.metrics.get_scorer_names().
'''import sklearn
print(sklearn.metrics.get_scorer_names())
'''

In [None]:
# Disabled experiment (string literal, nothing executes): 5-fold cross-validated
# accuracy of a 100-tree random forest on x_without, preprocessed by trans_without.
'''model = RandomForestClassifier(n_estimators = 100)
pipe = Pipeline(steps = [('pre', trans_without), ('model', model)])


score = cross_val_score(pipe, X = x_without, y = y, cv = 5, scoring= 'accuracy')
print(score.mean())'''

In [None]:
# Disabled experiment (string literal, nothing executes): fit the random-forest
# pipeline on x_without and write its test-set predictions to submission.csv.
# NOTE(review): if re-enabled, this rebinds test_data to a frame without
# PassengerId/Name, so a second run of the cell (or any later cell reading those
# columns) would fail — use a separate name for the reduced frame.
'''
model = RandomForestClassifier(n_estimators = 100)
pipe = Pipeline(steps = [('pre', trans_without), ('model', model)])
pipe.fit(x_without,y)
passengers = test_data['PassengerId']
test_data = test_data.drop(['PassengerId','Name'],axis=1)
predictions = pipe.predict(test_data)
output = pd.DataFrame({'PassengerId':passengers, 'Survived':predictions})
output.to_csv('submission.csv',index = False)
'''

In [None]:
# Disabled experiment (string literal, nothing executes): learning curve of the
# random-forest pipeline built on trans, evaluated on the full feature frame x.
'''
model = RandomForestClassifier(n_estimators = 100)
pipe = Pipeline(steps = [('pre', trans), ('model', model)])
learn_curve(pipe, x, y, folds = 5,size_list = [100,200,300,400,500,600])
'''

In [None]:
# Disabled experiment (string literal, nothing executes): validation curve over
# n_estimators for a random forest on features pre-encoded by trans.
# NOTE(review): trans is fitted on all of x before cross-validation, so the
# encoder/imputer see the validation rows; the commented-out Pipeline avoids that.
'''mod = RandomForestClassifier()
encoded_x = trans.fit_transform(x)
#pipe = Pipeline(steps = [('pre', trans), ('model', mod)])
valid_curve(mod, encoded_x, y, folds=5, pname='n_estimators',plist=[5,10,20,50,100,200])'''

In [None]:
# Disabled experiment (string literal, nothing executes): validation curve over
# the SVC kernel on features pre-encoded by trans.
# NOTE(review): 'polynomial' is not a kernel name scikit-learn's SVC accepts
# (the polynomial kernel is spelled 'poly') — fix before re-enabling.
'''model = svm.SVC()
encoded_x = trans.fit_transform(x)
valid_curve(model, encoded_x, y, folds=5,pname = 'kernel', plist=['linear','sigmoid','rbf','polynomial'])
'''

In [None]:
# Disabled experiment (string literal, nothing executes): fit a linear-kernel SVC
# on x_without through trans_without and write its test-set predictions to
# submission.csv. The cross-validation comparisons inside are commented out.
'''
model = svm.SVC(kernel='linear')
pipe = Pipeline(steps = [('pre', trans), ('model', model)])
#learn_curve(pipe, x, y)
#score = cross_val_score(pipe, X = x, y = y, cv = 5, scoring= 'accuracy')
pipe2 = Pipeline(steps = [('pre', trans_without), ('model', model)])
#scores2 = cross_val_score(pipe2, X = x_without, y = y, cv = 5, scoring= 'accuracy')
#print(score.mean())
#print(scores2.mean())

pipe2.fit(x_without,y)
passengers = test_data['PassengerId']
test_without = test_data.drop(['PassengerId', 'Name'],axis=1)
predictions = pipe2.predict(test_without)
output = pd.DataFrame({'PassengerId': passengers, 'Survived': predictions})
output.to_csv('submission.csv', index = False)
'''

In [None]:
# Disabled experiment (string literal, nothing executes): mutual-information
# scores of the factorized features against y, printed and drawn as a bar chart.
# NOTE(review): plot_mi calls plt.figure(...) after the bars are drawn, which
# presumably opens a second, empty figure instead of sizing the chart — confirm
# and move it before plt.barh when re-enabling.
# NOTE(review): re-enabling also rebinds the module-level age_imputer.
'''
#mutual info
from sklearn.feature_selection import mutual_info_classif

def plot_mi(scores):
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")
    
    plt.figure(dpi=100, figsize = (8,5))
    plt.show()

x2 = x.copy()

age_imputer = SimpleImputer(strategy = 'mean')
x2['Age'] = age_imputer.fit_transform(pd.DataFrame(x2['Age']))

for c in x2.select_dtypes("object"):
    x2[c], _ = x2[c].factorize()
    
disc_feats = x2.dtypes == int

scores = mutual_info_classif(x2,y, discrete_features=disc_feats)
scores = pd.Series(scores,name = "MI scores", index = x2.columns)
scores = scores.sort_values(ascending=False)

print(scores)
plot_mi(scores)
'''

In [None]:
# Disabled experiment (string literal, nothing executes): fit a 50-tree
# XGBClassifier behind trans on the full feature frame x and write its test-set
# predictions to submission.csv. The validation/cross-validation comparisons
# inside are commented out.
'''from xgboost import XGBClassifier

#encoded_x = trans.fit_transform(x)
#encoded_xwithout = trans_without.fit_transform(x_without)
#model = XGBClassifier()
#valid_curve(model, encoded_xwithout, y, folds=5, pname='n_estimators',plist=[10, 20, 50, 75, 100,200])
model2 = XGBClassifier(n_estimators = 50)
#score = cross_val_score(model2, X = encoded_x, y = y, cv = 5, scoring= 'accuracy')
#score_without = score = cross_val_score(model2, X = encoded_xwithout, y = y, cv = 5, scoring= 'accuracy')
#print(score.mean())
#print(score_without.mean())
passengers = test_data['PassengerId']
pipe = Pipeline(steps = [('encode', trans),('model', model2)])
pipe.fit(x,y)

predictions = pipe.predict(test_data)

output = pd.DataFrame({'PassengerId': passengers, 'Survived': predictions})
output.to_csv('submission.csv',index = False)
'''

In [None]:
# `tf` is used for dtypes and tensors below, so it has to be imported.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

def flow_model(prehead, inputs):
    """Build and compile the classifier: a preprocessing head feeding a dense body.

    Args:
        prehead: Keras model mapping the dict of raw inputs to a single numeric
            feature tensor.
        inputs: dict of column name -> symbolic ``keras.Input``.

    Returns:
        A compiled ``keras.Model`` that takes the raw feature dict and outputs
        one survival probability per row.
    """
    body = keras.Sequential([
        layers.Dense(units = 32, activation = 'relu'),
        layers.Dense(units = 32, activation = 'relu'),
        layers.Dense(units = 32, activation = 'relu'),
        # The string loss 'binary_crossentropy' and the 'binary_accuracy' metric
        # both expect probabilities, so the output layer needs a sigmoid.
        layers.Dense(units = 1, activation = 'sigmoid'),
    ])
    preinputs = prehead(inputs)
    result = body(preinputs)
    model = keras.Model(inputs, result)
    model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['binary_accuracy'])
    return model

# --- raw features -----------------------------------------------------------
features = x_without.copy()
numeric_names = [name for name in features.columns if pd.api.types.is_numeric_dtype(features[name])]
string_names = [name for name in features.columns if name not in numeric_names]

# The Keras preprocessing layers cannot digest missing values: fill numeric gaps
# with the column mean and missing strings with an explicit placeholder token.
numeric_frame = features[numeric_names]
numeric_frame = numeric_frame.fillna(numeric_frame.mean()).astype('float32')
string_frame = features[string_names].fillna('missing').astype(str)

# --- symbolic inputs: one per column, float32 for numbers and string otherwise
inputs = {}
for name in features.columns:
    d = tf.float32 if name in numeric_names else tf.string
    inputs[name] = keras.Input(shape = (1,), name = name, dtype = d)

# --- numeric branch: concatenate, then normalise with statistics of the data
numerics = {name: inputs[name] for name in numeric_names}
tempx = layers.Concatenate()(list(numerics.values()))
normie = layers.Normalization()
normie.adapt(numeric_frame.to_numpy())
numeric_inputs = normie(tempx)
preprocessed_inputs = [numeric_inputs]  # numeric block first, encoded columns appended below

# --- string branch: vocabulary lookup followed by a multi-hot encoding
for name in string_names:
    lookie = layers.StringLookup(vocabulary = sorted(string_frame[name].unique()))
    # vocabulary_size is a method; it also counts the out-of-vocabulary slot.
    one_hot = layers.CategoryEncoding(num_tokens = lookie.vocabulary_size())
    temps = lookie(inputs[name])  # applied to the symbolic input, not the pandas column
    temps = one_hot(temps)
    preprocessed_inputs.append(temps)

# At this point preprocessed_inputs is complete: wrap it as the preprocessing head.
preprocessing_head = keras.Model(inputs, layers.Concatenate()(preprocessed_inputs))

# --- training data as a dict of (n_rows, 1) tensors keyed like `inputs`
feature_arrays = {
    name: tf.constant(numeric_frame[name].to_numpy().reshape(-1, 1))
    for name in numeric_names
}
feature_arrays.update({
    name: tf.constant(string_frame[name].to_numpy(dtype = object).reshape(-1, 1))
    for name in string_names
})
labels = y.to_numpy(dtype = 'float32')

model = flow_model(preprocessing_head, inputs)

# The plots below compare training and validation curves, so a validation share
# is held out. Keras takes it from the end of the data, before shuffling.
VALIDATION_SPLIT = 0.2

history = model.fit(
    x = feature_arrays,
    y = labels,
    validation_split = VALIDATION_SPLIT,
    epochs = 20,
    verbose = 1,
)

# fit() returns a History object; the per-epoch metrics live in its .history dict.
history_frame = pd.DataFrame(history.history)
# Plot only the series that were recorded, so disabling validation does not raise.
loss_columns = [c for c in ['loss', 'val_loss'] if c in history_frame.columns]
accuracy_columns = [c for c in ['binary_accuracy', 'val_binary_accuracy'] if c in history_frame.columns]
history_frame.loc[:, loss_columns].plot()
history_frame.loc[:, accuracy_columns].plot()