### Import Libraries

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

### Import Datasets

In [None]:
## directory holding the competition files (single place to change the location)
DATA_DIR = '/kaggle/input/titanic'

## load both splits; the test path previously carried a stray leading slash ('//kaggle/...')
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
## preview the first 5 rows of the train and test sets
print(train.iloc[:5], '\n\n')
print(test.iloc[:5])

In [None]:
## report (rows, columns) of each dataset, train first
for frame in (train, test):
    print(frame.shape)

In [None]:
## list every column with its dtype, train first and test second
train_dtypes, test_dtypes = train.dtypes, test.dtypes
print(train_dtypes, '\n')
print(test_dtypes)

### Check and treat missing values

In [None]:
## count the missing values per column (isna is the same check as isnull)
print(train.isna().sum(), '\n')
print(test.isna().sum())

In [None]:
## work on deep copies so the frames read from disk stay untouched
train2, test2 = train.copy(deep = True), test.copy(deep = True)

In [None]:
## replace null values
## fillna is used instead of replace(np.NAN, ...): the `np.NAN` alias no longer exists in NumPy 2.0

# train: numeric gap -> column mean, categorical gaps -> placeholder labels
train2['Age'] = train2['Age'].fillna(train2['Age'].mean())
train2['Cabin'] = train2['Cabin'].fillna('XX')
train2['Embarked'] = train2['Embarked'].fillna('YY')

# test: same treatment for the columns that are filled here
test2['Age'] = test2['Age'].fillna(test2['Age'].mean())
test2['Cabin'] = test2['Cabin'].fillna('XX')
test2['Fare'] = test2['Fare'].fillna(test2['Fare'].mean())

In [None]:
## confirm the per-column missing-value counts after the fill step
print(train2.isna().sum(), '\n')
print(test2.isna().sum())

### Data Processing

#### define action on variables

In [None]:
##check Data Dictionary at https://www.kaggle.com/competitions/titanic/data

In [None]:
# Survived :- ok
# Pclass :: Ticket class :: ordered categorical variable :- ok
# Name :- drop
# Sex :: unordered categorical variable :- do one-hot encoding
# Age :- scale
# SibSp :- scale
# Parch :- scale
# Ticket :- extract number only then scale
# Fare :- scale
# Cabin : extract first character and do one-hot encoding
# Embarked : do one-hot encoding

#### One-hot Encoding

In [None]:
## define the encoder (dense output, so the result can go straight into a DataFrame)
## scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old keyword;
## try the current name first and fall back for older installations
try:
    oHE = OneHotEncoder(sparse_output = False)
except TypeError:
    oHE = OneHotEncoder(sparse = False)

In [None]:
## Sex // train
## show the category counts, then one-hot encode the column
print(train2['Sex'].value_counts().to_frame().sort_values('Sex'))
Sex1 = pd.DataFrame(
    oHE.fit_transform(train2[['Sex']]),
    columns = ['Sex-female', 'Sex-male'],
)
print(Sex1.head(10))

In [None]:
## Sex // test
print(test2['Sex'].value_counts().to_frame().sort_values('Sex'))

# learn the categories from the training column and only transform the test column,
# so the category -> column mapping is the one the model is trained on
oHE.fit(train2[['Sex']])
Sex2 = pd.DataFrame(
    oHE.transform(test2[['Sex']]),
    columns = ['Sex-' + str(level) for level in oHE.categories_[0]],
)
print(Sex2.head(10))

In [None]:
## Cabin // train
## keep only the first character of each cabin value, then one-hot encode it
Cb1 = train2['Cabin'].str.get(0).to_frame()
print(Cb1.value_counts().to_frame().sort_values('Cabin'))
Cabin1 = pd.DataFrame(
    oHE.fit_transform(Cb1),
    columns = ['Cabin-' + letter for letter in 'ABCDEFGTX'],
)
print(Cabin1.head(10))

In [None]:
## Cabin // test
cb2 = pd.DataFrame(test2['Cabin'].str.get(0))
print(cb2.value_counts().to_frame().sort_values('Cabin'))

# fit on the training cabin letters and only transform the test letters: the test frame
# then gets one column per training category, in the training order, even for a letter
# that never occurs in the test data
oHE.fit(train2['Cabin'].str.get(0).to_frame())
Cabin2 = pd.DataFrame(
    oHE.transform(cb2),
    columns = ['Cabin-' + str(letter) for letter in oHE.categories_[0]],
)
print(Cabin2.head(10))

In [None]:
## Embarked // train
## show the category counts, then one-hot encode the column
print(train2['Embarked'].value_counts().to_frame().sort_values('Embarked'))
Embarked1 = pd.DataFrame(
    oHE.fit_transform(train2[['Embarked']]),
    columns = ['Embarked-' + code for code in ('C', 'Q', 'S', 'YY')],
)
print(Embarked1.head(10))

In [None]:
## Embarked // test
print(test2['Embarked'].value_counts().to_frame().sort_values('Embarked'))

# fit on the training column and only transform the test column, so the test frame has
# the same Embarked columns (including the 'YY' placeholder) in the same order as train
oHE.fit(train2[['Embarked']])
Embarked2 = pd.DataFrame(
    oHE.transform(test2[['Embarked']]),
    columns = ['Embarked-' + str(code) for code in oHE.categories_[0]],
)
print(Embarked2.head(10))

#### Standard Scaler

In [None]:
## extract only numbers from 'Ticket'
def ticket_digits(tickets):
    """Join every run of digits found in each ticket string into a single number.

    Parameters
    ----------
    tickets : pandas.Series
        Ticket values; they are cast to str before matching.

    Returns
    -------
    pandas.Series
        Numeric series on the same index as `tickets`; a ticket without any
        digit becomes NaN.
    """
    # raw string: '\d' inside a normal string literal is an invalid escape sequence
    joined = tickets.astype('str').str.findall(r'\d+').str.join('')
    # an empty string (no digits found) is coerced to NaN
    return pd.to_numeric(joined, errors = 'coerce')


train2['Ticket_num'] = ticket_digits(train2['Ticket'])
test2['Ticket_num'] = ticket_digits(test2['Ticket'])
print (train2['Ticket_num'].head(5), '\n', test2['Ticket_num'].head(5))

In [None]:
## define the scaler: centre each column on its mean and divide by its standard deviation
scl = StandardScaler(with_mean = True, with_std = True)

In [None]:
## // train
## fit the scaler on the numeric training columns and keep the scaled values
num_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Ticket_num', 'Fare']
scl_train = pd.DataFrame(scl.fit_transform(train2[num_cols]), columns = num_cols)
print(scl_train.head(5))

In [None]:
## // test
scale_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Ticket_num', 'Fare']

# the scaler must carry the training statistics: fit on the train columns, then only
# transform the test columns (re-fitting on test would put the two sets on different scales)
scl.fit(train2[scale_cols])
scl_test = pd.DataFrame(scl.transform(test2[scale_cols]), columns = scale_cols)
print(scl_test.head(5))

### Combine column dataframes

In [None]:
## target column as a one-column DataFrame
Survived_train = train2['Survived'].to_frame()

In [None]:
## place the target, the encoded columns and the scaled columns side by side
train_parts = [Survived_train, Sex1, Cabin1, Embarked1, scl_train]
train3 = pd.concat(train_parts, axis = 'columns').reset_index(drop = True)
print(train3.shape, '\n')
print(train3.head(5))

In [None]:
## place the encoded columns and the scaled columns of the test set side by side
test_parts = [Sex2, Cabin2, Embarked2, scl_test]
test3 = pd.concat(test_parts, axis = 'columns').reset_index(drop = True)
print(test3.shape, '\n')
print(test3.head(5))

In [None]:
## check if any null value produced
print(train3.isnull().sum(), '\n')  # separator was '\nn', which printed a stray 'n'
print(test3.isnull().sum())

In [None]:
## replace the null values in 'Ticket_num' with the column mean
## (fillna instead of replace(np.NAN, ...): the `np.NAN` alias no longer exists in NumPy 2.0)
train3['Ticket_num'] = train3['Ticket_num'].fillna(train3['Ticket_num'].mean())
train3.isnull().sum()

### train ~ test(validation) split of processed train data (train3) 

In [None]:
## features / target
X = train3.drop('Survived', axis = 1)
Y = train3['Survived']

## hold out 25% of the rows for validation; the fixed seed makes the split, and every
## metric computed from it, reproducible across runs
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.25, random_state = 0)

### Apply Logistic Regression

In [None]:
## create the classifier and fit it on the training part of the split
LG = LogisticRegression(random_state = 0, solver = 'liblinear')
LG.fit(X = x_train, y = y_train)

In [None]:
## check coefficients
print('Coefficients :' , LG.coef_ , 'Intercept :', LG.intercept_)
## for a classifier, score() is the mean accuracy on the given data, not a coefficient of determination
print('Training accuracy : ', LG.score(x_train, y_train))

In [None]:
## predicted labels for the validation rows, then how many rows fall in each class
val_labels = LG.predict(x_test)
y_pred = pd.Series(val_labels)
y_pred.value_counts()

In [None]:
## confusion matrix and per-class report for the validation predictions
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)
print(cm, '\n')
class_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred, target_names = class_names))

In [None]:
## visualize confusion matrix
## fmt = 'd' prints the cell counts as plain integers; the default '.2g' format
## switches to scientific notation once a count reaches three digits
cmp = sns.heatmap(cm, annot = True, fmt = 'd')

cmp.set_title('Confusion Matrix with labels \n')
cmp.set_xlabel('\n PREDICTED')
cmp.set_ylabel('\n ACTUAL')

# rows and columns of the matrix follow the sorted class labels 0, 1
cmp.xaxis.set_ticklabels(['0', '1'])
cmp.yaxis.set_ticklabels(['0', '1'])
plt.show()

### Apply the Logistic Regression model to the Test set (test3)

In [None]:
## compare the feature columns of the validation frame with those of test3
val_cols, test_cols = x_test.columns, test3.columns
print('Columns in both x_test & test3', val_cols.intersection(test_cols), '\n')
print('Columns in x_test but not in test3', val_cols.difference(test_cols), '\n')
print('Columns in test3 but not in x_test', test_cols.difference(val_cols))

In [None]:
## give test3 exactly the training feature columns, in the training order
## - a training column that test3 lacks is created and filled with 0
## - appending such columns at the end instead would leave test3 in a different column
##   order than the matrix the model was fitted on
test3 = test3.reindex(columns = x_train.columns, fill_value = 0)

In [None]:
## predicted labels for the test rows, then how many rows fall in each class
test_labels = LG.predict(test3)
y_pred_Test = pd.Series(test_labels)
y_pred_Test.value_counts()

In [None]:
## build the submission frame: PassengerId next to the predicted Survived label
## (the former variable name `id` shadowed Python's builtin id())
passenger_ids = test['PassengerId'].to_frame('PassengerId').reset_index(drop = True)
survived = y_pred_Test.to_frame('Survived').reset_index(drop = True)

# both pieces carry a fresh 0..n-1 index, so the column-wise concat pairs them row by row
df_LG = pd.concat([passenger_ids, survived], axis = 1)
print(df_LG)

In [None]:
# write the submission frame to a .csv file, leaving out the row index
submission_path = "kaggale_titanic_LG_v2.csv"
df_LG.to_csv(submission_path, index = False)