In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Location of the training CSV relative to the working directory.
# os.path.join inserts the separator that plain string concatenation omitted
# (the old value looked like "<cwd>../input/train.csv").
path = os.path.join(os.getcwd(), '..', 'input', 'train.csv')

In [None]:
data = pd.read_csv('../input/train.csv')

In [None]:
print(data.head())

In [None]:
data.describe()

In [None]:
data.isnull().sum()

In [None]:
# percent of missing "Age" 
print('Percent of missing "Age" records is %.2f%%' %((data['Age'].isnull().sum()/data.shape[0])*100))

In [None]:
ax = data["Age"].hist(bins=15, density=True, stacked=True, color='teal', alpha=0.6)
data["Age"].plot(kind='density', color='teal')
ax.set(xlabel='Age')
plt.xlim(-10,85)
plt.show()

In [None]:
# mean age
print('The mean of "Age" is %.2f' %(data["Age"].mean(skipna=True)))
# median age
print('The median of "Age" is %.2f' %(data["Age"].median(skipna=True)))

In [None]:
print('Percent of missing "Cabin" records is %.2f%%' %((data['Cabin'].isnull().sum()/data.shape[0])*100))


## 2.3. Embarked - Missing Values

In [None]:
# percent of missing "Embarked" 
print('Percent of missing "Embarked" records is %.2f%%' %((data['Embarked'].isnull().sum()/data.shape[0])*100))

In [None]:
import seaborn as sns
print('Boarded passengers grouped by port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton):')
print(data['Embarked'].value_counts())
sns.countplot(x='Embarked', data=data, palette='Set2')
plt.show()

In [None]:
print('The most common boarding port of embarkation is %s.' %data['Embarked'].value_counts().idxmax())

## 2.4. Final Adjustments to Data (Train & Test)

In [None]:

# Fill missing ages with the median age and missing ports with the most
# frequent port. The results are assigned back to the frame: calling
# fillna(inplace=True) on a column selection is chained assignment, which is
# deprecated and does not modify the frame under pandas Copy-on-Write.
age_median = data["Age"].median(skipna=True)
most_common_port = data['Embarked'].value_counts().idxmax()
data["Age"] = data["Age"].fillna(age_median)
data["Embarked"] = data["Embarked"].fillna(most_common_port)
# "Cabin" is not used further; errors="ignore" lets the cell be re-run safely.
data = data.drop(columns='Cabin', errors='ignore')

In [None]:
data.isnull().sum()

In [None]:
data.head()

In [None]:
# Histogram of "Age" (normalised to a density) with a kernel-density curve
# drawn on the same axes.
# NOTE(review): by this point "Age" has already been imputed by the fillna
# cell above, so the 'Raw Age' legend label looks stale — confirm which
# series this figure is meant to show.
plt.figure(figsize=(15,8))
ax = data["Age"].hist(bins=15, density=True, stacked=True, color='teal', alpha=0.6)
data["Age"].plot(kind='density', color='teal')
ax.legend(['Raw Age'])
ax.set(xlabel='Age')
# Fixed x-range so the density tails are visible on both sides.
plt.xlim(-10,85)
plt.show()

In [None]:
# TravelAlone is 1 when the passenger has neither siblings/spouse ("SibSp")
# nor parents/children ("Parch") aboard, and 0 otherwise.
has_family = (data["SibSp"] + data["Parch"]) > 0
data['TravelAlone'] = np.where(has_family, 0, 1)
# The two source counts are replaced by the single indicator.
data.drop(columns=['SibSp', 'Parch'], inplace=True)

Also create categorical variables for Passenger Class ("Pclass"), Gender ("Sex"), and Port Embarked ("Embarked"). 

In [None]:
# One-hot encode the categorical columns. dtype=int keeps the indicators
# numeric: pandas >= 2.0 defaults to bool, and a frame mixing bool with
# int/float columns converts to an object-dtype array, which breaks np.exp
# in the model code further down.
data = pd.get_dummies(data, columns=["Pclass", "Embarked", "Sex"], dtype=int)
# 'Sex_female' duplicates the information in 'Sex_male'; the identifier-like
# columns are not used as features.
data = data.drop(columns=['Sex_female', 'PassengerId', 'Name', 'Ticket'])

data.head()

### Now, apply the same changes to the test data. <br>
I will apply the same imputation for "Age" in the Test data as I did for my Training data (if missing, Age = 28).  <br> I'll also remove the "Cabin" variable from the test data, as I've decided not to include it in my analysis. <br> There were no missing values in the "Embarked" port variable. <br> I'll add the dummy variables to finalize the test set.  <br> Finally, I'll impute the 1 missing value for "Fare" with the median, 14.45.

In [None]:
# os.path.join supplies the separator that string concatenation dropped.
path = os.path.join(os.getcwd(), '..', 'input', 'test.csv')
test_df = pd.read_csv('../input/test.csv')
test_data = test_df.copy()

# Impute with medians taken from the training frame `data`. Results are
# assigned back because fillna(inplace=True) on a column selection is chained
# assignment and does not update the frame under pandas Copy-on-Write.
test_data["Age"] = test_data["Age"].fillna(data["Age"].median(skipna=True))
test_data["Fare"] = test_data["Fare"].fillna(data["Fare"].median(skipna=True))
test_data = test_data.drop(columns='Cabin')

# 1 when the passenger has no siblings/spouse and no parents/children aboard.
test_data['TravelAlone'] = np.where((test_data["SibSp"] + test_data["Parch"]) > 0, 0, 1)
test_data = test_data.drop(columns=['SibSp', 'Parch'])

# dtype=int keeps the indicator columns numeric (the pandas >= 2.0 default is
# bool, which turns the mixed frame into an object array once converted).
testing = pd.get_dummies(test_data, columns=["Pclass", "Embarked", "Sex"], dtype=int)
testing = testing.drop(columns=['Sex_female', 'PassengerId', 'Name', 'Ticket'])

final_test = testing
final_test.head()

In [None]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)); NumPy applies it element-wise to array input."""
    exp_neg_z = np.exp(-z)
    return 1 / (1 + exp_neg_z)

In [None]:
def cost(theta, X, y):
    """Mean logistic-regression cost (negative log-likelihood) for parameters `theta`.

    Parameters
    ----------
    theta : array-like with n entries (any shape; it is flattened).
    X : array-like of shape (m, n), one row per sample.
    y : array-like with m entries; a 1-D vector and an (m, 1) column are
        treated identically.

    Returns
    -------
    numpy.float64
        sum(-y*log(p) - (1-y)*log(1-p)) / m, where p = sigmoid(X @ theta).
    """
    # Plain float ndarrays: this also converts object-dtype input (for example
    # a DataFrame mixing bool and float columns) that np.exp cannot handle.
    theta = np.asarray(theta, dtype=float).ravel()
    X = np.asarray(X, dtype=float)
    # Flatten y so a 1-D label vector cannot broadcast against the
    # predictions into an (m, m) array.
    y = np.asarray(y, dtype=float).ravel()

    # Keep probabilities strictly inside (0, 1): a saturated sigmoid would
    # otherwise produce 0 * log(0) = NaN.
    eps = np.finfo(float).eps
    prob = np.clip(np.asarray(sigmoid(X @ theta)).ravel(), eps, 1 - eps)

    first = -y * np.log(prob)
    second = (1 - y) * np.log(1 - prob)
    return np.sum(first - second) / len(X)

In [None]:
# Move the target column 'Survived' to the end so features and label can be
# separated by position later on.
cols = data.columns.tolist()
cols.remove('Survived')
data = data[cols + ['Survived']]
data.head()

In [None]:
# Add a constant column of ones as the first column; it acts as the intercept
# term in the matrix product with theta. The membership check lets the cell be
# re-run without insert() raising because the column already exists.
if 'Ones' not in data.columns:
    data.insert(0, 'Ones', 1)

# set X (training data) and y (target variable)




# convert to numpy arrays and initalize the parameter array theta


In [None]:
cols = data.shape[1]
cols

In [None]:
X = data.iloc[:,0:cols-1]


In [None]:
X

In [None]:
Y = data.iloc[:,cols-1:cols]



In [None]:
Y.shape

In [None]:
X.shape

In [None]:
# One parameter per column of X (intercept included), all starting at zero.
# The length is read from X instead of being hard-coded so it follows any
# change in the feature set.
theta = np.zeros(X.shape[1])

In [None]:
theta.shape

In [None]:
X.shape, theta.shape, Y.shape

In [None]:
cost(theta, X, Y)

In [None]:
def gradient(theta, X, y):
    """Gradient of the mean logistic-regression cost with respect to `theta`.

    Parameters
    ----------
    theta : array-like with n entries (any shape; it is flattened).
    X : array-like of shape (m, n), one row per sample.
    y : array-like with m entries; a 1-D vector and an (m, 1) column are
        treated identically.

    Returns
    -------
    numpy.ndarray of shape (n,)
        X.T @ (sigmoid(X @ theta) - y) / m.
    """
    # Plain float ndarrays: this also converts object-dtype input that np.exp
    # cannot handle.
    theta = np.asarray(theta, dtype=float).ravel()
    X = np.asarray(X, dtype=float)
    # Flatten y so a 1-D label vector cannot broadcast against the
    # predictions into an (m, m) error array.
    y = np.asarray(y, dtype=float).ravel()

    error = np.asarray(sigmoid(X @ theta)).ravel() - y
    # One matrix product replaces the per-parameter Python loop.
    return (X.T @ error) / len(X)

In [None]:
theta.shape

In [None]:
import scipy.optimize as opt

# Minimise `cost` from the initial `theta` with SciPy's truncated-Newton
# routine, using the analytic `gradient`. The first element of the returned
# tuple holds the fitted parameters.
result = opt.fmin_tnc(cost, theta, fprime=gradient, args=(X, Y))
# Cost at the fitted parameters (shown as the cell output).
cost(result[0], X, Y)

In [None]:

# Wrap the fitted parameters and the data as np.matrix so that `*` in
# predict() performs matrix multiplication.
theta_min = np.matrix(result[0])
# dtype=float guards against an object-dtype matrix (produced when the frame
# mixes bool and numeric columns), on which np.exp fails.
X = np.matrix(X, dtype=float)
Y = np.matrix(Y, dtype=float)
X.shape, theta.shape, result[0].shape, theta_min.shape, theta_min.T.shape

In [None]:
def predict(theta, X):
    """Return a 0/1 class prediction for every row of `X`.

    Parameters
    ----------
    theta : array-like with n entries (any shape; it is flattened).
    X : array-like of shape (m, n), one row per sample.

    Returns
    -------
    list of int
        1 where sigmoid(X @ theta) >= 0.5, otherwise 0.
    """
    # Convert to plain float ndarrays and use `@`, so the result no longer
    # depends on both arguments being np.matrix (with ndarrays `*` would
    # multiply element-wise instead of taking the matrix product).
    theta = np.asarray(theta, dtype=float).ravel()
    X = np.asarray(X, dtype=float)
    probability = np.asarray(sigmoid(X @ theta)).ravel()
    return (probability >= 0.5).astype(int).tolist()

# Score the fitted parameters on X and Y, i.e. the same data used for fitting.
theta_min = np.matrix(result[0])
predictions = predict(theta_min, X)
# predict() only ever returns 0 or 1, so a hit is a plain equality between
# the prediction and the label.
correct = [1 if a == b else 0 for a, b in zip(predictions, Y)]
temp = sum(correct)
accuracy_test = temp / len(correct)
print('accuracy_test = {0}%'.format(accuracy_test * 100))


In [None]:
Y.shape

In [None]:
final_test

In [None]:
# Intercept column for the test features, mirroring the training frame. The
# membership check lets the cell be re-run without insert() raising because
# the column already exists.
if 'Ones' not in final_test.columns:
    final_test.insert(0, 'Ones', 1)

In [None]:
# Append a placeholder 'Survived' column as the last column. The position is
# taken from the frame instead of the hard-coded 11, and the membership check
# keeps a re-run from raising on the duplicate column.
if 'Survived' not in final_test.columns:
    final_test.insert(len(final_test.columns), 'Survived', 1)

In [None]:
X_test = final_test.iloc[:,0:cols-1]

In [None]:
# dtype=float guards against an object-dtype matrix (produced when the frame
# mixes bool and numeric columns), on which np.exp fails inside predict().
X_test = np.matrix(X_test, dtype=float)

In [None]:
Y_test = final_test.iloc[:,cols-1:cols]

In [None]:
Y_test = np.matrix(Y_test)

In [None]:
final_test

In [None]:
predict(theta_min,X_test)

In [None]:
Survived_test = predict(theta_min,X_test)

In [None]:
Survived_test= pd.DataFrame(Survived_test)

In [None]:
Survived_test.head()

In [None]:
final_test['Survived'] = Survived_test

In [None]:
final_test['Survived'].count()


In [None]:
test_df['PassengerId'].count()

In [None]:
final_test['Survived'].head()

In [None]:
# Pair every test passenger id with its predicted label; rows are matched on
# the index of the two frames.
df1 = test_df[['PassengerId']]
df2 = final_test[['Survived']]
concat = df1.merge(df2, left_index=True, right_index=True)
concat.head()

In [None]:
concat.head()

In [None]:
concat.to_csv('concat.csv',index=False)