In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

%matplotlib inline

### Read the data and save to a dataframe

In [None]:
# Load the Titanic training set into a DataFrame.
data = pd.read_csv(filepath_or_buffer='../input/train.csv')

### Quick look at the data

In [None]:
data.head(10)

In [None]:
data.tail(10)

In [None]:
sns.pairplot(data,x_vars=['Age','Fare','Pclass'],y_vars='Survived',kind='reg',size=7)

In [None]:
data.columns

In [None]:
data.shape

### Check for any Null values 

In [None]:
# Number of missing values in each column.
data.isnull().sum()

### Further look into the data

Look into the Age, Cabin, and Embarked columns.

#### Age

In [None]:
data.groupby('Age').Survived.value_counts(dropna=False)

In [None]:
data.Age.describe()

In [None]:
data.Age.agg(['min','max','mean','std'])

In [None]:
data.Age.agg(['min','max','mean','std']).plot(kind = 'barh')

In [None]:
# Number of survivors at each age, ordered by age, drawn as a line plot.
survivor_age_counts = data.loc[data['Survived'] == 1, 'Age'].value_counts().sort_index()
age_survival = survivor_age_counts.plot(figsize=(13, 8))

age_survival.set_xlabel('Age')
age_survival.set_ylabel('Survival')

In [None]:
# Surviving female passengers whose age is recorded.
# The original AND-ed the raw float Age column into the mask, which is not a
# boolean condition; notna() expresses the (presumed) intent of keeping only
# rows with a known age.
data.loc[(data['Survived'] == 1) & (data['Sex'] == 'female') & data['Age'].notna()]

In [None]:
sns.boxplot(x =data.Sex =='female',y=data['Survived'])

In [None]:
data.loc[data.Sex=='female','Survived'].value_counts()

In [None]:
data.loc[(data['Survived']==1) & (data['Sex']=='female') & (data['Age'])].mean()

In [None]:
sns.pairplot(data,x_vars='Age',y_vars='Survived',kind='reg',size=10)

#### Embarked

In [None]:
pd.crosstab(data.Survived,data.Embarked).plot(kind = 'bar')

In [None]:
data.Embarked.value_counts(dropna=False)

In [None]:
data['Embarked'] = data.Embarked.map({'S':0,'C':1,'Q':2})

In [None]:
data.Embarked.value_counts()

In [None]:
data['Embarked'] = data.Embarked.fillna(value = 0.0)

In [None]:
data.Embarked.value_counts(dropna=False)

In [None]:
data.Embarked.shape

In [None]:
data.head()

In [None]:
data['Embarked'].head()

In [None]:
data.Embarked.shape

In [None]:
data.Embarked.isna().sum()

In [None]:
# Survived regressed on the encoded port of embarkation.
# `height` is the current name of seaborn's former `size` keyword.
sns.pairplot(data, x_vars='Embarked', y_vars='Survived', kind='reg', height=10)

#### Cabin

In [None]:
data.Cabin.value_counts().head()

In [None]:
data[(data.Survived ==1) & (data.Cabin)]

I have decided not to use the Cabin column because it has too many missing values.

### Categorize the Age column into numeric age-group codes

In [None]:
# Replace each age with its age-group code in one pass; missing ages stay NaN.
#   0: age <= 15    1: 15 < age <= 30    2: 30 < age <= 50    3: age > 50
age_bin_edges = [-np.inf, 15, 30, 50, np.inf]
data['Age'] = pd.cut(data['Age'], bins=age_bin_edges, labels=False).astype(data['Age'].dtype)

In [None]:
data.Age.head()

In [None]:
data.Age.isna().sum()

In [None]:
data.dropna(subset=['Age'],axis ='index',how='all',inplace=True)

In [None]:
data.Age.isna().sum()

In [None]:
data.Age.value_counts(dropna=False)

In [None]:
data.Age.isna().sum()

In [None]:
# Survived regressed on the age-group code.
# `height` is the current name of seaborn's former `size` keyword.
sns.pairplot(data, x_vars='Age', y_vars='Survived', height=10, kind='reg')

In [None]:
data.head(10)

In [None]:
data.Age.isna().sum()

### Create Dummy Variables for Sex Column

In [None]:
pd.crosstab(data.Survived,data.Sex).plot(kind='bar')

In [None]:
data['Sex'] = data.Sex.map({'female':0,'male':1})

In [None]:
data.Sex.values

In [None]:
data.head(10)

In [None]:
data.isna().sum()

In [None]:
data.head(100)

### Create a model using Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
import seaborn as sns

In [None]:
feature_cols = ['Pclass','Sex','Age','Embarked','SibSp','Fare']

In [None]:
sns.pairplot(data,x_vars=feature_cols,y_vars = 'Survived',kind = 'reg',size = 4,aspect=0.9)

In [None]:
feature_cols = ['Pclass','Sex','Age','Embarked','SibSp','Fare']

X = data[feature_cols]

y = data.Survived

In [None]:
print(type(X))
print(X.shape)

In [None]:
print(type(y))
print(y.shape)

In [None]:
#Instantiate the Model

linreg = LinearRegression()

In [None]:
# Fit the model 

linreg.fit(X,y)

In [None]:
linreg.intercept_

In [None]:
linreg.coef_

In [None]:
feature_list = list(zip(feature_cols,linreg.coef_))

In [None]:
feature_list

### Check the cross-validation score

In [None]:
# 10 fold cross validation with all 4 features

linreg = LinearRegression()

score = cross_val_score(linreg,X,y,cv=10,scoring='neg_mean_squared_error')

In [None]:
score

In [None]:
# Make the scores +

msc_sc = -score
print(msc_sc)

In [None]:
# Calculate te RMSE

rmse = np.sqrt(msc_sc)
print(rmse)

In [None]:
## Print the mean of RMSE

print(rmse.mean())

### Make Predictions

In [None]:
# Load the test set 

test = pd.read_csv('../input/test.csv')

In [None]:
test.head()

#### Prepare the data. 

In [None]:
test['Sex'] = test.Sex.map({'female':0,'male':1})

In [None]:
test.Sex.value_counts()

In [None]:
test.Embarked.value_counts()

In [None]:
test['Embarked'] = test.Embarked.map({'S':0,'C':1,'Q':2})

In [None]:
test.Embarked.value_counts()

In [None]:
test.loc[test['Age'] <= 15,'Age'] = 0 

test.loc[(test['Age'] > 15) & (test['Age'] <= 30), 'Age'] = 1

test.loc[(test['Age'] > 30) & (test['Age'] <= 50), 'Age'] = 2

test.loc[(test['Age'] > 50),'Age'] = 3

In [None]:
test.isna().sum()

In [None]:
test.Age.dropna(axis='index',how='any',inplace=True)

In [None]:
test['Age'] = test.Age.isna().sum()

In [None]:
test.isna().sum()

### Select X

In [None]:
feature_cols = ['Pclass','Sex','Age','Embarked','SibSp']

X_test = test[feature_cols]

In [None]:
linreg = LinearRegression()

linreg.fit(X,y)

In [None]:
y_pred = linreg.predict(X)

In [None]:
y_pred[X_test]

## Fit and test a classification model. 

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
feature_cols = ['Pclass','Sex','Age','Embarked','SibSp','Fare']

X = data[feature_cols]

y = data.Survived

In [None]:
logreg = LogisticRegression()

logreg.fit(X,y)

### Check the cross-validation score

In [None]:
score = cross_val_score(logreg,X,y,cv=10,scoring='neg_mean_squared_error')

score

In [None]:
mse = - score

In [None]:
## NOw calculate the RMSE

rmse = np.sqrt(mse)
print(rmse)

### Predict from the loaded test set

In [None]:
test_cols = ['Pclass','Sex','Age','Embarked','SibSp','Fare']

X_test = test[test_cols]

In [None]:
X

In [None]:
X_test

In [None]:
X_test.isna().sum()

In [None]:
X_test.dtypes

In [None]:
X_test.Fare.fillna(X_test.Fare.mean(),inplace=True)

In [None]:
X_test.isna().sum()

In [None]:
logreg = LogisticRegression()

logreg.fit(X,y)

In [None]:
y_pred = logreg.predict(X_test)

In [None]:
y_pred

In [None]:
pd.get_option('display.max_rows')

In [None]:
pd.set_option('display.max_rows',None)

In [None]:
X_test.shape

In [None]:
test.PassengerId.shape

### Create the Kaggle submission file

In [None]:
# Create a pandas Dataframe

pd.DataFrame({'PasssngerId':test.PassengerId,'Survived':y_pred})

In [None]:
# now save PassengerId columns as the index

pd.DataFrame({'PasssngerId':test.PassengerId,'Survived':y_pred}).set_index('PasssngerId')

In [None]:
# Build the submission frame and write it to CSV, indexed by PassengerId.
submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': y_pred})
submission.set_index('PassengerId').to_csv('Titaanic log reg2.csv')