In [1]:
import pandas as pd
import numpy as np

In [2]:
train_data = pd.read_csv('train.csv', header = 0, delimiter = ',')
test_data = pd.read_csv('test.csv', header = 0, delimiter = ',')

In [3]:
women = train_data.loc[train_data.Sex == 'female']["Survived"]
rate_women = sum(women)/len(women)

print("% of women who survived:", rate_women)

% of women who survived: 0.7420382165605095


In [4]:
men = train_data.loc[train_data.Sex == 'male']["Survived"]
rate_men = sum(men)/len(men)

print("% of men who survived:", rate_men)

% of men who survived: 0.18890814558058924


In [5]:
from sklearn.ensemble import RandomForestClassifier

y = train_data["Survived"]

# change categorical data to quantitative
features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [6]:
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


## Cleaning Data

In [7]:
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

#### Select columns only wanted

In [8]:
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [49]:
train_calc = train_data.iloc[:,[2,4,5,6,7,9,11]]
train_calc.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


#### Make categorical data quantitative

In [50]:
embarked_dict = {'C':0,
                'Q':1,
                'S':2}
sex_dict = {'male': 0,
           'female':1}

train_calc.replace({'Embarked': embarked_dict,
                   'Sex': sex_dict}, inplace = True)
train_calc.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_calc.replace({'Embarked': embarked_dict,


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,22.0,1,0,7.25,2.0
1,1,1,38.0,1,0,71.2833,0.0
2,3,1,26.0,0,0,7.925,2.0
3,1,1,35.0,1,0,53.1,2.0
4,3,0,35.0,0,0,8.05,2.0


#### Drop the 2 'Embarked' NA values

In [51]:
train_calc = train_calc[train_calc['Embarked'].notna()]

#### Replace 'nan' ages with Mean Age

In [52]:
train_calc['Age'] = train_calc['Age'].fillna(28.5)

#### Remove missing 'Embarked' from original data set

In [71]:
train_data[train_data['Embarked'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [55]:
train_no_embarkNA = train_data[train_data['Embarked'].notna()]

## Multiple Regression

In [56]:
import statsmodels.api as sm

model=sm.OLS(train_no_embarkNA['Survived'], train_calc)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,Survived,R-squared (uncentered):,0.574
Model:,OLS,Adj. R-squared (uncentered):,0.57
Method:,Least Squares,F-statistic:,169.6
Date:,"Thu, 31 Mar 2022",Prob (F-statistic):,1.53e-158
Time:,12:49:42,Log-Likelihood:,-455.19
No. Observations:,889,AIC:,924.4
Df Residuals:,882,BIC:,957.9
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Pclass,0.0075,0.013,0.590,0.555,-0.017,0.032
Sex,0.5799,0.029,19.878,0.000,0.523,0.637
Age,0.0018,0.001,2.061,0.040,8.76e-05,0.004
SibSp,-0.0391,0.014,-2.804,0.005,-0.067,-0.012
Parch,-0.0235,0.019,-1.212,0.226,-0.062,0.015
Fare,0.0022,0.000,7.147,0.000,0.002,0.003
Embarked,0.0285,0.017,1.692,0.091,-0.005,0.061

0,1,2,3
Omnibus:,41.283,Durbin-Watson:,1.932
Prob(Omnibus):,0.0,Jarque-Bera (JB):,45.868
Skew:,0.548,Prob(JB):,1.1e-10
Kurtosis:,3.195,Cond. No.,136.0


## Updated Equation

In [57]:
train_calc.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')

#### drop columns with pval > 0.05

In [58]:
train_calc = train_calc[['Sex', 'Age', 'SibSp', 'Fare']]

In [59]:
model=sm.OLS(train_no_embarkNA['Survived'], train_calc)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,Survived,R-squared (uncentered):,0.571
Model:,OLS,Adj. R-squared (uncentered):,0.569
Method:,Least Squares,F-statistic:,293.9
Date:,"Thu, 31 Mar 2022",Prob (F-statistic):,9.440000000000001e-161
Time:,12:51:55,Log-Likelihood:,-458.51
No. Observations:,889,AIC:,925.0
Df Residuals:,885,BIC:,944.2
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Sex,0.5822,0.028,20.782,0.000,0.527,0.637
Age,0.0035,0.001,6.419,0.000,0.002,0.005
SibSp,-0.0351,0.012,-2.865,0.004,-0.059,-0.011
Fare,0.0020,0.000,6.860,0.000,0.001,0.003

0,1,2,3
Omnibus:,40.228,Durbin-Watson:,1.931
Prob(Omnibus):,0.0,Jarque-Bera (JB):,44.519
Skew:,0.532,Prob(JB):,2.15e-10
Kurtosis:,3.261,Cond. No.,128.0


## Test Dataset

#### Clean test_data

In [75]:
test_data = test_data[['Sex', 'Age', 'SibSp', 'Fare']]


86

In [76]:
train_calc['Age'] = train_calc['Age'].fillna(28.5)

In [77]:
# change categorical data to quantitative

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(train_data, y)
predictions = model.predict(test_data)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

ValueError: could not convert string to float: 'Braund, Mr. Owen Harris'

## Totals to reference

In [None]:
sum(train_data['Survived'])/len(train_data)