In [178]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


In [179]:
#Taking the csv file data to store it in titanic data
#Priting the columns to choose the best features to work on it
titanic_data_file_path = '../input/titanic/train.csv'
titanic_data = pd.read_csv(titanic_data_file_path) 
titanic_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [180]:
################################################ Preprocssing ###########################################
#Making new dataframe its name is titanic_data_updated to hold only the coloumns that will be used in building the model
#after removing the redundant features
#then check if there is any missing values in the new dataframe
titanic_data_updated_features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch','Survived']
titanic_data_updated = titanic_data[titanic_data_updated_features]
print(titanic_data_updated.isnull().sum())


Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Survived      0
dtype: int64


In [181]:
#There are 177 missing value in Age ,So it will be filled with the median of Ages in the dataframe
#Check again that there are no missing value after filling the missing values with median to make sure there are
#no more missing values
titanic_data_updated['Age'] = titanic_data_updated['Age'].fillna(titanic_data_updated['Age'].median())
print(titanic_data_updated.isnull().sum())



Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Survived    0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [182]:
#the output that will be predicted will be the "Survived" so it will be stored in y and the rest of features will
#be stored in X to fill the model with them

y = titanic_data_updated.Survived
titanic_data_working_features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']
X = titanic_data_updated[titanic_data_working_features]

In [183]:
#converting the the Sex coloumns to numeric value instead of string using LabelEncoder function because there
#are used models in this file can't deal with strings
stringToNum = preprocessing.LabelEncoder()
X['Sex']= stringToNum.fit_transform(X['Sex'])
X.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch
0,3,1,22.0,1,0
1,1,0,38.0,1,0
2,3,0,26.0,0,0
3,1,0,35.0,1,0
4,3,1,35.0,0,0


In [184]:
#Dividing the data in X and y to use some of them in trainnig the model and the some of them for testing the model
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

In [185]:
################################################ Model building,training and predicting ########
### Random forest Model
#building the Random forest Model using RandomForestClassifier function with
#random_state=0 to have the same result every times
#fitting the model with the data extracted before train_X for features and train_y for output using fit function
#predicting the output for Val_X extracted before using predict function
titanic_model_RandomForest = RandomForestClassifier(random_state=0)
titanic_model_RandomForest.fit(train_X, train_y)
RandomForest_predictions = titanic_model_RandomForest.predict(val_X)


In [186]:
### Decision Tree Model
#building the Random forest Model using DecisionTreeClassifier function with
#random_state=0 to have the same result every times
#fitting the model with the data extracted before train_X for features and train_y for output using fit function
#predicting the output for Val_X extracted before using predict function
titanic_model_DecisionTree = DecisionTreeClassifier(random_state=0)
titanic_model_DecisionTree.fit(train_X, train_y)
DecisionTree_predictions = titanic_model_DecisionTree.predict(val_X)


In [187]:
### Logistic Regression Model
#building the Random forest Model using LogisticRegression function with
#random_state=0 to have the same result every times
#fitting the model with the data extracted before train_X for features and train_y for output using fit function
#predicting the output for Val_X extracted before using predict function
titanic_model_LogisticRegression = LogisticRegression(random_state=0)
titanic_model_LogisticRegression.fit(train_X, train_y)
LogisticRegression_predictions = titanic_model_LogisticRegression.predict(val_X)


In [188]:
### SVC Model
#building the Random forest Model using SVC function with
#random_state=0 to have the same result every times
#fitting the model with the data extracted before train_X for features and train_y for output using fit function
#predicting the output for Val_X extracted before using predict function
titanic_model_SVC = SVC(random_state=0)
titanic_model_SVC.fit(train_X, train_y)
SVC_predictions = titanic_model_SVC.predict(val_X)


In [189]:
######################################### Evaluation #########################
#printing the mean square error for every model using val_y extracted before and the output of the predictions
print("RandomForest mean_absolute_error: ",mean_absolute_error(val_y, RandomForest_predictions))
print("DecisionTree_predictions mean_absolute_error: ",mean_absolute_error(val_y, DecisionTree_predictions))
print("LogisticRegression_predictions mean_absolute_error: ",mean_absolute_error(val_y, LogisticRegression_predictions))
print("SVC_predictions mean_absolute_error: ",mean_absolute_error(val_y, SVC_predictions))

RandomForest mean_absolute_error:  0.20179372197309417
DecisionTree_predictions mean_absolute_error:  0.21524663677130046
LogisticRegression_predictions mean_absolute_error:  0.20179372197309417
SVC_predictions mean_absolute_error:  0.34080717488789236


In [190]:
#we can find that RandomForest and logisticRegression model is the best models of the models used in this file
#so we will build another model with RandomForest type and fitting it with the whole data which is used before to train or
#test to train it with more data
#so it will be fitted with X , y 
titanic_model = RandomForestClassifier(random_state=1)
titanic_model.fit(X, y)

RandomForestClassifier(random_state=1)

In [191]:
#Getting tthe testing data from the file and store them in titanic_test_data
#building new dataframe its name is Xtest with only the features used to build the model and removing the rest
#checking if there are any missing data in the testing data
titanic_test_file_path = '../input/titanic/test.csv'
titanic_test_data = pd.read_csv(titanic_test_file_path)

titanic_test_data_features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']
Xtest = titanic_test_data[titanic_test_data_features]
print(Xtest.isnull().sum())



Pclass     0
Sex        0
Age       86
SibSp      0
Parch      0
dtype: int64


In [192]:
#found 86 missing data in Age feature so it will be filled with the median Age extracted from the training data
#converting the the Sex coloumns to numeric value instead of string using LabelEncoder function because the 
#RandomForest model can't deal with string type
#making sure that there are no more missing data with isnull().sum() function
Xtest['Sex']= stringToNum.fit_transform(Xtest['Sex'])
Xtest['Age'] = Xtest['Age'].fillna(titanic_data_updated['Age'].median())
print(Xtest.isnull().sum())


Pclass    0
Sex       0
Age       0
SibSp     0
Parch     0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [193]:
#Testing the model using predict function and the input is Xtest which extracted from the test file data
titanic_passangers_test_predictions = titanic_model.predict(Xtest)

In [194]:
#creating a csv file to store the PassangerId feature and the predicted output for this passanger
predictionDataframeFinal = pd.DataFrame(columns=["PassengerId", "Survived"])
predictionDataframeFinal["PassengerId"] = titanic_test_data['PassengerId']
predictionDataframeFinal["Survived"] = titanic_passangers_test_predictions
predictionDataframeFinal.to_csv("predictionDataframeFinal.csv", index=False)
