#TITANIC COMPETITION


In [205]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV, validation_curve
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import f1_score, precision_score,recall_score, precision_recall_curve,roc_curve,roc_auc_score,confusion_matrix,classification_report,accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier


In [206]:
df = pd.read_csv('train.csv')
df['Sex'] = df['Sex'].replace(['female','male'],[0,1])
df['Age'].fillna(method = 'pad',inplace = True)
df.drop(['Cabin','Name','Ticket'],axis = 1 , inplace = True)
df['Embarked'] = df['Embarked'].replace(['S','C','Q'],[1,2,3])
df.dropna(inplace = True)
X = df.iloc[:,2:]
Y = df['Survived']

In [207]:
##PREPARE DATA

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_train_scaled, X_test_scaled , y_train, y_test = train_test_split(X_scaled,Y,random_state = 0)
X_train, X_test, y_train,y_test = train_test_split(X,Y,random_state = 0)

In [249]:
results_scaled = []

def bestmodel_scaled():
    params_logistic = {'C' : [0.01,0.05,0.1,0.5,1,3,5,7]}
    params_svm = {'C':[0.01,0.05,0.1,0.5,1,3,5],'gamma':[0.001,0.005,0.01,0.05,0.1,0.3,0.5,1,3,5],'kernel':['rbf','linear']}
    parameters_scaled = [params_logistic,params_svm]
    models_scaled = [LogisticRegression(),SVC()]
    for model_sc,params_sc in zip(models_scaled,parameters_scaled):
        final_model_sc = GridSearchCV(model_sc,params_sc,scoring= 'accuracy').fit(X_train_scaled,y_train)
        prediction_scaled = final_model_sc.predict(X_test_scaled)
        score_scaled = accuracy_score(prediction_scaled,y_test)
        results_scaled.append((model_sc,final_model_sc.best_params_,final_model_sc.best_score_,score_scaled))

    return results_scaled
bestmodel_scaled()

[(LogisticRegression(), {'C': 3}, 0.8153742565368646, 0.7488789237668162),
 (SVC(),
  {'C': 1, 'gamma': 5, 'kernel': 'rbf'},
  0.834822129951745,
  0.7847533632286996)]

In [250]:
results = []
def bestmodel():
    params_tree = {'max_features':[3,5,7]}
    params_grandientBoost = {'learning_rate':[0.01,0.05,0.1,0.5,1,2],'n_estimators':[100,150,200]}
    params_randomForest = {'n_estimators':[100,150,200]}
    parameters = [params_tree,params_grandientBoost,params_randomForest]
    models = [DecisionTreeClassifier(),GradientBoostingClassifier(),RandomForestClassifier()]
    for model,params in zip(models,parameters):
        final_model = GridSearchCV(model,params,scoring='accuracy').fit(X_train,y_train)
        prediction = final_model.predict(X_test)
        score = accuracy_score(prediction,y_test)
        results.append((model,final_model.best_params_,final_model.best_score_,score))

    return results
bestmodel()

[(DecisionTreeClassifier(),
  {'max_features': 5},
  0.7957917181012232,
  0.7488789237668162),
 (GradientBoostingClassifier(),
  {'learning_rate': 0.1, 'n_estimators': 200},
  0.8363595556054314,
  0.7802690582959642),
 (RandomForestClassifier(),
  {'n_estimators': 200},
  0.8258332398159579,
  0.7713004484304933)]

In [248]:
final_model = GradientBoostingClassifier(learning_rate=0.1,n_estimators=200).fit(X_train,y_train)
predict = final_model.predict(X_test)
score = accuracy_score(y_test,predict)
score

0.7802690582959642

In [283]:
test_data = pd.read_csv('test.csv')
test_data['Sex'] = test_data['Sex'].replace(['female','male'],[0,1])
test_data.fillna(method = 'pad',inplace = True)
test_data.drop(['Cabin','Name','Ticket'],axis = 1 , inplace = True)
test_data['Embarked'] = test_data['Embarked'].replace(['S','C','Q'],[1,2,3])
#test_data.dropna(inplace = True)
X_test_data = test_data.iloc[:,1:]
final_pred = final_model.predict(X_test_data)


In [303]:
final_df = pd.DataFrame(test_data['PassengerId'])
final_df['Survived'] = final_pred
final_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [294]:
gender = pd.read_csv('gender_submission.csv')
gender

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [304]:
final_df.to_csv('my_submission.csv', index=False)