In [1]:
import pandas as pd
import numpy as np
from scipy.stats import mode
import titanic.data_preprocessing.preprocessing


In [2]:
# Dataset loading helper
def read_data(file, separator):
    """Read a delimited text file into a pandas DataFrame.

    Parameters
    ----------
    file
        Forwarded unchanged as the first argument of ``pandas.read_csv``.
    separator
        Field delimiter, forwarded as the ``sep`` argument.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    frame = pd.read_csv(file, sep=separator)
    return frame

In [3]:
# Load the comma-separated training file into the notebook-level frame `df`,
# then print its column dtypes and non-null counts.
df = read_data("data/train.csv", ',')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [4]:
#nbr_lignes = df.shape[0]
#nbr_lignes

In [5]:
# Detect the columns that contain missing values
def features_nan_detect(nbr_entries, data=None):
    """Report and return the columns that contain missing values.

    For every column with at least one missing value, the count and the
    percentage relative to ``nbr_entries`` are printed.

    Parameters
    ----------
    nbr_entries : int
        Denominator used for the printed percentage.
    data : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``df`` is used so
        existing one-argument calls keep working; passing the frame explicitly
        avoids relying on kernel state.

    Returns
    -------
    list
        Labels of the columns with at least one missing value, in column order.
    """
    if data is None:
        data = df  # backward-compatible fallback to the notebook global

    features_nan = []
    for column in data.columns:
        missing = data[column].isna().sum()
        if missing != 0:
            features_nan.append(column)
            print(column, "---Nombre", missing)
            print(column, "---Proportion", missing/nbr_entries * 100)
    return features_nan

In [6]:
# Collect the columns that have missing values; the row count of `df` is
# passed as the denominator for the printed percentages.
features_tabl_nan = features_nan_detect(df.shape[0])
features_tabl_nan

Age ---Nombre 177
Age ---Proportion 19.865319865319865
Cabin ---Nombre 687
Cabin ---Proportion 77.10437710437711
Embarked ---Nombre 2
Embarked ---Proportion 0.22446689113355783


['Age', 'Cabin', 'Embarked']

In [7]:
# Replace missing values with the column median (numeric) or mode (otherwise)
def features_nan_replace(features_tabl, data=None):
    """Fill missing values in the given columns, modifying the frame in place.

    Columns whose dtype is ``float64`` or ``int64`` are filled with the median
    of their observed values; every other column is filled with its most
    frequent observed value (the first one when several are tied).

    The fill value is computed once per column and applied with a single
    vectorised ``fillna`` instead of being recomputed for every missing cell.
    Columns without missing values, or without any observed value to derive a
    fill value from, are left untouched.

    Parameters
    ----------
    features_tabl : iterable
        Labels of the columns to impute.
    data : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``df`` is used so
        existing one-argument calls keep working.

    Returns
    -------
    None
    """
    if data is None:
        data = df  # backward-compatible fallback to the notebook global

    for column in features_tabl:
        values = data[column]
        observed = values.dropna()
        # Nothing to fill, or nothing to compute a fill value from
        # (mode() of an all-missing column is empty).
        if observed.empty or len(observed) == len(values):
            continue

        if values.dtype in ['float64', 'int64']:
            fill_value = observed.median()
        else:
            fill_value = observed.mode().iloc[0]
        data[column] = values.fillna(fill_value)

In [8]:
# Impute every column flagged by the detection step; this modifies `df` itself.
features_nan_replace(features_tabl_nan)

In [9]:
# Run the detection again on the imputed frame; an empty list means no column
# has missing values left.
d = features_nan_detect(df.shape[0])
d

[]

In [10]:
# Integer columns that actually encode classes (not integer quantities) are
# dummy-encoded later on, so they are converted first.
# Element-wise type conversion helper
def features_convert_types(to_convert, type_convert, data=None):
    """Convert the values of the given columns element by element, in place.

    Parameters
    ----------
    to_convert : iterable
        Labels of the columns to convert.
    type_convert : callable
        Applied to every value of each listed column (e.g. ``str``).
    data : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``df`` is used so
        existing two-argument calls keep working.

    Returns
    -------
    None
    """
    if data is None:
        data = df  # backward-compatible fallback to the notebook global

    for column in to_convert:
        data[column] = data[column].apply(type_convert)

In [11]:
# Turn the Pclass and Parch values into strings so they can be dummy-encoded
# as class labels, then check the resulting dtype of Pclass.
features_convert_types(['Pclass', 'Parch'], str)
df['Pclass'].dtypes

dtype('O')

In [12]:
# Dummy (one-hot) encoding helper
def features_dumnify(x, to_dumnify):
    """Return a copy of ``x`` with the listed columns replaced by dummy columns.

    Parameters
    ----------
    x : pandas.DataFrame
        Input frame; it is not modified.
    to_dumnify : list
        Labels of the columns to encode, forwarded to ``pandas.get_dummies``
        as ``columns``.

    Returns
    -------
    pandas.DataFrame
        The encoded frame produced by ``pandas.get_dummies``.
    """
    return pd.get_dummies(x, columns=to_dumnify)

In [13]:
# Build the feature matrix X and the target Y
def get_X_Y(x_features, y_feature, data=None):
    """Select the feature columns and the target column from a frame.

    Parameters
    ----------
    x_features : list
        Labels of the feature columns.
    y_feature
        Label of the target column.
    data : pandas.DataFrame, optional
        Frame to select from. When omitted, the notebook-level ``df`` is used
        so existing two-argument calls keep working.

    Returns
    -------
    tuple
        ``(data[x_features], data[y_feature])``.
    """
    if data is None:
        data = df  # backward-compatible fallback to the notebook global
    return data[x_features], data[y_feature]

In [14]:
# Use three columns as predictors and `Survived` as the target.
features = ['Pclass', 'SibSp', 'Parch']
X, Y = get_X_Y(features, 'Survived')

In [15]:
# Dummy-encode the two class-like columns; SibSp is kept as a plain number.
# Note: this rebinds X, so re-running the cell without re-running the previous
# one fails because the Parch/Pclass columns no longer exist in X.
X = features_dumnify(X, ['Parch', 'Pclass'])
X.head()

Unnamed: 0,SibSp,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Pclass_1,Pclass_2,Pclass_3
0,1,1,0,0,0,0,0,0,0,0,1
1,1,1,0,0,0,0,0,0,1,0,0
2,0,1,0,0,0,0,0,0,0,0,1
3,1,1,0,0,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0,0,0,0,1


In [16]:
# Preview the frame after imputation and type conversion.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,B96 B98,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,B96 B98,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,B96 B98,S


In [17]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression

In [18]:
# Logistic regression with the library's default settings.
model = LogisticRegression()

In [19]:
# Hold out 30% of the rows for evaluation; random_state is fixed at 0.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [20]:
# Fit on the training split and report the score on the held-out split.
clf = model.fit(X_train, y_train)
score = clf.score(X_test, y_test)
score



0.7201492537313433

In [21]:
#Calcul de RMSE
from sklearn.metrics import mean_squared_error

In [22]:
# Root-mean-squared error between the predicted and the true labels of the
# held-out split.
# NOTE(review): if both are 0/1 class labels this equals sqrt(1 - accuracy),
# so it adds no information beyond the score above — confirm RMSE is the
# intended metric for this classifier.
y_predicted = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_predicted))
print(rmse)

0.5290092118939487


In [23]:
# Inspect the encoded held-out features.
# NOTE(review): this displays the whole frame; X_test.head() would keep the
# output short.
X_test

Unnamed: 0,SibSp,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Pclass_1,Pclass_2,Pclass_3
495,0,1,0,0,0,0,0,0,0,0,1
648,0,1,0,0,0,0,0,0,0,0,1
278,4,0,1,0,0,0,0,0,0,0,1
31,1,1,0,0,0,0,0,0,1,0,0
255,0,0,0,1,0,0,0,0,0,0,1
298,0,1,0,0,0,0,0,0,1,0,0
609,0,1,0,0,0,0,0,0,1,0,0
318,0,0,0,1,0,0,0,0,1,0,0
484,1,1,0,0,0,0,0,0,1,0,0
367,0,1,0,0,0,0,0,0,0,0,1


In [24]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
# Random forest with default settings; used as the base estimator for the
# randomized hyper-parameter search.
model1 = RandomForestClassifier()

In [33]:
def grid_search_params(model1, X, y, n_iter=100, cv=3):
    """Run a randomized hyper-parameter search and report the best setting.

    The data is split 70/30 with ``random_state=0`` (the same settings as the
    notebook-level split) and only the training part is used for the
    cross-validated search, so the held-out part stays unseen.

    ``RandomizedSearchCV`` and ``train_test_split`` must already be imported
    when this function is called.

    Parameters
    ----------
    model1 : estimator
        Estimator accepting the ``n_estimators``, ``max_features``,
        ``min_samples_split``, ``min_samples_leaf`` and ``max_depth``
        parameters (e.g. a random forest).
    X, y
        Features and target, forwarded to ``train_test_split``.
    n_iter : int, default 100
        Number of parameter settings sampled by the search.
    cv : int, default 3
        Number of cross-validation folds.

    Returns
    -------
    dict
        The best parameter setting found (``best_params_``); it is also
        printed, as before.
    """
    n_estimators = [int(v) for v in np.linspace(start=200, stop=2000, num=10)]
    # 'auto' was removed from the candidates: for classifiers it is an alias of
    # 'sqrt' (so it only duplicated that option) and recent scikit-learn
    # releases reject it.
    max_features = ['sqrt', 'log2']
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    max_depth = [int(v) for v in np.linspace(100, 500, num=11)]
    max_depth.append(None)  # None = no depth limit

    grid_search = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'max_depth': max_depth,
    }

    gdsr_random = RandomizedSearchCV(
        estimator=model1,
        param_distributions=grid_search,
        n_iter=n_iter,
        cv=cv,
        verbose=2,
        random_state=42,
        n_jobs=-1,
    )
    # Only the training part is needed here; the held-out part is discarded.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.3, random_state=0)
    gdsr_random.fit(X_train, y_train)

    print(gdsr_random.best_params_)
    return gdsr_random.best_params_

In [34]:
from sklearn.model_selection import RandomizedSearchCV

In [35]:
# Launch the randomized search for the random forest on the encoded features.
# This is the expensive cell of the notebook (100 candidates x 3 folds).
grid_search_params(model1, X, Y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


ValueError: Invalid parameter n_trees for estimator RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators='warn',
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False). Check the list of available parameters with `estimator.get_params().keys()`.