### Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt

### Importing Data

In [3]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
test_out = pd.read_csv("data/gender_submission.csv")

In [4]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
X_train = train.drop(columns=["Survived"])  # features
y_train = train["Survived"].copy()         # target


In [6]:
#dropping from training data
X_train = X_train.drop(columns=["Name","Ticket"],axis=1)

In [7]:
#dropping form test data
test = test.drop(columns=["Name","Ticket"],axis=1)

In [8]:
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,892,3,male,34.5,0,0,7.8292,,Q
1,893,3,female,47.0,1,0,7.0,,S
2,894,2,male,62.0,0,0,9.6875,,Q
3,895,3,male,27.0,0,0,8.6625,,S
4,896,3,female,22.0,1,1,12.2875,,S


In [9]:
X_train.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,2.0,20.125,0.0,0.0,7.9104
50%,446.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,3.0,38.0,1.0,0.0,31.0
max,891.0,3.0,80.0,8.0,6.0,512.3292


In [10]:
test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [11]:


X_train.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,3,male,22.0,1,0,7.25,,S
1,2,1,female,38.0,1,0,71.2833,C85,C
2,3,3,female,26.0,0,0,7.925,,S
3,4,1,female,35.0,1,0,53.1,C123,S
4,5,3,male,35.0,0,0,8.05,,S


In [12]:
combined = pd.concat([X_train,test],ignore_index=False)
combined.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,3,male,22.0,1,0,7.25,,S
1,2,1,female,38.0,1,0,71.2833,C85,C
2,3,3,female,26.0,0,0,7.925,,S
3,4,1,female,35.0,1,0,53.1,C123,S
4,5,3,male,35.0,0,0,8.05,,S


In [13]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1309 entries, 0 to 417
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Pclass       1309 non-null   int64  
 2   Sex          1309 non-null   object 
 3   Age          1046 non-null   float64
 4   SibSp        1309 non-null   int64  
 5   Parch        1309 non-null   int64  
 6   Fare         1308 non-null   float64
 7   Cabin        295 non-null    object 
 8   Embarked     1307 non-null   object 
dtypes: float64(2), int64(4), object(3)
memory usage: 102.3+ KB


In [14]:
from sklearn.impute import SimpleImputer 
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

In [15]:
num_att = ["Age","Fare","Sibsp","Parch","Pclass","PassengerId"]
cat_att = ["Sex","Embarked","Cabin","Name","Ticket"]

num_pipeline = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler()
)

cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder()
)

from sklearn.compose import ColumnTransformer

preprcessing = ColumnTransformer([
    (num_pipeline,num_att),
    (cat_pipeline,cat_att)
])

In [16]:
from sklearn.compose import make_column_transformer , make_column_selector

preprocessing = make_column_transformer(
    (num_pipeline,make_column_selector(dtype_include=np.number)),
    (cat_pipeline,make_column_selector(dtype_include=object))
)

In [17]:
X_train_prep = preprocessing.fit_transform(X_train)

In [18]:
X_train_prep.shape

(891, 158)

In [19]:
X_train_prep

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 7841 stored elements and shape (891, 158)>

In [20]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(random_state=42)
model = rfc.fit(X_train_prep,y_train)

In [21]:
model.predict(X_train_prep)

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,

In [22]:
y_train

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [23]:
X_train

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,3,male,22.0,1,0,7.2500,,S
1,2,1,female,38.0,1,0,71.2833,C85,C
2,3,3,female,26.0,0,0,7.9250,,S
3,4,1,female,35.0,1,0,53.1000,C123,S
4,5,3,male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...,...
886,887,2,male,27.0,0,0,13.0000,,S
887,888,1,female,19.0,0,0,30.0000,B42,S
888,889,3,female,,1,2,23.4500,,S
889,890,1,male,26.0,0,0,30.0000,C148,C


In [24]:
test

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,892,3,male,34.5,0,0,7.8292,,Q
1,893,3,female,47.0,1,0,7.0000,,S
2,894,2,male,62.0,0,0,9.6875,,Q
3,895,3,male,27.0,0,0,8.6625,,S
4,896,3,female,22.0,1,1,12.2875,,S
...,...,...,...,...,...,...,...,...,...
413,1305,3,male,,0,0,8.0500,,S
414,1306,1,female,39.0,0,0,108.9000,C105,C
415,1307,3,male,38.5,0,0,7.2500,,S
416,1308,3,male,,0,0,8.0500,,S


In [25]:
test_prep = preprocessing.fit_transform(test)

In [26]:
model.predict(test_prep)

ValueError: X has 87 features, but RandomForestClassifier is expecting 158 features as input.

In [None]:
combined_pre = preprocessing.fit_transform(combined)

In [None]:
combined_pre.toarray()

array([[-1.73072813,  0.84191642, -0.61197171, ...,  0.        ,
         0.        ,  1.        ],
       [-1.72808175, -1.54609786,  0.63043107, ...,  1.        ,
         0.        ,  0.        ],
       [-1.72543538,  0.84191642, -0.30137101, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.72543538,  0.84191642,  0.66925616, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.72808175,  0.84191642,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 1.73072813,  0.84191642,  0.        , ...,  1.        ,
         0.        ,  0.        ]], shape=(1309, 197))

In [None]:
combined_pre.shape

(1309, 197)

In [None]:
X_train = combined_pre[:891,:]

In [None]:
X_train.shape

(891, 197)

In [None]:
test = combined_pre[891:,:]
test.shape

(418, 197)

In [None]:
model.fit(X_train,y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
prediction = model.predict(test)

In [None]:
pred = pd.Series(prediction)
pred

0      0
1      0
2      0
3      0
4      0
      ..
413    0
414    1
415    0
416    0
417    1
Length: 418, dtype: int64

In [None]:
passe = pd.read_csv("data/gender_submission.csv")

In [None]:
passe

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [None]:
passe["Survived"] = pred

In [None]:
passe

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [None]:
passe.to_csv("Submission.csv",index=False)

In [None]:
from sklearn.metrics import cross_val_score

cross_val_score(rfc,X_train_prep,y_train,cv=3,method="accuracy")


ImportError: cannot import name 'cross_val_score' from 'sklearn.metrics' (c:\Users\ASUS\anaconda3\envs\ml_env\Lib\site-packages\sklearn\metrics\__init__.py)