In [125]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

df = pd.read_csv('train.csv')

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [126]:
y = df.Fare

In [127]:
df.drop(['Fare'], axis=1, inplace=True)

In [128]:
df.drop(['Parch', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0
...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0


In [129]:
colums_int_float = [name for name in df.columns if df[name].dtype in ['int64', 'float64']]

colums_int_float

['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp']

In [130]:
x = df['Age']

In [131]:
x = np.round(x.fillna(df['Age'].mean()), 2)

In [132]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cnt = 1

for train, test in kf.split(x, y):
    print(f'Fold: {cnt} Train_set: {len(train)}  Test_set: {len(test)}')
    cnt += 1

Fold: 1 Train_set: 712  Test_set: 179
Fold: 2 Train_set: 713  Test_set: 178
Fold: 3 Train_set: 713  Test_set: 178
Fold: 4 Train_set: 713  Test_set: 178
Fold: 5 Train_set: 713  Test_set: 178


In [143]:
def rmse(score):
    return f'rmse: {"%0.2f" % np.sqrt(-score)}'

In [144]:
score1 = cross_val_score(LinearRegression(), x.values.reshape(-1, 1), y, cv=kf, scoring='neg_mean_squared_error')

print(f'Score dor each fold: {score1}')
print(rmse(score1.mean()))

Score dor each fold: [-1524.83148414 -2074.58246996 -1988.59137761 -5262.76530096
 -1408.55007948]
rmse: 49.52


In [145]:
score2 = cross_val_score(DecisionTreeRegressor(random_state=42), x.values.reshape(-1, 1), y, cv=kf, scoring='neg_mean_squared_error')

print(f'Score dor each fold: {score1}')
print(rmse(score1.mean()))

Score dor each fold: [-1524.83148414 -2074.58246996 -1988.59137761 -5262.76530096
 -1408.55007948]
rmse: 49.52


In [146]:
score3 = cross_val_score(RandomForestRegressor(random_state=42), x.values.reshape(-1, 1), y, cv=kf, scoring='neg_mean_squared_error')

print(f'Score dor each fold: {score1}')
print(rmse(score1.mean()))

Score dor each fold: [-1524.83148414 -2074.58246996 -1988.59137761 -5262.76530096
 -1408.55007948]
rmse: 49.52


-----------------

In [154]:
max_depth = [1, 2, 3, 4, 5]

for val in max_depth:
    score4 = cross_val_score(DecisionTreeRegressor(max_depth=val, random_state=42), x.values.reshape(-1, 1), y, cv=kf, scoring='neg_mean_squared_error')
    print(f'For max depth: {val}')
    print(rmse(score4.mean()))

For max depth: 1
rmse: 49.62
For max depth: 2
rmse: 50.86
For max depth: 3
rmse: 50.80
For max depth: 4
rmse: 51.10
For max depth: 5
rmse: 51.29


In [156]:
estimators = [50, 100, 150, 200, 250, 300, 350]

for count in estimators:
    score5 = cross_val_score(RandomForestRegressor(n_estimators=count, random_state=42), x.values.reshape(-1, 1), y, cv=kf, scoring="neg_mean_squared_error")
    print(f'For estimators: {count}')
    print(rmse(score5.mean()))

For estimators: 50
rmse: 52.26
For estimators: 100
rmse: 52.44
For estimators: 150
rmse: 52.37
For estimators: 200
rmse: 52.34
For estimators: 250
rmse: 52.36
For estimators: 300
rmse: 52.37
For estimators: 350
rmse: 52.35
