In [28]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [2]:
df = pd.read_csv('./data/train.csv', ',')

### Train-Test split


In [3]:
X=df.drop('Survived', axis=1)

In [4]:
y=df['Survived']

In [5]:
X_train, X_test, y_train, y_test=train_test_split(X,y,random_state=42)

In [6]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((668, 11), (668,), (223, 11), (223,))

### Feature Engineeting

In [7]:
#checking for NA values in Pclass
X_train['Pclass'].isna().sum()

0

In [14]:
def feature_engineer(df):
    #filling NA values of Age with category means
    category_means = df.groupby('Sex')['Age'].transform(np.mean)  
    df['Age'] = df['Age'].fillna(category_means)  
    
    #splitting Age into bins  
    df['Age_category'] = pd.cut(x=df['Age'], bins=[0.0, 18.0, 35.0, 55.0, 120.0], labels=[1, 2, 3, 4])
    
#     #adding a new variable Age_family
#     df['Family']=df['SibSp']+df['Parch']
#     df['Age_family']=df['Age_category'].astype(int)*df['Family']
    
    #transforming Sex column into categorical value
    df.loc[df['Sex'] == 'male', 'Sex'] = 1
    df.loc[df['Sex'] == 'female', 'Sex'] = 0
    
    #adding a new variable SexPclass
    df['SexPclass']=df['Sex']*df['Pclass']
    fe_df=df[['Age_category','SibSp','Parch','Sex','Pclass']]
    
    return fe_df

In [15]:
X_train_fe=feature_engineer(X_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Age'] = df['Age'].fillna(category_means)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Age_category'] = pd.cut(x=df['Age'], bins=[0.0, 18.0, 35.0, 55.0, 120.0], labels=[1, 2, 3, 4])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['SexPclass']=df['Sex']*df['Pclass']



### Train Logistic Regression Model

In [25]:
m=LogisticRegression(C=0.1)

In [26]:
m.fit(X_train_fe, y_train)

LogisticRegression(C=0.1)

In [27]:
m.score(X_train_fe,y_train) #training accuracy

0.7889221556886228

### Optimize/Cross-Validation


In [29]:
model = LogisticRegression()
accuracy = cross_val_score(model, X_train_fe, y_train, cv=5, scoring='accuracy')
accuracy.mean()

0.7888789137021659


### Test-score for Logistic regression

In [30]:
X_test_fe=feature_engineer(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Age'] = df['Age'].fillna(category_means)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Age_category'] = pd.cut(x=df['Age'], bins=[0.0, 18.0, 35.0, 55.0, 120.0], labels=[1, 2, 3, 4])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a

In [31]:
m.score(X_test_fe,y_test)

0.8251121076233184

### Decision Tree Classifier

#### Training the model


In [32]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

n = DecisionTreeClassifier(max_depth=2)
n.fit(X_train_fe, y_train)


DecisionTreeClassifier(max_depth=2)

#### Make predictions

In [33]:
ypred = n.predict(X_test_fe)
ypred

array([0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0])

In [34]:
n.score(X_train_fe, y_train) 

0.7889221556886228

In [35]:
n.score(X_test_fe, y_test) 

0.7802690582959642

### Random Forest


In [36]:
from sklearn.ensemble import RandomForestClassifier

In [79]:
# rf=RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=42, n_jobs=-1)
rf=RandomForestClassifier(bootstrap= True, max_depth= 4, max_features= 'sqrt', min_samples_leaf=3, min_samples_split= 2, n_estimators=300)
# {'max_depth': 4, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 300}

In [80]:
rf.fit(X_train_fe, y_train)

RandomForestClassifier(max_depth=4, max_features='sqrt', min_samples_leaf=3,
                       n_estimators=300)

In [81]:
rf.score(X_train_fe, y_train)

0.8308383233532934

In [82]:
rf.score(X_test_fe, y_test)

0.8251121076233184

### Optimizing Hyperparameters for Random Forest

#### Grid Search

In [78]:
from sklearn.model_selection import GridSearchCV
rf_params = { 
    'n_estimators': [300, 1000],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [3, 5],
    'max_depth': np.arange(3,10)}

gs_random = GridSearchCV(estimator=rf, param_grid=rf_params, cv= 3)
gs_random.fit(X_train_fe, y_train)
print(gs_random.best_params_)    

{'max_depth': 4, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 300}


#### Randomized Search

In [30]:
from sklearn.model_selection import RandomizedSearchCV

gs_random = RandomizedSearchCV(estimator=rf, param_distributions=rf_params, cv= 5,n_iter=50)

gs_random.fit(X_train_fe, y_train)
print(gs_random.best_params_)



{'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'max_depth': 15, 'bootstrap': True}


### Kaggle

In [31]:
X_kaggle= pd.read_csv('/Users/madinamukhambekova/Desktop/spearmint-vector-student-code/Week_02/data/predict.csv', ',',index_col=0)

In [32]:
X_kaggle.head(2)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [34]:
X_kaggle_fe=feature_engineer(X_kaggle)

In [40]:
y_kaggle = rf.predict(X_kaggle_fe)

In [41]:
output = pd.DataFrame({'datetime': X_kaggle_fe.index, 'count': y_kaggle})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")


Your submission was successfully saved!
