In [18]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv((
    "https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
    "master/ESS_practice_data/ESSdata_Thinkful.csv")).dropna()

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8147 entries, 0 to 8593
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   cntry    8147 non-null   object 
 1   idno     8147 non-null   float64
 2   year     8147 non-null   int64  
 3   tvtot    8147 non-null   float64
 4   ppltrst  8147 non-null   float64
 5   pplfair  8147 non-null   float64
 6   pplhlp   8147 non-null   float64
 7   happy    8147 non-null   float64
 8   sclmeet  8147 non-null   float64
 9   sclact   8147 non-null   float64
 10  gndr     8147 non-null   float64
 11  agea     8147 non-null   float64
 12  partner  8147 non-null   float64
dtypes: float64(11), int64(1), object(1)
memory usage: 891.1+ KB


In [4]:
df.partner.value_counts()

1.0    5013
2.0    3134
Name: partner, dtype: int64

In [5]:
y = df.partner - 1

In [13]:
X = df.loc[:,~df.columns.isin(['cntry','idno','partner'])].copy()

In [14]:
X = pd.concat([X,pd.get_dummies(df.cntry)],axis=1)

In [16]:
X.head()

Unnamed: 0,year,tvtot,ppltrst,pplfair,pplhlp,happy,sclmeet,sclact,gndr,agea,CH,CZ,DE,ES,NO,SE
0,6,3.0,3.0,10.0,5.0,8.0,5.0,4.0,2.0,60.0,1,0,0,0,0,0
1,6,6.0,5.0,7.0,5.0,9.0,3.0,2.0,2.0,59.0,1,0,0,0,0,0
2,6,1.0,8.0,8.0,8.0,7.0,6.0,3.0,1.0,24.0,1,0,0,0,0,0
3,6,4.0,6.0,6.0,7.0,10.0,6.0,2.0,2.0,64.0,1,0,0,0,0,0
4,6,5.0,6.0,7.0,5.0,8.0,7.0,2.0,2.0,55.0,1,0,0,0,0,0


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 675)

In [20]:
estimator = GradientBoostingClassifier()

In [24]:
params = {
    'loss':['deviance', 'exponential'],
    'learning_rate':[0.001,0.01,0.1],
    'max_depth':[2,3,5],
    'subsample':[0.8,1]
}

In [25]:
grid_estimator = GridSearchCV(estimator = estimator, param_grid=params)

In [26]:
grid_estimator.fit(X_train,y_train)

GridSearchCV(estimator=GradientBoostingClassifier(),
             param_grid={'learning_rate': [0.001, 0.01, 0.1],
                         'loss': ['deviance', 'exponential'],
                         'max_depth': [2, 3, 5], 'subsample': [0.8, 1]})

In [27]:
grid_estimator.best_params_

{'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'subsample': 0.8}

In [28]:
grid_estimator.score(X_test,y_test)

0.7533742331288343