# Blending and Stacking

In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection, linear_model, metrics, decomposition, ensemble
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

## Getting the dataset

In [2]:
df = pd.read_csv('../input/kumarmanoj-bag-of-words-meets-bags-of-popcorn/labeledTrainData.tsv', sep='\t', encoding='ISO-8859-1')
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [3]:
len(df)

25000

## Creating folds

In [4]:
df.loc[:, 'kfold'] = -1 
df = df.sample(frac=1.0).reset_index(drop=True)
y = df['sentiment'].values

skf = model_selection.StratifiedKFold(n_splits=5)
for f, (t_, v_) in enumerate(skf.split(X=df, y=y)):
    df.loc[v_, "kfold"] = f

# df.to_csv('../input/train_folds.csv', index=False)

In [5]:
df.kfold.value_counts()

0    5000
1    5000
2    5000
3    5000
4    5000
Name: kfold, dtype: int64

## Train basemodels

### First model: Logistic regression + TF-IDF

In [6]:
def run_training(fold):
    """Train logistic regression model on OOF data.
    Then, predict on the fold."""
    
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    tfv = TfidfVectorizer(max_features=1000)
    tfv.fit(df_train.review.values)
    
    xtrain = tfv.transform(df_train.review.values)
    xvalid = tfv.transform(df_valid.review.values)
    
    ytrain = df_train.sentiment.values
    yvalid = df_valid.sentiment.values
    
    clf = linear_model.LogisticRegression()
    clf.fit(xtrain, ytrain)
    pred = clf.predict_proba(xvalid)[:, 1]
    
    auc = metrics.roc_auc_score(yvalid, pred)
    print(f"fold={fold}, auc={auc}")
    
    df_valid.loc[:, "lr_pred"] = pred
    
    return df_valid[["id", "sentiment", "kfold", "lr_pred"]]

In [7]:
# predict on each fold -> combine predictions into single column
dfs = []
for j in range(5):
    temp_df = run_training(j)
    dfs.append(temp_df)
    
fin_valid_df = pd.concat(dfs)

fold=0, auc=0.9365129600000001
fold=1, auc=0.9314710399999999
fold=2, auc=0.9360481599999999
fold=3, auc=0.93625488
fold=4, auc=0.9354478400000001


In [8]:
print(fin_valid_df.shape)

(25000, 4)


### Second model: LR + CountVectorizer

In [9]:
def run_training_2(fold):
    """Train logistic regression model on OOF data.
    Then, predict on the fold."""
    
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    tfv = CountVectorizer()
    tfv.fit(df_train.review.values)
    
    xtrain = tfv.transform(df_train.review.values)
    xvalid = tfv.transform(df_valid.review.values)
    
    ytrain = df_train.sentiment.values
    yvalid = df_valid.sentiment.values
    
    clf = linear_model.LogisticRegression(solver='liblinear')
    clf.fit(xtrain, ytrain)
    pred = clf.predict_proba(xvalid)[:, 1]
    
    auc = metrics.roc_auc_score(yvalid, pred)
    print(f"fold={fold}, auc={auc}")
    
    df_valid.loc[:, "lr_cnt_pred"] = pred
    
    return df_valid[["id", "sentiment", "kfold", "lr_cnt_pred"]]

In [10]:
dfs = []
for j in range(5):
    temp_df = run_training_2(j)
    dfs.append(temp_df)
    
fin_valid_df_2 = pd.concat(dfs)

fold=0, auc=0.9490720000000001
fold=1, auc=0.9414736
fold=2, auc=0.94314688
fold=3, auc=0.9461662399999999
fold=4, auc=0.9474937600000001


In [11]:
fin_valid_df_2.head()

Unnamed: 0,id,sentiment,kfold,lr_cnt_pred
0,3202_10,1,0,0.456801
1,10688_1,0,0,0.014995
2,6946_10,1,0,0.999881
3,6201_1,0,0,0.099811
4,6392_3,0,0,0.18918


### Third model: RF + SVD

In [12]:
def run_training_3(fold):
    """Train logistic regression model on OOF data.
    Then, predict on the fold."""
    
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    tfv = TfidfVectorizer()
    tfv.fit(df_train.review.values)
    
    xtrain = tfv.transform(df_train.review.values)
    xvalid = tfv.transform(df_valid.review.values)
    
    # transform to right singular vectors
    svd = decomposition.TruncatedSVD(n_components=120)
    svd.fit(xtrain)
    xtrain_svd = svd.transform(xtrain)
    xvalid_svd = svd.transform(xvalid)
    
    ytrain = df_train.sentiment.values
    yvalid = df_valid.sentiment.values
    
    clf = ensemble.RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf.fit(xtrain_svd, ytrain)
    pred = clf.predict_proba(xvalid_svd)[:, 1]
    
    auc = metrics.roc_auc_score(yvalid, pred)
    print(f"fold={fold}, auc={auc}")
    
    df_valid.loc[:, "rf_svd_pred"] = pred
    
    return df_valid[["id", "sentiment", "kfold", "rf_svd_pred"]]

In [13]:
dfs = []
for j in range(5):
    temp_df = run_training_3(j)
    dfs.append(temp_df)
    
fin_valid_df_3 = pd.concat(dfs)

fold=0, auc=0.8786821599999999
fold=1, auc=0.8775124
fold=2, auc=0.8816426399999999
fold=3, auc=0.8827328
fold=4, auc=0.8793898400000001


Check correlation:

In [14]:
pd.DataFrame(np.c_[fin_valid_df_3.rf_svd_pred, fin_valid_df_2.lr_cnt_pred, fin_valid_df.lr_pred]).corr()

Unnamed: 0,0,1,2
0,1.0,0.724521,0.82746
1,0.724521,1.0,0.884288
2,0.82746,0.884288,1.0


## Blending

Here Abishek uses `glob` to get all files, since he saved all DataFrames on disk. 

```python
files = glob.glob("../model_preds/*.csv")
df = None
for f in files:
    if df is None:
        df = pd.read_csv(f)
    else:
        temp_df = pd.read_csv(f)
        df = df.merge(temp_df, on="id", how="left")
```

In [15]:
df_blending = fin_valid_df.merge(fin_valid_df_2, on="id", how="left").merge(fin_valid_df_3, on="id", how="left")

In [16]:
df_blending

Unnamed: 0,id,sentiment_x,kfold_x,lr_pred,sentiment_y,kfold_y,lr_cnt_pred,sentiment,kfold,rf_svd_pred
0,3202_10,1,0,0.520567,1,0,4.568007e-01,1,0,0.40
1,10688_1,0,0,0.061593,0,0,1.499460e-02,0,0,0.40
2,6946_10,1,0,0.941088,1,0,9.998813e-01,1,0,0.67
3,6201_1,0,0,0.606473,0,0,9.981054e-02,0,0,0.59
4,6392_3,0,0,0.110718,0,0,1.891798e-01,0,0,0.20
...,...,...,...,...,...,...,...,...,...,...
24995,1333_10,1,4,0.895544,1,4,9.761979e-01,1,4,0.60
24996,1831_2,0,4,0.181402,0,4,4.117762e-02,0,4,0.52
24997,8899_8,1,4,0.690255,1,4,2.262155e-01,1,4,0.60
24998,3520_2,0,4,0.013608,0,4,3.623739e-10,0,4,0.16


In [17]:
pred_cols = ["lr_pred", "lr_cnt_pred", "rf_svd_pred"]
for col in pred_cols:
    auc = metrics.roc_auc_score(df_blending.sentiment, df_blending[col])
    print(f"{col} overall_auc={auc}")

lr_pred overall_auc=0.9351656000000002
lr_cnt_pred overall_auc=0.9454780608000001
rf_svd_pred overall_auc=0.8799559296


In [18]:
targets = df_blending.sentiment
avg_preds = (df_blending["lr_pred"] + df_blending["lr_cnt_pred"] + df_blending["rf_svd_pred"])/3
something_preds = (df_blending["lr_pred"] + 3*df_blending["lr_cnt_pred"] + df_blending["rf_svd_pred"])/5
rank_preds = (df_blending["lr_pred"].rank() + df_blending["lr_cnt_pred"].rank() + df_blending["rf_svd_pred"].rank())/3
weighted_rank_preds = (df_blending["lr_pred"].rank() + 3*df_blending["lr_cnt_pred"].rank() + df_blending["rf_svd_pred"].rank())/5

print(f"auc (averaged):", metrics.roc_auc_score(targets, avg_preds))
print(f"auc (weighted avg):", metrics.roc_auc_score(targets, something_preds))
print(f"auc (rank avg):", metrics.roc_auc_score(targets, rank_preds)) # roc is scale invariant of preds
print(f"auc (weighted rank avg):", metrics.roc_auc_score(targets, weighted_rank_preds)) # roc is scale invariant of preds

auc (averaged): 0.9495438655999999
auc (weighted avg): 0.9507057407999999
auc (rank avg): 0.9446964480000001
auc (weighted rank avg): 0.9506141344000001


### Optimize AUC

In [19]:
from scipy.optimize import fmin # minimizer
from functools import partial

In [20]:
 class OptimizeAUC:
        def __init__(self):
            self.coef_ = None
        
        def _auc(self, coef, X, y):
            x_coef = X * coef
            predictions = np.sum(x_coef, axis=1)
            auc_score = metrics.roc_auc_score(y, predictions)
            return -1.0 * auc_score # since we use fmin
            
        def fit(self, X, y):
            # think of: partial_loss(coef) = _auc(coef, X, y)
            partial_loss = partial(self._auc, X=X, y=y) 
            init_coef = np.random.dirichlet(np.ones(X.shape[1]))
            
            # find coef such that auc of weighted preds is maximized
            self.coef_ = fmin(partial_loss, init_coef, disp=True) 
            
        def predict(self, X):
            x_coef = X * self.coef_
            predictions = np.sum(x_coef, axis=1)
            return predictions

In [26]:
# test
def f(x, y):
    return x**2 + y

print(fmin(partial(f, y=3), 100, disp=True))

# shows min value 3.000, returns minimum, i.e. x=0

Optimization terminated successfully.
         Current function value: 3.000000
         Iterations: 24
         Function evaluations: 48
[0.]


In [21]:
def run_training(pred_df, fold):
    train_df = pred_df[pred_df.kfold != fold]
    valid_df = pred_df[pred_df.kfold == fold]
    
    xtrain = train_df[['lr_pred', 'lr_cnt_pred', 'rf_svd_pred']]
    xvalid = valid_df[['lr_pred', 'lr_cnt_pred', 'rf_svd_pred']]
    
    opt = OptimizeAUC()
    opt.fit(xtrain, train_df.sentiment.values)
    preds = opt.predict(xvalid)
    auc = metrics.roc_auc_score(valid_df.sentiment.values, preds)
    print(f"fold={fold} auc={auc}")
    print()
    
    return opt.coef_

In [22]:
run_training(df_blending, 0)

Optimization terminated successfully.
         Current function value: -0.950305
         Iterations: 43
         Function evaluations: 91
fold=0 auc=0.9539662400000001



array([0.2613635 , 0.50392249, 0.09666369])

In [23]:
targets = df_blending.sentiment.values
pred_cols = ['lr_pred', 'lr_cnt_pred', 'rf_svd_pred']

coefs = []
for j in range(5):
    coefs.append(run_training(df_blending, j))

best_coefs = sum(coefs)/5 
# need to get one constant for each col, since these are weights of model predictions

Optimization terminated successfully.
         Current function value: -0.950306
         Iterations: 54
         Function evaluations: 115
fold=0 auc=0.9539444800000001

Optimization terminated successfully.
         Current function value: -0.951833
         Iterations: 50
         Function evaluations: 99
fold=1 auc=0.94782448

Optimization terminated successfully.
         Current function value: -0.950894
         Iterations: 49
         Function evaluations: 105
fold=2 auc=0.95164496

Optimization terminated successfully.
         Current function value: -0.951133
         Iterations: 37
         Function evaluations: 88
fold=3 auc=0.95064848

Optimization terminated successfully.
         Current function value: -0.951040
         Iterations: 72
         Function evaluations: 139
fold=4 auc=0.9510171199999999



In [27]:
opt_preds = (df_blending[['lr_pred', 'lr_cnt_pred', 'rf_svd_pred']] * best_coefs).sum(axis=1)
metrics.roc_auc_score(df_blending.sentiment.values, opt_preds)

0.951036608

In [None]:
targets = df_blending.sentiment.values
pred_cols = ['lr_pred', 'lr_cnt_pred', 'rf_svd_pred']

coefs = []
for j in range(5):
    coefs.append(run_training(df_blending, j))

best_coefs = sum(coefs)/5

In [30]:
opt_preds = (df_blending[['lr_pred', 'lr_cnt_pred', 'rf_svd_pred']].rank() * best_coefs).sum(axis=1)
metrics.roc_auc_score(df_blending.sentiment.values, opt_preds)

0.9514014848000001

In [37]:
targets = df_blending.sentiment.values
pred_cols = ['lr_pred', 'lr_cnt_pred', 'rf_svd_pred']
df_blending_rank = df_blending.copy()
for col in pred_cols:
    df_blending_rank[col] = df_blending_rank[col].rank()

coefs = []
for j in range(5):
    coefs.append(run_training(df_blending_rank, j))

best_coefs = sum(coefs)/5

opt_preds = (df_blending_rank[['lr_pred', 'lr_cnt_pred', 'rf_svd_pred']] * best_coefs).sum(axis=1)
metrics.roc_auc_score(df_blending.sentiment.values, opt_preds)

Optimization terminated successfully.
         Current function value: -0.950739
         Iterations: 55
         Function evaluations: 107
fold=0 auc=0.9541545599999999

Optimization terminated successfully.
         Current function value: -0.952349
         Iterations: 55
         Function evaluations: 112
fold=1 auc=0.94771632

Optimization terminated successfully.
         Current function value: -0.951692
         Iterations: 71
         Function evaluations: 127
fold=2 auc=0.9503804799999999

Optimization terminated successfully.
         Current function value: -0.951242
         Iterations: 48
         Function evaluations: 101
fold=3 auc=0.9521368

Optimization terminated successfully.
         Current function value: -0.951109
         Iterations: 46
         Function evaluations: 93
fold=4 auc=0.9526273599999999



0.9514188544

Weighted rank averaging = best score.

## Stacking

Instead of fixed constants for stacking, we learn the weights using logistic regression. Then, pass the results through a sigmoid. Note that we use the **same folds** to avoid **data leakage**. Otherwise, transferring between folds allow later OOF data points to have been trained with data points in the folds.

In [43]:
from sklearn.preprocessing import StandardScaler

def run_training_lr(pred_df, fold):
    train_df = pred_df[pred_df.kfold != fold]
    valid_df = pred_df[pred_df.kfold == fold]
    
    xtrain = train_df[['lr_pred', 'lr_cnt_pred', 'rf_svd_pred']]
    xvalid = valid_df[['lr_pred', 'lr_cnt_pred', 'rf_svd_pred']]
    
    opt = linear_model.LinearRegression() # replace optimizer
    scl = StandardScaler()
    xtrain = scl.fit_transform(xtrain)
    xvalid = scl.transform(xvalid)
    
    opt.fit(xtrain, train_df.sentiment.values)
    preds = opt.predict(xvalid)
    auc = metrics.roc_auc_score(valid_df.sentiment.values, preds)
    print(f"fold={fold} auc={auc}")
    print()
    
    return opt.coef_


targets = df_blending.sentiment.values
pred_cols = ['lr_pred', 'lr_cnt_pred', 'rf_svd_pred']
df_blending_rank = df_blending.copy()
for col in pred_cols:
    df_blending_rank[col] = df_blending_rank[col].rank()

coefs = []
for j in range(5):
    coefs.append(run_training_lr(df_blending_rank, j))

best_coefs = sum(coefs)/5

opt_preds = (df_blending_rank[['lr_pred', 'lr_cnt_pred', 'rf_svd_pred']] * best_coefs).sum(axis=1)
metrics.roc_auc_score(df_blending.sentiment.values, opt_preds)

fold=0 auc=0.95407376

fold=1 auc=0.947704

fold=2 auc=0.95053984

fold=3 auc=0.9520932800000002

fold=4 auc=0.95258208



0.9513987968

In [44]:
from sklearn.preprocessing import StandardScaler

def run_training_logreg(pred_df, fold):
    train_df = pred_df[pred_df.kfold != fold]
    valid_df = pred_df[pred_df.kfold == fold]
    
    xtrain = train_df[['lr_pred', 'lr_cnt_pred', 'rf_svd_pred']]
    xvalid = valid_df[['lr_pred', 'lr_cnt_pred', 'rf_svd_pred']]
    
    opt = linear_model.LogisticRegression() # replace optimizer
    scl = StandardScaler()
    xtrain = scl.fit_transform(xtrain)
    xvalid = scl.transform(xvalid)
    
    opt.fit(xtrain, train_df.sentiment.values)
    preds = opt.predict_proba(xvalid)[:, 1]
    auc = metrics.roc_auc_score(valid_df.sentiment.values, preds)
    print(f"fold={fold} auc={auc}")
    print()
    
    return opt.coef_


targets = df_blending.sentiment.values
pred_cols = ['lr_pred', 'lr_cnt_pred', 'rf_svd_pred']
df_blending_rank = df_blending.copy()
for col in pred_cols:
    df_blending_rank[col] = df_blending_rank[col].rank()

coefs = []
for j in range(5):
    coefs.append(run_training_logreg(df_blending_rank, j))

best_coefs = sum(coefs)/5

opt_preds = (df_blending_rank[['lr_pred', 'lr_cnt_pred', 'rf_svd_pred']] * best_coefs).sum(axis=1)
metrics.roc_auc_score(df_blending.sentiment.values, opt_preds)

fold=0 auc=0.95417056

fold=1 auc=0.9477067199999999

fold=2 auc=0.95033168

fold=3 auc=0.9521486399999999

fold=4 auc=0.9526303999999999



0.9514181759999999

In [49]:
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

def run_training_xgb(pred_df, fold):
    train_df = pred_df[pred_df.kfold != fold]
    valid_df = pred_df[pred_df.kfold == fold]
    
    xtrain = train_df[['lr_pred', 'lr_cnt_pred', 'rf_svd_pred']]
    xvalid = valid_df[['lr_pred', 'lr_cnt_pred', 'rf_svd_pred']]
    
    opt = XGBClassifier() # replace optimizer    
    opt.fit(xtrain, train_df.sentiment.values)
    preds = opt.predict_proba(xvalid)[:, 1]
    auc = metrics.roc_auc_score(valid_df.sentiment.values, preds)
    print(f"fold={fold} auc={auc}")
    print()
    
    valid_df.loc[:, 'xgb_pred'] = preds
    return valid_df


targets = df_blending.sentiment.values
pred_cols = ['lr_pred', 'lr_cnt_pred', 'rf_svd_pred']
df_blending_rank = df_blending.copy()
for col in pred_cols:
    df_blending_rank[col] = df_blending_rank[col].rank()

preds_df = []
for j in range(5):
    preds_df.append(run_training_xgb(df_blending_rank, j))
preds_df = pd.concat(preds_df)

metrics.roc_auc_score(df_blending.sentiment.values, preds_df.xgb_pred)



fold=0 auc=0.95069512



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


fold=1 auc=0.9442696799999999



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


fold=2 auc=0.94511648



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


fold=3 auc=0.9470412800000001



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


fold=4 auc=0.94813072



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


0.9469251263999999