In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, mean_absolute_percentage_error, f1_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.svm import SVC
from sklearn.linear_model import RidgeCV, LogisticRegression
from sklearn.ensemble import StackingClassifier, StackingRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from joblib import Parallel, delayed
from mlxtend.plotting import plot_decision_regions
from sklearn.naive_bayes import MultinomialNB


# TRAIN DATA

In [6]:
df_train = pd.read_csv('train.csv')
df_train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [7]:
from utils import prepropcess_data

df_train = prepropcess_data(df_train)


df_train_final = df_train[['final_text', 'target']]
df_train_final

Unnamed: 0,final_text,target
0,deed reason earthquake may allah forgive u,1
1,forest fire near la ronge sask canada,1
2,resident ask place notify officer evacuation s...,1
3,people receive wildfire evacuation order calif...,1
4,get sent photo ruby alaska smoke wildfire pour...,1
...,...,...
7608,two giant crane hold bridge collapse nearby ho...,1
7609,thetawniest control wild fire california even ...,1
7610,utc volcano hawaii http,1
7611,police investigate collide car little portugal...,1


In [5]:
X_train = df_train_final['final_text'].values
y_train = df_train_final['target'].values

In [16]:
X_train, y_train

(array(['refugio oil spill may costlier big project plain american pipeline oil spill http',
        'julian knight scvsupremecourt dismisses mass murderer attempt increase prisoner pay challenged quantum increase',
        'electricity cant stop scofield nigga survive hotbox sona', ...,
        'great british bake back dorret chocolate gateau collapse jan moir http http',
        'black eye space battle occur star involve fleet total ship destroyed',
        'mikeparractor absolutely devastate actor miss rossbarton every girl love bad boy'],
       dtype=object),
 array([1, 1, 0, ..., 1, 0, 0], dtype=int64))

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(X_train)

In [18]:
X_train

<5709x10836 sparse matrix of type '<class 'numpy.float64'>'
	with 50269 stored elements in Compressed Sparse Row format>

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=0)


# Stacking class

In [20]:
class Stacking:
    def __init__(self, estimators, final_estimator, blending=False, cv=5, n_jobs=-1):
        self.estimators = estimators
        self.final_estimator = final_estimator
        self.blending = blending
        self.cv = cv
        self.n_jobs = n_jobs

    def _X_pred(self, estimator, data):
        if self.blending:
            X_train_v, y_train_v, X_val = data
            return estimator.fit(X_train_v, y_train_v).predict(X_val)
        else:
            X_train, y_train = data
            return cross_val_predict(estimator, X_train, y_train, cv=self.cv)

    def _X_test_pred(self, estimator, data):
        X_train, y_train, X_test = data

        return estimator.fit(X_train, y_train).predict(X_test)

    def _meta_data(self, X_train, y_train, X_test):
        if self.blending:
            #used hold-out cross-validation
            X_train_v, X_val, y_train_v, y_val = train_test_split(X_train, y_train, random_state=0)
            train_data = [X_train_v, y_train_v, X_val]
            test_data = [X_train_v, y_train_v, X_test]
            meta_y_train = y_val
        else:
            train_data = [X_train, y_train]
            test_data = [X_train, y_train, X_test]
            meta_y_train = y_train

        cv_X_train_preds = (delayed(self._X_pred)(est, train_data) for est in self.estimators)
        X_test_preds = (delayed(self._X_test_pred)(est, test_data) for est in self.estimators)

        meta_X_train = pd.DataFrame(Parallel(n_jobs=self.n_jobs)(cv_X_train_preds))
        meta_X_test = pd.DataFrame(Parallel(n_jobs=self.n_jobs)(X_test_preds))

        return meta_X_train.T, meta_y_train, meta_X_test.T

    def fit_predict(self, X_train, y_train, X_test):
        # meta learner or blender
        meta_X_train, meta_y_train, meta_X_test = self._meta_data(X_train, y_train, X_test)

        return self.final_estimator.fit(meta_X_train, meta_y_train).predict(meta_X_test)
    
def decision_boundary_plot(X, y, X_train, y_train, clf, feature_indexes, title=None):
    feature1_name, feature2_name = X.columns[feature_indexes]
    X_feature_columns = X.values[:, feature_indexes]
    X_train_feature_columns = X_train.values[:, feature_indexes]
    clf.fit(X_train_feature_columns, y_train.values)

    plot_decision_regions(X=X_feature_columns, y=y.values, clf=clf)
    plt.xlabel(feature1_name)
    plt.ylabel(feature2_name)
    plt.title(title)

# TRAIN AND PREDICT

In [24]:
estimators = [LogisticRegression(C=10, penalty='l2', random_state=0),
              RandomForestClassifier(max_depth=None, n_estimators=50, random_state=0),
              MultinomialNB(alpha=1, fit_prior=True)]

estimator = RandomForestClassifier(random_state=0)
stacking_clf = Stacking(estimators=estimators, final_estimator=estimator)
stacking_pred_res = stacking_clf.fit_predict(X_train, y_train, X_test)
stacking_accuracy = accuracy_score(y_test, stacking_pred_res)
stacking_f1 = f1_score(y_test, stacking_pred_res)
print(f'stacking_accuracy: {stacking_accuracy}')
print(f'stacking_f1: {stacking_f1}')

print(stacking_pred_res, '', sep='\n')


stacking_accuracy: 0.792016806722689
[1 0 0 ... 0 1 1]



In [28]:
stacking_f1 = f1_score(y_test, stacking_pred_res)
print(f'stacking_accuracy: {stacking_accuracy}')
print(f'stacking_f1: {stacking_f1}')

stacking_accuracy: 0.792016806722689
stacking_f1: 0.7341092211280216


In [25]:
blending_clf = Stacking(estimators=estimators, final_estimator=estimator, blending=True)
blending_pred_res = blending_clf.fit_predict(X_train, y_train, X_test)
blending_accuracy = accuracy_score(y_test, blending_pred_res)
print(f'blending_accuracy: {blending_accuracy}')
print(blending_pred_res)

blending_accuracy: 0.780812324929972
[1 0 0 ... 0 1 1]
