<a href="https://colab.research.google.com/github/onlyabhilash/Advanced_NLP/blob/main/ensemble_learning/imdb_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from scipy.stats import mode

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

from keras.datasets import imdb

In [None]:
# Invert the Keras vocabulary (word -> index) into a list that can be
# addressed by index. Slots that no word maps to stay None.
word_to_index = imdb.get_word_index()
index_to_word = [None] * (1 + max(word_to_index.values()))
for word, position in word_to_index.items():
    index_to_word[position] = word

In [None]:
# Keras shifts every word index returned by `load_data` by `index_from`
# (default 3) and reserves the low indices for special markers
# (0 = padding, 1 = start-of-sequence, 2 = out-of-vocabulary).
# `get_word_index()` is NOT shifted, so the offset has to be removed before
# looking a token up in `index_to_word`.
INDEX_FROM = 3


def decode_review(encoded_review):
    """Turn one integer-encoded review back into a space-separated string.

    Reserved marker tokens (values <= INDEX_FROM) and tokens that fall outside
    the vocabulary table are dropped rather than decoded.
    """
    return ' '.join(
        index_to_word[token - INDEX_FROM]
        for token in encoded_review
        if INDEX_FROM < token < len(index_to_word) + INDEX_FROM
    )


# Pass the offset explicitly so the decoder above and the loader cannot drift apart.
(X_train, y_train), (X_test, y_test) = imdb.load_data(
    num_words=10000, index_from=INDEX_FROM
)
X_train = [decode_review(review) for review in X_train]
X_test = [decode_review(review) for review in X_test]

In [None]:
## MAX VOTING via taking the mode of all the predictions
# Three different classifiers, each behind its own uni+bigram TF-IDF vectorizer.
model1 = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2))),
    ('log', LogisticRegression())
])
model2 = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2))),
    ('knn', KNeighborsClassifier())
])
model3 = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2))),
    # Seeded so the tree, and therefore the vote, is reproducible across runs.
    ('tree', DecisionTreeClassifier(random_state=1))
])

model1.fit(X_train,y_train)
model2.fit(X_train,y_train)
model3.fit(X_train,y_train)

pred1=model1.predict(X_test)
pred2=model2.predict(X_test)
pred3=model3.predict(X_test)

# Hard vote: stack the label vectors into (n_models, n_samples) and take the
# most common label per sample in a single vectorised call.
# Keep only `.mode`: the result object also carries `.count`, which must not
# end up mixed into the predictions.
# np.ravel flattens the (1, n_samples) shape that older SciPy versions return.
final_pred = np.ravel(mode(np.stack([pred1, pred2, pred3]), axis=0).mode)

In [None]:
## AVERAGING
# Soft voting: fit the same three pipelines, then average their predicted
# class probabilities instead of their hard labels.
model1 = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2))),
    ('log', LogisticRegression())
])
model2 = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2))),
    ('knn', KNeighborsClassifier())
])
model3 = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2))),
    # Seeded so the averaged probabilities are reproducible across runs.
    ('tree', DecisionTreeClassifier(random_state=1))
])

model1.fit(X_train,y_train)
model2.fit(X_train,y_train)
model3.fit(X_train,y_train)

pred1=model1.predict_proba(X_test)
pred2=model2.predict_proba(X_test)
pred3=model3.predict_proba(X_test)

# Element-wise mean of the three probability matrices. `final_pred` therefore
# holds per-class probabilities (one row per test review), not class labels.
final_pred = (pred1+pred2+pred3)/3

In [None]:
## BAGGING via Random Forests
rf_model = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2))),
    # Seeded (same seed as the AdaBoost cell) so the reported score can be
    # reproduced; an unseeded forest gives a different score on every run.
    ('rf', RandomForestClassifier(n_estimators=28, random_state=1))
])

rf_model.fit(X_train, y_train)
# Last expression of the cell: the score on the test split is displayed.
rf_model.score(X_test,y_test)

0.7994

In [None]:
## BOOSTING via AdaBoost
# Uni+bigram TF-IDF features feeding a seeded AdaBoost ensemble.
ada_steps = [
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('ada', AdaBoostClassifier(random_state=1)),
]
# fit() returns the fitted pipeline itself, so construction and training chain.
ada_model = Pipeline(steps=ada_steps).fit(X_train, y_train)
# Last expression of the cell: the score on the test split is displayed.
ada_model.score(X_test, y_test)

0.80824