In [None]:
import nltk
import numpy as np
import re
from sklearn.datasets import load_files
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
import warnings
warnings.simplefilter("ignore")

In [None]:
# Dataset source (movie-review data): https://www.cs.cornell.edu/people/pabo/movie-review-data/
# The path is relative, so the extracted "txt_sentoken/" folder must sit in the
# notebook's working directory.
# NOTE(review): no `encoding` is passed, so the loaded documents are presumably raw bytes — confirm.
dataset = load_files("txt_sentoken/")

In [None]:
# Pull the raw documents and their class labels out of the loaded dataset.
X = dataset.data
y = dataset.target

# Keep the text and the labels in two separate single-column frames.
x_df = pd.DataFrame(X, columns=["text"])
Y = pd.DataFrame(y, columns=['class'])

In [None]:
def clean(row):
    """Normalise one raw review into lower-case, space-separated tokens.

    Parameters
    ----------
    row : bytes, bytearray, str or any object
        The raw review. Byte strings are decoded as UTF-8 (undecodable bytes
        are replaced); anything else is converted with ``str()``.

    Returns
    -------
    str
        The review with every non-word character turned into a space,
        lower-cased, stray ``br`` tokens (HTML line-break residue) and
        whitespace-delimited single-letter tokens removed, and runs of
        whitespace collapsed to one space. Leading/trailing spaces are not
        stripped.
    """
    if isinstance(row, (bytes, bytearray)):
        # Decode instead of calling str(): str(b"...") yields the repr
        # "b'...\\n...'", whose escape sequences would leak into the text as
        # stray letters glued to the following word ("\\nword" -> "nword").
        review = bytes(row).decode("utf-8", errors="replace")
    else:
        review = str(row)

    # Newlines, punctuation and any other non-word character become spaces.
    review = re.sub(r'\W', ' ', review)
    review = review.lower()

    # Drop "br" tokens left behind by "<br />" markup.
    review = re.sub(r'^br$', ' ', review)
    review = re.sub(r'\s+br\s+', ' ', review)

    # Drop single-letter tokens. The trailing whitespace is only looked at,
    # not consumed, so runs of consecutive single letters are all removed.
    review = re.sub(r'\s+[a-z](?=\s)', ' ', review)

    # A lone leading "b" is still dropped, as before.
    review = re.sub(r'^b\s+', '', review)

    # Collapse whitespace runs created by the substitutions above.
    review = re.sub(r'\s+', ' ', review)

    return review

In [None]:
# Add a "clean" column holding the normalised form of every raw review.
x_df["clean"] = x_df["text"].map(clean)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the reviews for evaluation. The split is seeded so that a
# Restart & Run All reproduces the same partition (and therefore the same
# downstream metrics) every time.
x_train, x_test, y_train, y_test = train_test_split(
    x_df[["clean"]],
    Y,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag-of-words counts limited to 5000 vocabulary entries, with NLTK's English
# stop words filtered out.
cv = CountVectorizer(
    stop_words=stopwords.words("english"),
    max_features=5000,
)

# Learn the vocabulary from the training reviews only; the test reviews are
# merely projected onto it. Both results are converted to dense arrays.
x_bow_train = cv.fit_transform(x_train["clean"]).toarray()
x_bow_test = cv.transform(x_test["clean"]).toarray()

In [None]:
# TF-IDF vectorizer limited to 5000 vocabulary entries.
# NOTE(review): this uses scikit-learn's built-in 'english' stop-word list,
# whereas the CountVectorizer cell is given NLTK's list — confirm the
# difference is intended before comparing the two feature sets.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)

In [None]:
# Fit the TF-IDF vectorizer on the training reviews only, then reuse the fitted
# vectorizer on the test reviews so no test text influences the features.
# .toarray() converts each result to a dense array.
x_tfidf_train = tfidf.fit_transform(x_train['clean']).toarray()
x_tfidf_test = tfidf.transform(x_test['clean']).toarray()

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest on the bag-of-words features. The forest is seeded so
# the confusion matrix below is reproducible across runs.
clf_bow = RandomForestClassifier(n_estimators=200, random_state=42)

# Pass the labels as a 1-D Series: handing scikit-learn the (n, 1) DataFrame
# triggers a DataConversionWarning (silenced by the global warnings filter).
clf_bow.fit(x_bow_train, y_train["class"])

# Predictions are kept in a one-column DataFrame, as before.
preds_bow = pd.DataFrame(clf_bow.predict(x_bow_test))

# Rows are true classes, columns are predicted classes.
cm_bow = confusion_matrix(y_true=y_test["class"], y_pred=preds_bow[0])
cm_bow

In [None]:
# Shallow, regularised random forest on the TF-IDF features.
# `bootstrap` must be a real boolean: the string 'False' is non-empty and
# therefore truthy on older scikit-learn (bootstrapping silently stayed on),
# and is rejected outright by parameter validation on current releases.
clf_tf = RandomForestClassifier(
    bootstrap=False,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=7,
    n_estimators=300,
    random_state=42,
)

# Labels are passed as a 1-D Series rather than an (n, 1) DataFrame to avoid
# scikit-learn's DataConversionWarning.
clf_tf.fit(x_tfidf_train, y_train["class"])

# Predictions are kept in a one-column DataFrame, as before.
preds_tf = pd.DataFrame(clf_tf.predict(x_tfidf_test))

# Rows are true classes, columns are predicted classes.
cm_tf = confusion_matrix(y_true=y_test["class"], y_pred=preds_tf[0])
cm_tf

In [None]:
from sklearn.metrics import roc_auc_score
from skopt import BayesSearchCV
from skopt.space import Categorical

# Bayesian hyper-parameter search for the random forest on the bag-of-words
# features, scored by ROC AUC with 3-fold cross-validation.
opt = BayesSearchCV(
    RandomForestClassifier(random_state=42),
    {
        'n_estimators': [5, 50, 100, 150, 200, 300],
        # 'auto' was an alias of 'sqrt' for classifiers and has been removed
        # from recent scikit-learn releases; 'log2' gives the search a real
        # alternative to explore.
        'max_features': ['sqrt', 'log2'],
        'max_depth': [2, 3, 4],
        'min_samples_split': [2, 3, 4],
        'min_samples_leaf': [1, 7],
        # Real booleans, declared explicitly as a categorical dimension: the
        # strings "True"/"False" are both truthy (or invalid on current
        # scikit-learn), so bootstrapping was never actually switched off.
        'bootstrap': Categorical([True, False]),
    },
    n_iter=32,
    cv=3,
    scoring='roc_auc',
    # Seed the optimiser so the sequence of sampled candidates is reproducible.
    random_state=42,
)

# Labels are passed as a 1-D Series rather than an (n, 1) DataFrame.
opt.fit(x_bow_train, y_train["class"])

print("val. score: %s" % opt.best_score_)

In [None]:
# Show the estimator selected by the search (a bare last expression is
# rendered as the cell's output).
opt.best_estimator_