In [None]:
import itertools

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.base import clone
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [None]:
#use IMDB dataset with positive/negative reviews
# The CSV location is kept in one named constant so it is easy to repoint.
DATASET_PATH = '/content/drive/MyDrive/IMDB Dataset.csv'
data = pd.read_csv(DATASET_PATH)

In [None]:
#preprocess data
# Build the cleaned 'content' column from the raw 'review' text:
#   1. remove URLs first, while '://' and '.' are still present, so the
#      pattern only hits real links (the dot after 'www' is escaped);
#   2. drop every remaining character that is not a letter, digit or whitespace;
#   3. lower-case the result.
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as
# a literal string by default, which would silently skip the cleaning.
# Raw strings avoid invalid-escape warnings for \s and \S.
data['content'] = (
    data['review']
    .str.replace(r'http\S+|www\.\S+', '', case=False, regex=True)
    .str.replace(r'[^A-Za-z0-9\s]+', '', regex=True)
    .str.lower()
)

  """Entry point for launching an IPython kernel.


In [None]:
# Pull the label column out of the frame, then keep only the feature columns.
target = data['sentiment']
data = data.drop(columns=['sentiment'])

In [None]:
#encode target
# Fit the encoder on the sentiment labels, then replace them with integer ids.
# `le` is kept so the ids can be mapped back to the original labels later.
le = LabelEncoder()
le.fit(target)
target = le.transform(target)

In [None]:
#split on train & test
# 70/30 split, stratified on the target, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.3,
    random_state=42,
    stratify=target,
)

In [None]:
#calculate accuracy
def print_acc(model, X_test, y_test):
    """Print and return the accuracy of `model` on a test set, in percent.

    Args:
        model: object exposing ``predict(X)``.
        X_test: inputs handed to ``model.predict``.
        y_test: expected labels, compared element-wise with the predictions.

    Returns:
        The share of predictions equal to ``y_test``, multiplied by 100.
    """
    matches = model.predict(X_test) == y_test
    accuracy = 100 * np.mean(matches)
    print(accuracy)
    return accuracy

In [None]:
#use stopwords
# Download the NLTK stop-word corpus and keep the English entries as a set.
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}

In [None]:
#lemmatize x
# Fetch the WordNet data used by the lemmatizer, then build the helpers.
nltk.download('wordnet')
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Return `text` with every whitespace-separated token lemmatized.

    Tokens are produced by the module-level whitespace tokenizer, passed
    one by one through the WordNet lemmatizer and re-joined with single spaces.
    """
    tokens = w_tokenizer.tokenize(text)
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

# Lemmatized variants of the cleaned text for both splits.
X_train_l = X_train['content'].apply(lemmatize_text)
X_test_l = X_test['content'].apply(lemmatize_text)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Try every combination of vectoriser, optional TF-IDF step, classifier and
# text variant (cleaned vs. lemmatized) and keep the most accurate pipeline.
# NOTE(review): candidates are ranked on the test split, so `best_acc` is an
# optimistic estimate -- consider a separate validation split or cross-validation.

# scikit-learn validates `stop_words` as 'english', a list or None, so the
# stop-word collection is converted to a (sorted, deterministic) list.
stop_word_list = sorted(stop_words)
vect = [CountVectorizer(stop_words=stop_word_list),
        CountVectorizer(),
        HashingVectorizer(n_features=2500,alternate_sign=False),
        ]
tfidf = [TfidfTransformer(),
        None,  # None means "skip the TF-IDF step" for this candidate
        ]
clf = [SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None),
       MultinomialNB(),
       ]
Xs = [[X_train.content, X_test.content], [X_train_l, X_test_l]]
n_candidates = len(vect) * len(tfidf) * len(clf) * len(Xs)
best_acc = 0
# Default pipeline; `best` and `best_clf` always refer to the same object so
# either name can be used downstream.
best_clf = Pipeline([('vect', CountVectorizer(stop_words=stop_word_list)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])
best = best_clf
# itertools.product has no length, so the total is given to tqdm explicitly.
for v, t, c, x in tqdm(itertools.product(vect, tfidf, clf, Xs), total=n_candidates):
  x_tr, x_test = x
  # Clone the candidate so each pipeline owns fresh, unfitted estimators.
  # Without this the shared instances are refitted by later iterations and
  # the stored best pipeline no longer matches the accuracy it was chosen for.
  nb_clf = clone(Pipeline([('vect', v), ('tfidf', t), ('clf', c)]))
  nb_clf = nb_clf.fit(x_tr, y_train)
  acc = print_acc(nb_clf, x_test, y_test)
  if acc > best_acc:
    best_acc = acc
    best = best_clf = nb_clf

0it [00:00, ?it/s]

<class 'pandas.core.series.Series'>


1it [00:09,  9.22s/it]

84.77333333333334
<class 'pandas.core.series.Series'>


2it [00:18,  9.05s/it]

84.69333333333333
<class 'pandas.core.series.Series'>


3it [00:27,  9.03s/it]

86.66666666666667
<class 'pandas.core.series.Series'>


4it [00:35,  8.89s/it]

86.56
<class 'pandas.core.series.Series'>


5it [00:44,  8.82s/it]

88.9
<class 'pandas.core.series.Series'>


6it [00:56,  9.82s/it]

88.1
<class 'pandas.core.series.Series'>


7it [01:04,  9.38s/it]

85.98
<class 'pandas.core.series.Series'>


8it [01:13,  9.02s/it]

85.57333333333334
<class 'pandas.core.series.Series'>


9it [01:22,  9.24s/it]

84.48
<class 'pandas.core.series.Series'>


10it [01:32,  9.29s/it]

84.63333333333334
<class 'pandas.core.series.Series'>


11it [01:41,  9.37s/it]

86.36
<class 'pandas.core.series.Series'>


12it [01:50,  9.31s/it]

86.14666666666668
<class 'pandas.core.series.Series'>


13it [02:00,  9.30s/it]

85.7
<class 'pandas.core.series.Series'>


14it [02:09,  9.24s/it]

88.26
<class 'pandas.core.series.Series'>


15it [02:18,  9.19s/it]

84.95333333333333
<class 'pandas.core.series.Series'>


16it [02:27,  9.09s/it]

84.76666666666667
<class 'pandas.core.series.Series'>


17it [02:34,  8.50s/it]

82.92
<class 'pandas.core.series.Series'>


18it [02:41,  8.06s/it]

83.24666666666667
<class 'pandas.core.series.Series'>


19it [02:48,  7.74s/it]

81.66666666666667
<class 'pandas.core.series.Series'>


20it [02:55,  7.51s/it]

81.67999999999999
<class 'pandas.core.series.Series'>


21it [03:02,  7.34s/it]

77.61333333333333
<class 'pandas.core.series.Series'>


22it [03:09,  7.19s/it]

77.84
<class 'pandas.core.series.Series'>


23it [03:15,  7.09s/it]

81.19333333333333
<class 'pandas.core.series.Series'>


24it [03:22,  8.45s/it]

81.38





In [68]:
import pickle
# Serialize the selected pipeline to the 'text_classifier' file.
with open('text_classifier', 'wb') as model_file:
    model_file.write(pickle.dumps(best))