In [47]:
import pandas as pd
from wordcloud import WordCloud,STOPWORDS

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV

# Splitting the dataset into train and test sets

In [48]:
# Load the review dataset from its pickle file (the file name suggests it was
# cleaned beforehand; that step is not part of this section).
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
imdb_data = pd.read_pickle('pickles/cleaned_data2.pkl')

# Model inputs come from the `review` column, targets from `sentiment`.
X, y = imdb_data.review, imdb_data.sentiment

# Hold back 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Training Model

### Logistic Regression Model

In [54]:
from sklearn.linear_model import LogisticRegression
# accuracy_score is needed by every evaluation step below; it is imported here,
# at its first use, so the notebook runs on a fresh kernel without a NameError.
from sklearn.metrics import accuracy_score


# Token counts -> TF-IDF weighting -> L2-regularised logistic regression.
lr_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=42)),
])

# Fit the logistic regression pipeline on the training split
lr_clf.fit(X_train, y_train)

# Measure accuracy on the held-out test split
predicted = lr_clf.predict(X_test)
lr_accuracy = accuracy_score(y_test, predicted)
print('Accuracy score for Logistic Regression: ', lr_accuracy)



Accuracy score for Logistic Regression:  0.8904


### Naive Bayes Classifier

In [92]:
from sklearn.naive_bayes import MultinomialNB


# Token counts -> TF-IDF weighting -> multinomial Naive Bayes, fitted on the
# training split in the same statement (Pipeline.fit returns the pipeline itself).
nb_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]).fit(X_train, y_train)

# Measure accuracy on the held-out test split
predicted = nb_clf.predict(X_test)
nb_accuracy = accuracy_score(y_true=y_test, y_pred=predicted)
print('Accuracy score for Naive Bayes Classifier: ', nb_accuracy)

Accuracy score for Naive Bayes Classifier:  0.8598


### Support Vector Classifier

In [58]:
from sklearn.svm import LinearSVC

# Token counts -> TF-IDF weighting -> linear support vector classifier.
# The classifier is seeded (same seed as the train/test split) so that the
# reported accuracy is reproducible from run to run.
svc_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(random_state=42)),
])

# Fit the SVM classifier pipeline on the training split
svc_clf.fit(X_train, y_train)

# Measure accuracy on the held-out test split
predicted = svc_clf.predict(X_test)
svc_accuracy = accuracy_score(y_test, predicted)
print('Accuracy score for SVM Classifier: ', svc_accuracy)

Accuracy score for SVM Classifier:  0.8933


### Regularized linear models with stochastic gradient descent (SGD) learning

In [81]:
from sklearn.linear_model import SGDClassifier

# Token counts -> TF-IDF weighting -> linear model trained with SGD.
# SGD shuffles the training data, so the classifier is seeded (same seed as the
# train/test split); otherwise the selected hyper-parameters and the final
# score can differ between runs.
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(random_state=42)),
])

# Search space: keys are '<pipeline step>__<parameter>'.
# 3 * 2 * 1 * 2 * 2 = 24 candidate combinations.
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'clf__max_iter': (20,),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    # Further options that are currently switched off to keep the search small:
    # 'vect__max_features': (None, 5000, 10000, 50000),
    # 'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    # 'clf__max_iter': (10, 50, 80),
}

# n_jobs=-1 uses all available cores; verbose=1 prints progress of the search.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

In [102]:
# Report the selected value of every hyper-parameter that was part of the search.
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# Measure accuracy on the held-out test split; predictions come from the
# best estimator found by the grid search.
predicted = grid_search.predict(X_test)
sgd_accuracy = accuracy_score(y_true=y_test, y_pred=predicted)
print('Accuracy score for SGD Classifier: ', sgd_accuracy)

Best parameters set:
	clf__alpha: 1e-05
	clf__max_iter: 20
	clf__penalty: 'l2'
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)
Accuracy score for SGD Classifier:  0.909
