In [1]:
import os
import numpy as np
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

from data_preparation import DataPrep
from feature_engineering import FeatureEngineer

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

#from cnn_text import CNN_Text
#from rnn_text import RNN_Text

In [2]:
def train_model(classifier, feature_vector_train, train_label, feature_vector_valid, valid_label):
    """Fit ``classifier`` on the training split and return its validation accuracy.

    Args:
        classifier: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        feature_vector_train: Training features handed to ``classifier.fit``.
        train_label: Training labels handed to ``classifier.fit``.
        feature_vector_valid: Validation features handed to ``classifier.predict``.
        valid_label: Ground-truth labels for ``feature_vector_valid``.

    Returns:
        The result of ``metrics.accuracy_score`` for the validation predictions.
    """
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, train_label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    # accuracy_score's signature is (y_true, y_pred). Accuracy itself is
    # symmetric, but keeping the documented order means swapping in an
    # asymmetric metric later (precision, recall, ...) stays correct.
    return metrics.accuracy_score(valid_label, predictions)

In [3]:
def logistic_regression_model(data, features):
    """Fit a logistic-regression baseline per feature set and print each validation accuracy.

    A fresh ``linear_model.LogisticRegression()`` is trained through
    ``train_model`` for every feature representation listed below, always
    against ``data.y_train`` / ``data.y_valid``.

    Args:
        data: Object exposing ``y_train`` and ``y_valid``.
        features: Object exposing ``X_train_<kind>`` and ``X_valid_<kind>`` for
            the kinds ``count``, ``tfidf``, ``tfidf_ngram`` and
            ``tfidf_ngram_chars``.

    Returns:
        None. One line per feature set is printed.
    """
    # (printed label, attribute suffix on `features`); the trailing comment on
    # each row is the accuracy recorded in the original notebook.
    experiments = (
        ("LR, Count Vectors Validation: ", "count"),                 # 0.6958262980160406
        ("LR, WordLevel TF-IDF Validation: ", "tfidf"),              # 0.6873733642887294
        ("LR, N-Gram Vectors Validation: ", "tfidf_ngram"),          # 0.660658505698607
        ("LR, CharLevel Vectors Validation: ", "tfidf_ngram_chars"), # 0.6990977205571971
    )

    for label, suffix in experiments:
        # Attributes are looked up lazily, one experiment at a time, so an
        # earlier experiment still reports even if a later matrix is missing.
        score = train_model(
            linear_model.LogisticRegression(),
            getattr(features, "X_train_" + suffix),
            data.y_train,
            getattr(features, "X_valid_" + suffix),
            data.y_valid,
        )
        print(label, score)

In [10]:
def logistic_regression_model_v2(data):
    """Grid-search a TF-IDF + logistic-regression pipeline and print its accuracy.

    A ``Pipeline`` of ``TfidfVectorizer`` followed by ``LogisticRegression`` is
    tuned with 5-fold ``GridSearchCV`` (scoring: accuracy, all cores). Two
    grids are searched over penalty (l1/l2) and C (1, 10, 100): one with the
    vectorizer's default weighting, and one with idf weighting and
    normalisation switched off (``use_idf=False``, ``norm=None``).

    Args:
        data: Object exposing ``X_train``, ``y_train``, ``X_test`` and
            ``y_test``. ``X_train`` / ``X_test`` are fed straight into the
            vectorizer, so they are presumably raw text documents -- confirm
            against ``DataPrep``.

    Returns:
        None. The best parameters, best CV accuracy and test accuracy are
        printed.
    """
    tfidf = TfidfVectorizer(strip_accents=None,
                            lowercase=False,
                            preprocessor=None)

    param_grid = [{'vect__ngram_range': [(1, 1)],
                   'vect__stop_words': [None],
                   'vect__tokenizer': [None],
                   'clf__penalty': ['l1', 'l2'],
                   'clf__C': [1.0, 10.0, 100.0]},
                  {'vect__ngram_range': [(1, 1)],
                   'vect__stop_words': [None],
                   'vect__tokenizer': [None],
                   'vect__use_idf': [False],
                   'vect__norm': [None],
                   'clf__penalty': ['l1', 'l2'],
                   'clf__C': [1.0, 10.0, 100.0]},
                  ]

    # The grid searches both l1 and l2 penalties, so the solver must support
    # both. 'liblinear' does; the library default since scikit-learn 0.22
    # ('lbfgs') rejects l1, which would make half of the candidates fail.
    # Pinning the solver keeps the search valid across versions.
    lr_tfidf = Pipeline([('vect', tfidf),
                         ('clf', LogisticRegression(random_state=0,
                                                    solver='liblinear'))])

    gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                               scoring='accuracy',
                               cv=5,
                               verbose=1,
                               n_jobs=-1)

    gs_lr_tfidf.fit(data.X_train, data.y_train)

    print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
    print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_) # 0.699

    # best_estimator_ is the pipeline refit with the winning parameters.
    clf = gs_lr_tfidf.best_estimator_
    print('Test Accuracy: %.3f' % clf.score(data.X_test, data.y_test)) # 0.699

In [11]:
def main():
    """Run the experiment end to end: prepare data, build features, evaluate both LR variants."""
    dataset = DataPrep()
    engineered = FeatureEngineer(dataset)

    # Baselines trained on the pre-built feature matrices.
    logistic_regression_model(dataset, engineered)

    # Grid-searched TF-IDF pipeline; it receives only the prepared data object.
    logistic_regression_model_v2(dataset)

    # Neural baselines are still disabled, as in the original cell:
    #cnn_model()
    #rnn_model()

In [12]:
# Entry point: run the full experiment when this cell/script executes as
# __main__.
if __name__ == "__main__":
    main()
    

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 19.5min finished


Best parameter set: {'clf__C': 1.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': None} 
CV Accuracy: 0.699
Test Accuracy: 0.699


In [None]:
# NOTE(review): this cell relies on names that no visible cell defines or
# imports (torch, F, LSTMClassifier, vocab_size, word_embeddings, TEXT,
# train_iter, valid_iter, test_iter, eval_model). F is presumably
# torch.nn.functional -- confirm, and add the missing imports/definitions
# before running.

# Hyperparameters for the LSTM sentiment classifier.
learning_rate = 2e-5
batch_size = 32
output_size = 2
hidden_size = 256
embedding_length = 300

# The call previously read `LSTMClassifier(vocab_size, , word_embeddings)`,
# which is a SyntaxError.
# NOTE(review): the hyperparameters above are assumed to be the missing
# constructor arguments, in this order -- confirm against
# LSTMClassifier.__init__.
model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
loss_fn = F.cross_entropy

# NOTE(review): `train_model` here must be a training loop accepting
# (model, iterator, epoch) and returning (loss, accuracy). The five-argument
# scikit-learn helper of the same name defined earlier in this notebook would
# raise TypeError if it is the one in scope.
for epoch in range(10):
    train_loss, train_acc = train_model(model, train_iter, epoch)
    val_loss, val_acc = eval_model(model, valid_iter)

    # `.3f` (three decimals) for the validation loss, matching the train loss;
    # the previous `:3f` set a field width of 3 instead of a precision.
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:.3f}, Val. Acc: {val_acc:.2f}%')

test_loss, test_acc = eval_model(model, test_iter)
print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

# Let us now predict the sentiment on a single sentence just for the testing purpose.
test_sen1 = "This is one of the best creation of Nolan. I can say, it's his magnum opus. Loved the soundtrack and especially those creative dialogues."
test_sen2 = "Ohh, such a ridiculous movie. Not gonna recommend it to anyone. Complete waste of time and money."

# Tokenise each sentence and map tokens to vocabulary indices, wrapped in an
# outer list so the result is a batch of one.
test_sen1 = TEXT.preprocess(test_sen1)
test_sen1 = [[TEXT.vocab.stoi[x] for x in test_sen1]]

test_sen2 = TEXT.preprocess(test_sen2)
test_sen2 = [[TEXT.vocab.stoi[x] for x in test_sen2]]

# Only test_sen1 is scored below; test_sen2 is prepared but not used.
test_sen = np.asarray(test_sen1)
test_sen = torch.LongTensor(test_sen)
test_tensor = test_sen
# Move to the GPU only when one exists, so the cell does not crash on
# CPU-only machines.
if torch.cuda.is_available():
    test_tensor = test_tensor.cuda()
model.eval()
# `Variable(..., volatile=True)` was removed in PyTorch 0.4; torch.no_grad()
# is the supported way to disable gradient tracking for inference.
with torch.no_grad():
    output = model(test_tensor, 1)
out = F.softmax(output, 1)
# Class index 1 is reported as positive, anything else as negative.
if (torch.argmax(out[0]) == 1):
    print ("Sentiment: Positive")
else:
    print ("Sentiment: Negative")