The data is the Large Movie Review Dataset (aclImdb), taken from: http://ai.stanford.edu/~amaas/data/sentiment/



In [26]:
!pip install beautifulsoup4



In [27]:
import tarfile
import glob
import os
import nltk
import sklearn
import re
import pickle

from bs4 import BeautifulSoup

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer

# Fetch the NLTK data packages this notebook relies on (already-present
# packages are reported as up-to-date and skipped).
for _nltk_package in ('punkt', 'wordnet', 'stopwords'):
  nltk.download(_nltk_package)

# Root of the extracted dataset and the folder holding cached intermediates.
imdb_path = "./aclImdb"
stored_data_dir = "./storedData"

# English stop words, kept as a set for fast membership tests.
english_stopwords = stopwords.words("english")
stopset = set(english_stopwords)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
# Unzip the tar file if it is present

#tf = tarfile.open("aclImdb_v1.tar")
#tf.extractall()

In [0]:
# loading imdb data into data/labels variables
def load_imdb(directory):
  """Load the IMDB reviews below `directory` as shuffled train/test splits.

  Every ``*.txt`` file in ``<directory>/{train,test}/{neg,pos}/`` is read as
  one review and labelled with the name of the folder it came from
  ("neg" or "pos").

  Args:
    directory: root folder of the extracted dataset.

  Returns:
    (trainData, trainLabels, testData, testLabels). Data and labels of a
    split are shuffled together, so index i of the data always belongs to
    index i of the labels.
  """
  data = {}
  labels = {}
  for dataset_type in ["train", "test"]:
    data[dataset_type] = {}
    labels[dataset_type] = {}
    for outcome in ["neg", "pos"]:
      pattern = os.path.join(directory, dataset_type, outcome, '*.txt')

      reviews = []
      for path in glob.glob(pattern):
        # Explicit encoding: do not depend on the platform's locale default.
        with open(path, encoding="utf-8") as file_data:
          reviews.append(file_data.read())

      data[dataset_type][outcome] = reviews
      labels[dataset_type][outcome] = [outcome] * len(reviews)

  trainData = data['train']['neg'] + data['train']['pos']
  trainLabels = labels['train']['neg'] + labels['train']['pos']
  trainData, trainLabels = sklearn.utils.shuffle(trainData, trainLabels)

  testData = data['test']['neg'] + data['test']['pos']
  # Labels must come from `labels`, not `data`: mixing them up put the raw
  # positive review texts into the label list.
  testLabels = labels['test']['neg'] + labels['test']['pos']
  testData, testLabels = sklearn.utils.shuffle(testData, testLabels)

  return trainData, trainLabels, testData, testLabels

In [30]:
# Read both splits from disk; only the sizes of the training split are reported.
trainData, trainLabels, testData, testLabels = load_imdb(directory=imdb_path)
print("trainData {}  trainLabels {}".format(*map(len, (trainData, trainLabels))))

trainData 25000  trainLabels 25000


In [31]:
# preprocessing 
# removing html tags
# removing punctuations
# tokenizing
# removing stopwords
# Lemmatization

def preprocessDataImpl(input_data):
  """Convert one raw review string into a list of lemmatized tokens.

  Steps, in order: strip HTML markup and lowercase the text, replace every
  character that is not a letter or digit with a space, tokenize, drop the
  tokens found in `stopset`, lemmatize what is left.

  Args:
    input_data: raw review text (may contain HTML markup).

  Returns:
    list of str: the lemmatized tokens of the review.
  """
  text = BeautifulSoup(input_data, 'html5lib').get_text().lower()
  text = re.sub(r"[^a-zA-Z0-9]", " ", text)

  tokens = [word for word in word_tokenize(text) if word not in stopset]

  # One lemmatizer for the whole review instead of a new instance per token.
  lemmatizer = WordNetLemmatizer()
  return [lemmatizer.lemmatize(word) for word in tokens]

# saving data with pickle during processing

def preprocessData(trainData, testData, trainLabels, testLabels, fileName = "preprocessed_data.pkl"):
    """Tokenize both splits with `preprocessDataImpl`, caching the result on disk.

    Args:
        trainData, testData: lists of raw review strings.
        trainLabels, testLabels: labels aligned with the data lists.
        fileName: cache file name inside `stored_data_dir`; pass None to
            disable caching (nothing is read or written).

    Returns:
        (words_train, words_test, labels_train, labels_test).
        On a cache hit all four values come from the cache file and the
        arguments are ignored: the cached token lists are only aligned with
        the labels that were stored next to them. The cache is not
        invalidated when the inputs change; delete the file to force a rebuild.

    Raises:
        KeyError: if a readable cache file lacks the expected entries.
    """
    cache_path = None if fileName is None else os.path.join(stored_data_dir, fileName)

    cache_data = None
    if cache_path is not None:
        try:
            # NOTE: unpickling can execute arbitrary code; only load cache
            # files that this notebook wrote itself.
            with open(cache_path, "rb") as f:
                cache_data = pickle.load(f)
            print("Read preprocessed data from cache file:", fileName)
        except FileNotFoundError:
            pass  # No cache yet: build it below.
        except Exception as err:
            # Best effort: an unreadable cache is rebuilt rather than fatal,
            # but the reason is reported instead of being swallowed.
            print("Ignoring unreadable cache file {}: {}".format(fileName, err))

    if cache_data is not None:
        # Labels were written as 'trainLabels'/'testLabels' by earlier
        # versions of this function; accept both spellings.
        labels_train = (cache_data['labels_train'] if 'labels_train' in cache_data
                        else cache_data['trainLabels'])
        labels_test = (cache_data['labels_test'] if 'labels_test' in cache_data
                       else cache_data['testLabels'])
        return cache_data['words_train'], cache_data['words_test'], labels_train, labels_test

    # Cache miss (or caching disabled): preprocess every review.
    words_train = list(map(preprocessDataImpl, trainData))
    words_test = list(map(preprocessDataImpl, testData))
    labels_train, labels_test = trainLabels, testLabels

    if cache_path is not None:
        cache_folder = os.path.dirname(cache_path)
        if cache_folder:
            os.makedirs(cache_folder, exist_ok=True)
        cache_data = dict(words_train=words_train, words_test=words_test,
                          labels_train=labels_train, labels_test=labels_test)
        with open(cache_path, "wb") as f:
            pickle.dump(cache_data, f)
        print("Wrote preprocessed data to cache file:", fileName)

    return words_train, words_test, labels_train, labels_test

# Tokenize both splits, or reuse the pickled result of an earlier run.
words_train, words_test, labels_train, labels_test = preprocessData(
    trainData=trainData,
    testData=testData,
    trainLabels=trainLabels,
    testLabels=testLabels,
)

Read preprocessed data from cache file: preprocessed_data.pkl


In [0]:
import numpy as np

try:
    # joblib is a standalone package; newer scikit-learn releases no longer
    # re-export it as sklearn.externals.joblib.
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Bag-of-words features, cached to / loaded from pickle files
def extract_BoW_features(trainData, testData, vocabulary_size=5000,
                         cache_dir=stored_data_dir, fileName="bow_features.pkl"):
    """Build bag-of-words count features for both splits, caching them on disk.

    The vectorizer is given identity functions as preprocessor and tokenizer,
    so each document is consumed exactly as passed in (the caller in this
    notebook passes lists of tokens). The vocabulary is fitted on `trainData`
    only and then applied to `testData`.

    Args:
        trainData, testData: the documents of each split.
        vocabulary_size: passed to CountVectorizer as `max_features`.
        cache_dir: folder holding the cache file.
        fileName: cache file name; pass None to disable caching.

    Returns:
        (features_train, features_test, vocabulary): two dense arrays
        (`.toarray()` of the count matrices) and the fitted `vocabulary_`.
        On a cache hit all three come from the cache file and the arguments
        are ignored, including `vocabulary_size`; delete the file to rebuild.

    Raises:
        KeyError: if a readable cache file lacks the expected entries.
    """
    cache_path = None if fileName is None else os.path.join(cache_dir, fileName)

    cache_data = None
    if cache_path is not None:
        try:
            # NOTE: joblib files are pickle-based and can execute arbitrary
            # code when loaded; only load caches this notebook wrote itself.
            with open(cache_path, "rb") as f:
                cache_data = joblib.load(f)
            print("Read features from cache file:", fileName)
        except FileNotFoundError:
            pass  # No cache yet: build it below.
        except Exception as err:
            # Best effort: an unreadable cache is rebuilt rather than fatal,
            # but the reason is reported instead of being swallowed.
            print("Ignoring unreadable cache file {}: {}".format(fileName, err))

    if cache_data is not None:
        return (cache_data['features_train'], cache_data['features_test'],
                cache_data['vocabulary'])

    vectorizer = CountVectorizer(max_features=vocabulary_size,
                                 preprocessor=lambda x: x, tokenizer=lambda x: x)
    features_train = vectorizer.fit_transform(trainData).toarray()
    features_test = vectorizer.transform(testData).toarray()
    # Taken outside the caching branch so it is also defined when fileName is None.
    vocabulary = vectorizer.vocabulary_

    if cache_path is not None:
        cache_folder = os.path.dirname(cache_path)
        if cache_folder:
            os.makedirs(cache_folder, exist_ok=True)
        cache_data = dict(features_train=features_train, features_test=features_test,
                          vocabulary=vocabulary)
        with open(cache_path, "wb") as f:
            joblib.dump(cache_data, f)
        print("Wrote features to cache file:", fileName)

    return features_train, features_test, vocabulary

In [0]:
# Vectorize the token lists, keeping the default vocabulary size and cache file.
features_train, features_test, vocabulary = extract_BoW_features(
    trainData=words_train,
    testData=words_test,
)

In [0]:
# Naive Bayes classification
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import normalize

# Rescale each row (one review) of the count matrices with sklearn's normalize.
features_train = normalize(features_train, axis=1)
features_test = normalize(features_test, axis=1)

# Same fit / report sequence for both Naive Bayes variants.
for classifier in (GaussianNB(), MultinomialNB()):
  classifier.fit(features_train, labels_train)
  print(classifier)

  train_score = classifier.score(features_train, labels_train)
  test_score = classifier.score(features_test, labels_test)
  print(" scores :\n train {} \n test {} ".format(train_score, test_score))