In [7]:
import nbimporter
import DataOperations
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score

In [18]:
def create_Vectorizer(clean_reviews=None, max_features=20000):
    """Build a dense bag-of-words count matrix from cleaned review texts.

    Args:
        clean_reviews: list of strings to vectorize. When omitted, a
            module-level ``clean_reviews`` variable is used if one exists
            (kept so that existing zero-argument calls keep working).
        max_features: vocabulary size cap forwarded to ``CountVectorizer``.

    Returns:
        The result of ``CountVectorizer.fit_transform(clean_reviews).toarray()``.

    Raises:
        ValueError: if no reviews were passed and no module-level
            ``clean_reviews`` is defined.
    """
    if clean_reviews is None:
        # Backward compatibility with the zero-argument form, which relied on
        # a kernel-global variable; prefer passing the reviews explicitly.
        clean_reviews = globals().get("clean_reviews")
    if clean_reviews is None:
        raise ValueError(
            "create_Vectorizer() needs the cleaned reviews: pass them as "
            "`clean_reviews` or define a module-level `clean_reviews`."
        )

    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag of words tool.
    print ("Creating the bag of words...\n")
    vectorizer = CountVectorizer(
        analyzer="word",
        tokenizer=None,
        preprocessor=None,
        stop_words=None,
        max_features=max_features,
    )

    # fit_transform() learns the vocabulary and turns the reviews into
    # feature vectors in one step; toarray() then makes the result dense.
    data_features = vectorizer.fit_transform(clean_reviews).toarray()
    print("Created the bag of words")

    return data_features


def tfidf_vectorizer(clean_reviews):
    """Fit a unigram + bigram TF-IDF model on the reviews and return the result.

    Args:
        clean_reviews: the review texts, handed unchanged to
            ``TfidfVectorizer.fit_transform``.

    Returns:
        Whatever ``TfidfVectorizer(ngram_range=(1, 2)).fit_transform`` produces
        for ``clean_reviews``.
    """
    print("Preparing to vectorize the text using tf-idf")
    # ngram_range=(1, 2): features are single words and adjacent word pairs.
    # fit_transform learns the vocabulary from every review and converts each
    # individual review into its TF-IDF row in the same call.
    tfidf_matrix = TfidfVectorizer(ngram_range=(1, 2)).fit_transform(clean_reviews)
    print("Completed tf-idf vectorization of text data")
    return tfidf_matrix


def gaussian_Naive_Bayes(vectors, stars, cv=2):
    """Cross-validate a Gaussian Naive Bayes classifier and report accuracy.

    Args:
        vectors: feature matrix exposing ``.toarray()`` (e.g. the sparse
            matrix returned by ``tfidf_vectorizer``).
        stars: target labels, one per row of ``vectors``.
        cv: cross-validation setting forwarded to ``cross_val_score``
            (default 2).

    Returns:
        The per-fold scores returned by ``cross_val_score``.
    """
    from sklearn.naive_bayes import GaussianNB
    print("Running Gaussian Naive Bayes")
    # GaussianNB needs dense input; convert once and reuse the array instead
    # of materialising it separately for every call.
    dense_vectors = vectors.toarray()
    # cross_val_score fits its own clone of the estimator for each fold, so a
    # separate fit on the full data beforehand would be wasted work.
    scores = cross_val_score(GaussianNB(), dense_vectors, stars, cv=cv)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    return scores
    
def linear_SVM(vectors, stars, cv=2):
    """Cross-validate a default ``LinearSVC`` and report accuracy.

    Args:
        vectors: feature matrix (e.g. the sparse matrix returned by
            ``tfidf_vectorizer``); passed to the estimator as-is.
        stars: target labels, one per row of ``vectors``.
        cv: cross-validation setting forwarded to ``cross_val_score``
            (default 2).

    Returns:
        The per-fold scores returned by ``cross_val_score``.
    """
    from sklearn.svm import LinearSVC
    print("Running Linear SVM Classifier")
    # LinearSVC accepts sparse matrices, so the features are not densified:
    # a dense copy of a unigram+bigram TF-IDF matrix can be very large.
    # cross_val_score fits a clone per fold, so no up-front fit is needed.
    scores = cross_val_score(LinearSVC(), vectors, stars, cv=cv)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    return scores
    
def random_Forest_Classifier(vectors, stars, cv=2, random_state=None):
    """Cross-validate a ``RandomForestClassifier`` and report accuracy.

    Args:
        vectors: feature matrix (e.g. the sparse matrix returned by
            ``tfidf_vectorizer``); passed to the estimator as-is.
        stars: target labels, one per row of ``vectors``.
        cv: cross-validation setting forwarded to ``cross_val_score``
            (default 2).
        random_state: seed forwarded to ``RandomForestClassifier``. The
            default ``None`` keeps the unseeded behaviour; pass an int to
            make the reported scores reproducible.

    Returns:
        The per-fold scores returned by ``cross_val_score``.
    """
    from sklearn.ensemble import RandomForestClassifier
    print("Running Random Forest Classifier")
    forest = RandomForestClassifier(random_state=random_state)
    # The forest accepts sparse input, so no dense copy is made, and
    # cross_val_score fits a clone per fold, so no up-front fit is needed.
    scores = cross_val_score(forest, vectors, stars, cv=cv)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    return scores
    
def tune_linear_svm(vectors, stars, cv=2):
    """Grid-search ``LinearSVC`` hyper-parameters and print the best setting.

    The grid covers ``C``, ``loss`` and ``class_weight``.

    Args:
        vectors: feature matrix (e.g. the sparse matrix returned by
            ``tfidf_vectorizer``); passed to the search as-is.
        stars: target labels, one per row of ``vectors``.
        cv: cross-validation setting forwarded to ``GridSearchCV``
            (default 2).

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    # GridSearchCV lives in sklearn.model_selection (the same module this
    # file already imports cross_val_score from); the old sklearn.grid_search
    # module no longer exists in current scikit-learn releases.
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import LinearSVC
    print("Obtaining best hyper-parameters for Linear SVM Classifier")
    param_grid = {
        "C": [0.05, 0.5, 1, 1.5, 2, 5, 10],
        "loss": ["hinge", "squared_hinge"],
        "class_weight": [None, "balanced"],
    }

    # LinearSVC accepts sparse matrices, so the features are not densified.
    search = GridSearchCV(LinearSVC(), param_grid, cv=cv).fit(vectors, stars)

    print(search.best_params_)
    print(str(search.best_score_))

    return search

def logistic_Regression(vectors, stars, cv=2):
    """Cross-validate a default ``LogisticRegression`` and report accuracy.

    Args:
        vectors: feature matrix (e.g. the sparse matrix returned by
            ``tfidf_vectorizer``); passed to the estimator as-is.
        stars: target labels, one per row of ``vectors``.
        cv: cross-validation setting forwarded to ``cross_val_score``
            (default 2).

    Returns:
        The per-fold scores returned by ``cross_val_score``.
    """
    from sklearn.linear_model import LogisticRegression
    # Progress line added so this function announces itself like the other
    # classifier helpers in this file do.
    print("Running Logistic Regression")
    # LogisticRegression accepts sparse input, so no dense copy is made, and
    # cross_val_score fits a clone per fold, so no up-front fit is needed.
    scores = cross_val_score(LogisticRegression(), vectors, stars, cv=cv)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    return scores

In [19]:
def allOperations(userID):
    """Run every classifier in this notebook on one user's reviews.

    Reads the reviews via ``DataOperations.readData()``, keeps those whose
    ``'user_id'`` equals ``str(userID)``, cleans the texts with
    ``DataOperations.clean_Reviews``, vectorizes them with
    ``tfidf_vectorizer`` and then calls each model helper in turn with the
    vectors and the matching ``'stars'`` ratings.

    Args:
        userID: identifier of the user to analyse; compared as a string.

    Raises:
        ValueError: if no review belongs to ``userID``.
    """
    target_id = str(userID)

    # Collect texts and ratings in a single pass so the two lists always stay
    # aligned, even if readData() hands back a one-shot iterator.
    texts, stars = [], []
    for review in DataOperations.readData():
        if review['user_id'] == target_id:
            texts.append(review['text'])
            stars.append(review['stars'])

    if not texts:
        # Fail early with a clear message instead of an error from deep
        # inside the vectorizer or a classifier.
        raise ValueError("No reviews found for user_id %r" % target_id)

    clean_reviews = DataOperations.clean_Reviews(texts)
    vectors = tfidf_vectorizer(clean_reviews)
    gaussian_Naive_Bayes(vectors, stars)
    linear_SVM(vectors, stars)
    random_Forest_Classifier(vectors, stars)
    tune_linear_svm(vectors, stars)
    logistic_Regression(vectors, stars)