In [1]:
from collections import Counter
import pandas as pd
from string import punctuation
import spacy
nlp = spacy.load("en_core_web_sm")
import textblob

In [27]:
def tokenize_and_clean_text(text):
    """
    Method that implements part of NLP pipeline of cleaning text:
    1. Tokenization
    2. Removing stopwords (commonly known stopwords, pronouns, keywords with little info, words less than 2 chars)
    3. Lemmatization

    Args: 
    text to be tokenized and cleaned

    Returns: list of tokens (spacy doc) and list of lemmatized words (list of strings)
    """
    stopwords = spacy.lang.en.stop_words.STOP_WORDS

    post_doc = nlp(text)
    
    # Clean spacy doc and structure as 2D numpy array of spacy tokens
    cleaned_post_doc = []
    for sent in post_doc.sents:
        cleaned_sent = [
            token.lemma_.lower().strip() for token in sent 
            if not token.is_stop and not token.is_punct and token.lemma_ != '-PRON-'
        ]
        if len(cleaned_sent) > 0:
            cleaned_post_doc.append(cleaned_sent)
    
    return cleaned_post_doc

In [28]:
def flatten_text(text):
    return [token for sent in text for token in sent]

In [4]:
def get_most_common_keywords(post):
    cleaned_post = tokenize_and_clean_text(post)
    print(cleaned_post)
    flattened_cleaned_post = flatten_text(cleaned_post)
    print(flattened_cleaned_post)
    word_freq = Counter(flattened_cleaned_post)
    print(word_freq)

In [5]:
comment = 'FB is hiring a new chief product officer He\'s supposed to be a veteran in the internet business. the internet is great'
get_most_common_keywords(comment)

FB is hiring a new chief product officer He's supposed to be a veteran in the internet business. the internet is great
[['fb', 'hire', 'new', 'chief', 'product', 'officer'], ['suppose', 'veteran', 'internet', 'business'], ['internet', 'great']]
['fb', 'hire', 'new', 'chief', 'product', 'officer', 'suppose', 'veteran', 'internet', 'business', 'internet', 'great']
<class 'collections.Counter'>


In [11]:
import os
import re
import tensorflow as tf
from tensorflow import keras
from transformers import *
from sklearn.model_selection import train_test_split

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [12]:
# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
    data = {}
    data["sentence"] = []
    data["sentiment"] = []
    for file_path in os.listdir(directory):
        with tf.gfile.GFile(os.path.join(directory, file_path), "r") as f:
            data["sentence"].append(f.read())
            data["sentiment"].append(re.match("\d+_(\d+)\.txt", file_path).group(1))
    return pd.DataFrame.from_dict(data)

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
    pos_df = load_directory_data(os.path.join(directory, "pos"))
    neg_df = load_directory_data(os.path.join(directory, "neg"))
    pos_df["polarity"] = 1
    neg_df["polarity"] = 0
    return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)

# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
    dataset = tf.keras.utils.get_file(
        fname="aclImdb.tar.gz", 
        origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
        extract=True
    )

    train_df = load_dataset(os.path.join(os.path.dirname(dataset), "aclImdb", "train"))
    test_df = load_dataset(os.path.join(os.path.dirname(dataset), "aclImdb", "test"))

    return train_df, test_df

In [22]:
train, test = download_and_load_datasets()

In [31]:
def clean_sentence(sentence):
    return ' '.join(flatten_text(tokenize_and_clean_text(sentence)))

In [32]:
train['cleaned_sentence'] = train['sentence'].apply(
    lambda sentence: clean_sentence(sentence)
)

In [56]:
test['cleaned_sentence'] = test['sentence'].apply(
    lambda sentence: clean_sentence(sentence)
)

In [34]:
print(train['sentence'].iloc[0])
print(train['cleaned_sentence'].iloc[0])

David Webb Peoples meets Paul Anderson...if it already sounds weird to you, then you are right, because it is.<br /><br />Peoples is known for his scripts with moral implications of what is right and wrong, the value of life, etc... He covered these issues in Bladerunner, Unforgiven, and pretty much in all of his screenplays there is something along those lines.<br /><br />Paul Anderson's first successful movie was a violent thriller. Not surprisingly so have all of his other movies! And here is a violent thriller with moral implications!<br /><br />Peoples' script is quite apparent in the first half of the movie. Soldiers trained from birth, taught to kill, and never had a normal life. They are replaced by better, genetically engineered soldiers and Todd, one of the original soldiers, is left on a planet and left for dead. There he must cope with a group of refugees, some want him to stay others hate him and there is an interesting drama here. BUT THEN...<br /><br />...The bullets sta

In [35]:
DATA_COLUMN = 'cleaned_sentence'
LABEL_COLUMN = 'polarity'
# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'
label_list = [0, 1]

In [48]:
from textblob import TextBlob

def run_textblob_sentiment(sentences, sentiment_labels):
    sentiments = []
    for sentence in sentences:
        if TextBlob(sentence).polarity > 0:
            sentiments.append(1)
        else:
            sentiments.append(0)
    
    print('Model accuracy: ', accuracy_score(sentiments, sentiment_labels))

In [66]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib

def run_random_forest_classification(train_features, test_features, y_train, y_test):
    max_features = int(.8 * train_features.shape[1])
    rf = RandomForestClassifier(max_depth=30, n_estimators=500, max_features=max_features, n_jobs=-1)
    
    rf.fit(train_features, y_train)
    train_pred = rf.predict(train_features)
    train_model_accuracy = accuracy_score(train_pred, y_train)
    
    test_pred = rf.predict(test_features)
    print('Dist of predictions: ', np.unique(test_pred, return_counts=True))
    print('Dist of actual deltas: ', np.unique(y_test, return_counts=True))
    test_model_accuracy = accuracy_score(test_pred, y_test)
    print('Test model accuracy: ', test_model_accuracy)
    
    filename = 'tendies/finalized_sentiment_model.joblib'
    joblib.dump(rf, filename)

    # load the model from disk
    '''
    loaded_model = joblib.load(filename)
    result = loaded_model.predict(X_test, Y_test)
    print(result)
    '''

In [61]:
vectorizer = TfidfVectorizer()
final_train = train[DATA_COLUMN].append(test[DATA_COLUMN])
X = vectorizer.fit_transform(final_train)
X_labels = train[LABEL_COLUMN].append(test[LABEL_COLUMN])
test_size = 0.2

train_features, val_features, train_labels, val_labels = train_test_split(
    X, X_labels, test_size=test_size, stratify=X_labels
)

In [62]:
train_features.shape

(40000, 91597)

In [67]:
run_random_forest_classification(
    train_features, val_features, train_labels, val_labels
)

Dist of predictions:  (array([0, 1]), array([4693, 5307]))
Dist of actual deltas:  (array([0, 1]), array([5000, 5000]))
Test model accuracy:  0.7945


In [52]:
run_textblob_sentiment(train[DATA_COLUMN].iloc[0:20000], train[LABEL_COLUMN][0:20000])

Model accuracy:  0.71715
