<a href="https://colab.research.google.com/github/maab2198/nlp2021/blob/labs/lab6/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# This notebook relies on the spaCy 2.x training API (nlp.create_pipe, passing a
# component object to nlp.add_pipe, nlp.begin_training, nlp.update(texts, labels)).
# An unpinned install pulls spaCy 3.x, where those calls fail, so stay on the 2.x line.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q "spacy>=2.2,<3"
!python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 457 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [6]:
import spacy
# Load the small English pipeline once; later cells reuse `nlp` and `doc`.
nlp = spacy.load("en_core_web_sm")

text = """
Dave watched as the forest burned up on the hill,
only a few miles from his house. The car had
been hastily packed and Marta was inside trying to round
up the last of the pets. "Where could she be?" he wondered
as he continued to wait for Marta to appear with the pets.
"""

# Running the pipeline over the raw string produces the parsed document.
doc = nlp(text)

# Iterating a parsed document yields its tokens, so list() collects them all.
token_list = list(doc)

print(token_list)

[
, Dave, watched, as, the, forest, burned, up, on, the, hill, ,, 
, only, a, few, miles, from, his, house, ., The, car, had, 
, been, hastily, packed, and, Marta, was, inside, trying, to, round, 
, up, the, last, of, the, pets, ., ", Where, could, she, be, ?, ", he, wondered, 
, as, he, continued, to, wait, for, Marta, to, appear, with, the, pets, ., 
]


In [7]:
# Drop every token that is flagged as a stop word (punctuation and newline
# tokens are not stop words, so they remain in the result).
filtered_tokens = [tok for tok in doc if not tok.is_stop]
print(filtered_tokens)

[
, Dave, watched, forest, burned, hill, ,, 
, miles, house, ., car, 
, hastily, packed, Marta, inside, trying, round, 
, pets, ., ", ?, ", wondered, 
, continued, wait, Marta, appear, pets, ., 
]


In [8]:
# Describe each remaining token next to its lemma (its normalized base form).
lemmas = [f"Token: {tok}, lemma: {tok.lemma_}" for tok in filtered_tokens]

print(lemmas)

['Token: \n, lemma: \n', 'Token: Dave, lemma: Dave', 'Token: watched, lemma: watch', 'Token: forest, lemma: forest', 'Token: burned, lemma: burn', 'Token: hill, lemma: hill', 'Token: ,, lemma: ,', 'Token: \n, lemma: \n', 'Token: miles, lemma: mile', 'Token: house, lemma: house', 'Token: ., lemma: .', 'Token: car, lemma: car', 'Token: \n, lemma: \n', 'Token: hastily, lemma: hastily', 'Token: packed, lemma: pack', 'Token: Marta, lemma: Marta', 'Token: inside, lemma: inside', 'Token: trying, lemma: try', 'Token: round, lemma: round', 'Token: \n, lemma: \n', 'Token: pets, lemma: pet', 'Token: ., lemma: .', 'Token: ", lemma: "', 'Token: ?, lemma: ?', 'Token: ", lemma: "', 'Token: wondered, lemma: wonder', 'Token: \n, lemma: \n', 'Token: continued, lemma: continue', 'Token: wait, lemma: wait', 'Token: Marta, lemma: Marta', 'Token: appear, lemma: appear', 'Token: pets, lemma: pet', 'Token: ., lemma: .', 'Token: \n, lemma: \n']


In [9]:
# Show the vector of the second non-stop-word token ("Dave" in the output above;
# index 0 is the leading newline token).
filtered_tokens[1].vector

array([ 1.6193167e+00, -2.7117019e+00, -6.8552375e-01,  2.6652899e+00,
        4.5226312e+00,  2.8338575e+00,  6.1740106e-01,  9.5401168e-01,
        2.6201737e+00,  2.5994289e+00,  5.9061027e+00, -1.7552420e-01,
       -8.7880111e-01,  4.8553795e-03, -1.7236035e+00, -1.7494547e+00,
       -1.0313329e+00,  1.6518956e-01,  5.3024960e-01, -3.2018152e-01,
       -2.6411371e+00, -2.4750671e+00, -5.0014794e-01, -3.3213449e+00,
       -5.3300351e-01,  2.3968523e+00,  1.5485952e+00, -2.2231889e+00,
       -1.2597762e+00, -5.6858027e-01, -9.4768405e-02, -1.3759263e+00,
       -1.0165324e+00,  5.6860483e-01,  2.6817162e+00, -3.7418640e+00,
        2.7644300e+00, -1.9967061e+00, -2.9627855e+00, -1.0863459e-01,
        2.7437925e+00,  2.5450244e+00,  1.6124392e+00, -3.3037057e+00,
       -2.4419413e+00,  9.5868981e-01,  1.1957375e+00, -1.2429583e+00,
       -1.2961357e+00,  2.8916957e+00, -2.8091950e+00, -3.1826324e+00,
       -2.4809690e+00, -2.5254309e-01, -2.0454383e+00,  3.0948038e+00,
      

In [10]:
import os

# NOTE(review): os.path.relpath('./') evaluates to '.', so this chdir leaves the
# current working directory unchanged — confirm whether another directory was intended.
os.chdir(os.path.relpath('./'))

In [11]:
import tarfile

# Unpack the IMDB review archive into the current working directory.
# The archive is looked up relative to the working directory.
fname = 'aclImdb_v1.tar.gz'
with tarfile.open(fname, "r:gz") as tar:
    # Prefer the "data" extraction filter, which refuses members that would be
    # written outside the destination. It is only available on interpreters that
    # ship extraction filters, hence the feature check before passing `filter`.
    if hasattr(tarfile, "data_filter"):
        tar.extractall(filter="data")
    else:
        tar.extractall()
# No explicit tar.close() is needed: leaving the with-block closes the archive.

In [12]:
import os
import random                                  

def load_training_data(
    data_directory: str = "aclImdb/train",
    split: float = 0.8,
    limit: int = 0
) -> tuple:
    """Load labelled reviews from disk and split them into two lists.

    Every ``*.txt`` file under ``<data_directory>/pos`` and
    ``<data_directory>/neg`` is read, ``<br />`` markers are replaced with blank
    lines, and reviews that are empty after stripping whitespace are skipped.
    The collected reviews are shuffled with the global ``random`` state.

    Args:
        data_directory: Directory containing the ``pos`` and ``neg`` sub-folders.
        split: Fraction of the (optionally limited) reviews placed in the first
            returned list; the remainder goes to the second.
        limit: Maximum number of reviews to keep after shuffling. ``0`` keeps all.

    Returns:
        A ``(first_part, second_part)`` tuple of lists. Each element is a
        ``(text, {"cats": {"pos": bool, "neg": bool}})`` pair.
    """
    reviews = []
    for label in ["pos", "neg"]:
        labeled_directory = os.path.join(data_directory, label)
        for file_name in os.listdir(labeled_directory):
            if not file_name.endswith(".txt"):
                continue
            # Decode explicitly as UTF-8 so the result does not depend on the
            # platform's default locale encoding.
            review_path = os.path.join(labeled_directory, file_name)
            with open(review_path, encoding="utf-8") as f:
                text = f.read()
            text = text.replace("<br />", "\n\n")
            if not text.strip():
                continue
            spacy_label = {
                "cats": {
                    "pos": label == "pos",
                    "neg": label == "neg",
                }
            }
            reviews.append((text, spacy_label))
    random.shuffle(reviews)

    if limit:
        reviews = reviews[:limit]
    # Keep the `split` fraction untouched and derive a separate cut index from it.
    split_index = int(len(reviews) * split)
    return reviews[:split_index], reviews[split_index:]

In [13]:
# Peek at the first (text, label) pair of the first returned list.
# limit=0 is falsy inside load_training_data, so no cap is applied to the reviews.
load_training_data(
    data_directory = "aclImdb/train",
    split = 0.8,
    limit = 0)[0][0]

("In Hazzard County, Georgia, cousins Bo and Luke Duke (Scott, Knoxville) and their cousin Daisy Duke (Jessica Simpson) run moonshine made by their Uncle Jesse (Willie Nelson) while avoiding the local authority, Boss Hog (Burt Reynolds). Their problems with the Boss are only beginning as they learn he's been plotting to strip mine the town for valuable ores found below it.\n\n\n\nI have never seen the TV show and after watching the movie, I'm not going to start any time soon. I like stupid comedies but this one didn't offer many laughs. It was a pretty dull picture with the first hour being really hard to sit through. The second part was a little better but this film was still a missed opportunity. The film focused on Bo and Luke way too much. The characters in general weren't very interesting and the actors portraying them didn't do a very good job.\n\n\n\nThe acting wasn't very good. I wasn't expecting it to be good in the first place but none of the leads were very funny. Seann Will

In [15]:
def evaluate_model(tokenizer, textcat, test_data: list) -> dict:
    """Score the text categorizer on labelled reviews.

    Each review is tokenized with ``tokenizer`` and streamed through
    ``textcat.pipe``. A review counts as predicted positive when its ``"pos"``
    score is at least 0.5.

    Args:
        tokenizer: Callable turning a review string into a document that
            ``textcat.pipe`` accepts.
        textcat: Object whose ``pipe(docs)`` yields documents exposing a
            ``cats`` mapping with a ``"pos"`` score.
        test_data: List of ``(text, {"cats": {"pos": bool, ...}})`` pairs.

    Returns:
        Dict with ``"precision"``, ``"recall"`` and ``"f-score"`` for the
        positive class. A metric whose denominator is zero is reported as 0.0,
        and an empty ``test_data`` yields 0.0 for all three.
    """
    if not test_data:
        return {"precision": 0.0, "recall": 0.0, "f-score": 0.0}

    reviews, labels = zip(*test_data)
    docs = (tokenizer(review) for review in reviews)

    # Exact integer counts; true negatives are not needed for these metrics.
    true_pos = false_pos = false_neg = 0
    for doc, label in zip(textcat.pipe(docs), labels):
        predicted_pos = doc.cats["pos"] >= 0.5
        if label["cats"]["pos"]:
            if predicted_pos:
                true_pos += 1
            else:
                false_neg += 1
        elif predicted_pos:
            false_pos += 1

    # Guard each division explicitly instead of seeding a counter with an
    # epsilon, which would slightly skew every reported value.
    predicted_total = true_pos + false_pos
    actual_total = true_pos + false_neg
    precision = true_pos / predicted_total if predicted_total else 0.0
    recall = true_pos / actual_total if actual_total else 0.0
    if precision + recall:
        f_score = 2 * precision * recall / (precision + recall)
    else:
        f_score = 0.0
    return {"precision": precision, "recall": recall, "f-score": f_score}

In [16]:
import os
import random
import spacy
from spacy.util import minibatch, compounding

def train_model(
    training_data: list,
    test_data: list,
    iterations: int = 20,
    output_dir: str = "model_artifacts") -> None:
    """Train a spaCy text categorizer on labelled reviews and save the pipeline.

    Loads ``en_core_web_sm``, adds a ``textcat`` component with ``pos`` and
    ``neg`` labels when the pipeline has none, and updates only that component.
    After every iteration the model is evaluated on ``test_data`` with the
    optimizer's averaged parameters, and one row (loss, precision, recall,
    F-score) is printed. The trained pipeline is then written to ``output_dir``.

    Args:
        training_data: ``(text, {"cats": {...}})`` pairs used for the updates.
            The list passed in is not reordered.
        test_data: Pairs in the same format, passed to ``evaluate_model``.
        iterations: Number of passes over ``training_data``.
        output_dir: Directory the trained pipeline is saved to.
    """
    # Shuffle a private copy each epoch so the caller's list keeps its order.
    training_data = list(training_data)

    # Build pipeline
    nlp = spacy.load("en_core_web_sm")
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"architecture": "simple_cnn"}
        )
        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe("textcat")

    textcat.add_label("pos")
    textcat.add_label("neg")

    # Train only textcat: every other pipe is disabled while updating.
    training_excluded_pipes = [
        pipe for pipe in nlp.pipe_names if pipe != "textcat"
    ]
    with nlp.disable_pipes(training_excluded_pipes):
        optimizer = nlp.begin_training()

        # Training loop
        print("Loss\t\tPrec.\tRec.\tF-score")
        # Generator of batch sizes, created once and shared across iterations.
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for _ in range(iterations):
            loss = {}
            random.shuffle(training_data)
            for batch in minibatch(training_data, size=batch_sizes):
                texts, labels = zip(*batch)
                nlp.update(
                    texts,
                    labels,
                    drop=0.2,
                    sgd=optimizer,
                    losses=loss
                )
            # Evaluate with the averaged parameters instead of the raw ones.
            with textcat.model.use_params(optimizer.averages):
                evaluation_results = evaluate_model(
                    tokenizer=nlp.tokenizer,
                    textcat=textcat,
                    test_data=test_data
                )
                print(
                    f"{loss['textcat']:9.6f}\t"
                    f"{evaluation_results['precision']:.3f}\t"
                    f"{evaluation_results['recall']:.3f}\t"
                    f"{evaluation_results['f-score']:.3f}"
                )

    # Save the pipeline with the averaged parameters applied.
    with nlp.use_params(optimizer.averages):
        nlp.to_disk(output_dir)

In [17]:
# Keep at most 20,000 shuffled reviews (default 80/20 split) and train for two iterations.
train, test = load_training_data(limit=20000)
# NOTE(review): train_model is annotated `-> None` and has no return statement,
# so `h` is always None here.
h = train_model(train, test, iterations=2)

Loss		Prec.	Rec.	F-score
16.189721	0.862	0.815	0.838
 0.174130	0.872	0.837	0.854
 0.081183	0.879	0.841	0.859
 0.073756	0.876	0.857	0.866
 0.061645	0.876	0.857	0.867
 0.053377	0.883	0.863	0.873


In [18]:
# Sample review text passed to test_model below; it still contains raw
# `<br /><br />` markup.
TEST_REVIEW = """
Transcendently beautiful in moments outside the office, it seems almost
sitcom-like in those scenes. When Toni Colette walks out and ponders
life silently, it's gorgeous.<br /><br />The movie doesn't seem to decide
whether it's slapstick, farce, magical realism, or drama, but the best of it
doesn't matter. (The worst is sort of tedious - like Office Space with less humor.)
"""

In [21]:
def test_model(input_data: str, model_dir: str = "model_artifacts"):
    """Classify one review with a saved pipeline and print the verdict.

    Args:
        input_data: Raw review text. ``<br />`` markers are replaced with blank
            lines before classification, mirroring what ``load_training_data``
            does to the training reviews; the text is printed unmodified.
        model_dir: Directory holding the saved spaCy pipeline.

    Prints the original text, the predicted polarity (``+`` when the ``pos``
    score is strictly greater than the ``neg`` score, otherwise ``-``) and the
    score of the predicted class.
    """
    loaded_model = spacy.load(model_dir)

    # Feed the model text normalized the same way as its training data.
    normalized_text = input_data.replace("<br />", "\n\n")
    parsed_text = loaded_model(normalized_text)

    if parsed_text.cats["pos"] > parsed_text.cats["neg"]:
        prediction = "+"
        score = parsed_text.cats["pos"]
    else:
        prediction = "-"
        score = parsed_text.cats["neg"]
    print(f"text: {input_data}\n\
Prediction: {prediction}\n\
Score: {score:.3f}")

In [22]:
# Classify the sample review with the pipeline stored in "model_artifacts",
# the directory train_model writes to.
test_model(input_data=TEST_REVIEW)

text: 
Transcendently beautiful in moments outside the office, it seems almost
sitcom-like in those scenes. When Toni Colette walks out and ponders
life silently, it's gorgeous.<br /><br />The movie doesn't seem to decide
whether it's slapstick, farce, magical realism, or drama, but the best of it
doesn't matter. (The worst is sort of tedious - like Office Space with less humor.)

Prediction: +
Score: 0.992
