### Second Year Project
## Natural Language Processing
Group 10 -  Fillip Due, Andreas Olsen, Louis Brandt, Emma Bisgaard



### Description

Describe siUU

## 1 - Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
import gzip
import json
import copy

### 1.1 - Loading the Data

In [2]:
# Locations of the gzipped review files (one JSON object per line), keyed by data split.
PATH = {'train':'../data/music_reviews_train.json.gz',
        'dev': '../data/music_reviews_dev.json.gz',
        'test': '../data/music_reviews_test_masked.json.gz'}

In [3]:
def load_data(path):
    '''
    Function to load the data from json.gz
    -----
    Takes in the argument: 
        'path' - takes the form PATH['(train, dev or test)']
    Returns
        dic - nested dictionary {line number: {field: value}}, one entry
              per line of the file (every line must hold one JSON object)
    '''
    dic = {}
    # the context manager closes the gzip handle even if a line fails to parse
    with gzip.open(path) as handle:
        for i, line in enumerate(handle):
            review_data = json.loads(line)
            dic[i] = {key: value for key, value in review_data.items()}
    return dic

In [4]:
# Load every split into a nested dictionary {line number: review fields}.
train_data = load_data(PATH['train'])
dev_data = load_data(PATH['dev'])
test_data = load_data(PATH['test'])

### 1.2 - Data Cleaning

In [5]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [6]:
def sent_encode(sent):
    '''
    Helper function mapping a sentiment label to its numeric code
    ------
    Takes in
        'sent' - sentiment label, expected 'positive' or 'negative'
    Returns
        1 for 'positive'
        0 for 'negative'
        '_' for any other value
    '''
    for label, code in (('positive', 1), ('negative', 0)):
        if sent == label:
            return code
    return '_'

In [7]:
def clean(data):
    '''
    Function to clean the data
    -----
    Takes in data set from load_data()
        'data' - nested dictionary {index: review fields}
    Returns two lists (same order as the entries of 'data')
        cleaned - X list of token lists; each is the lower-cased,
                  tokenised "summary + ' ' + reviewText" of one review
        ys - y list of sentiments encoded with sent_encode()
    Raises KeyError when an entry has no 'sentiment' field
    '''
    cleaned = []
    ys = []
    for record in data.values():
        # some data does not have a review text and/or a summary:
        # keep whichever parts exist, summary first
        parts = (record.get('summary'), record.get('reviewText'))
        text = ' '.join(part for part in parts if part is not None)

        # splits gotta into got ta
        cleaned.append(word_tokenize(text.lower()))

        # encode sentiment
        ys.append(sent_encode(record['sentiment']))

    return cleaned, ys

In [8]:
# Tokenise every split; the sentiment codes returned for the test split are discarded.
train_clean, y_train = clean(train_data)
dev_clean, y_dev = clean(dev_data)
test_clean, _ = clean(test_data)

In [9]:
# combine dev and train for cross validation
# NOTE: += extends train_clean / y_train in place, so running this cell a
# second time appends the dev split again
train_clean += dev_clean    # id for test set < 100000 
y_train += y_dev            # id for dev set > 100000

In [10]:
# token frequencies over the training and test reviews, most frequent first
from collections import Counter
a = train_clean + test_clean
l = [token for review in a for token in review]  # flatten the token lists
c = Counter(l)
c.most_common()

[('.', 331725),
 ('the', 273443),
 (',', 221454),
 ('i', 171437),
 ('and', 150827),
 ('a', 142351),
 ('to', 139644),
 ('of', 126966),
 ('it', 125572),
 ('this', 125010),
 ('is', 111526),
 ('!', 90968),
 ('song', 63223),
 ('in', 63127),
 ('that', 60378),
 ("'s", 57307),
 ('for', 54303),
 ('not', 51610),
 ('you', 51037),
 ('on', 49054),
 ("''", 46908),
 ('was', 45938),
 ('but', 44202),
 ("n't", 42440),
 ('with', 40876),
 ('album', 40159),
 ('music', 39751),
 ('``', 39574),
 ('my', 39005),
 ('great', 35638),
 ('one', 35035),
 ('have', 33489),
 ('as', 32834),
 ('like', 32786),
 ('love', 32272),
 ('are', 31905),
 ('stars', 26957),
 ('cd', 26407),
 ('from', 25939),
 ('songs', 25866),
 ('be', 25120),
 ('all', 24916),
 ('do', 23970),
 ('just', 23915),
 ('good', 23881),
 ('so', 23479),
 (')', 23050),
 ('(', 21484),
 ('...', 21359),
 ('me', 21321),
 ('they', 20617),
 ('if', 20578),
 ('five', 19783),
 ('what', 18973),
 ('his', 18899),
 ('has', 18583),
 ('at', 18477),
 ('?', 18172),
 ('very', 1811

### 1.3 Generating Difficult Cases

In [11]:
import random

1.3.1 - Positive Negative Synonyms

In [12]:
# Word lists used by gen_synonyms(): a token found in one of the lists is
# replaced by a random member of that same list.
pos_adj = ['good', 'great', 'excellent', 'amazing', 'extraordinary', 'beautiful', 'fantastic', 'nice', 'incredible', 'exceptional', 'awesome', 'perfect', 'fun', 'happy', 'adorable', 'brilliant', 'exciting', 'sweet', 'wonderful','stupendous','helpful']
neg_adj = ['awful', 'bad', 'horrible', 'weird', 'rough', 'lousy', 'unhappy', 'average', 'difficult', 'poor', 'sad', 'frustrating', 'hard', 'lame', 'nasty', 'annoying', 'boring', 'creepy', 'dreadful', 'ridiculous', 'terrible', 'ugly', 'unpleasant','shame']

In [52]:
def gen_synonyms(data, no_instances=300, labels=None):
    '''
    Function which generates data where positive/negative adjectives are replaced with synonyms
    -------
    Takes in
        data - cleaned data, list of token lists
        no_instances - maximum number of cases generated (default 300)
        labels - sentiments aligned with 'data'; defaults to the global y_train
    Returns three lists of equal length
        new - copies of the affected token lists with adjectives replaced
        indexes - position in 'data' of each generated case
        ys - sentiment of each generated case (unchanged by the replacement)
    Fewer than 'no_instances' cases are returned when 'data' does not hold
    enough reviews containing a listed adjective.
    '''
    if labels is None:
        labels = y_train
    new = []
    indexes = []
    ys = []
    # walking the data directly (instead of an open-ended counter) stops
    # cleanly at the end of the data rather than raising IndexError
    for i, tokens in enumerate(data):
        if len(new) >= no_instances:
            break
        text = list(tokens)  # work on a copy, the input stays untouched
        changed = False
        for idx, token in enumerate(text):
            if token in pos_adj:
                text[idx] = random.choice(pos_adj)
                changed = True
            if token in neg_adj:
                text[idx] = random.choice(neg_adj)
                changed = True
        if changed:
            new.append(text)
            indexes.append(i)
            ys.append(labels[i])

    return new, indexes, ys


In [53]:
synonyms, indexes_synonyms, y_synonyms  = gen_synonyms(train_clean)

1.3.2 - Not Variation

In [15]:
def sent_swap(sent):
    '''
    Helper function flipping an encoded sentiment
    ------
    Takes in encoded sentiment
        'sent' - 1 or 0
    Returns the opposite code (1 -> 0, 0 -> 1); None for any other value
    '''
    for code, flipped in ((1, 0), (0, 1)):
        if sent == code:
            return flipped
    return None

In [51]:
def gen_nots(data, no_instances=300, labels=None):
    '''
    Function which generates data where nots are removed
    -------
    Takes in
        data - cleaned data, list of token lists
        no_instances - maximum number of cases generated (default 300)
        labels - sentiments aligned with 'data'; defaults to the global y_train
    Returns three lists of equal length
        new - copies of the affected token lists without negation tokens
        indexes - position in 'data' of each generated case
        ys - sentiment of each generated case, flipped with sent_swap()
    Fewer than 'no_instances' cases are returned when 'data' does not hold
    enough reviews containing a negation.
    '''
    if labels is None:
        labels = y_train
    negations = ['not','n\'t','not\'']
    new = []
    indexes= []
    ys = []
    # walking the data directly (instead of an open-ended counter) stops
    # cleanly at the end of the data rather than raising IndexError
    for i, tokens in enumerate(data):
        if len(new) >= no_instances:
            break
        # drop the negation tokens entirely; blanking them to '' would leave
        # empty tokens that turn into double spaces once the text is joined
        text = [token for token in tokens if token not in negations]
        if len(text) != len(tokens):
            new.append(text)
            indexes.append(i)
            ys.append(sent_swap(labels[i]))
    return new, indexes, ys 

 

In [54]:
nots, indexes_nots, y_nots = gen_nots(train_clean)

1.3.3 - Named Entity Recognition

In [18]:
import spacy
import random
from spacy import displacy
from collections import Counter
import en_core_web_sm

In [63]:
def gen_NER(data:list, no_instances, labels=None, max_scan=2000):
    '''
    Function which generates data where person names are replaced with made-up names
    -------
    Takes in
        data - cleaned data, list of token lists
        no_instances - maximum number of cases generated
        labels - sentiments aligned with 'data'; defaults to the global y_train
        max_scan - only the first 'max_scan' reviews are searched (default 2000)
    Returns three lists of equal length
        new - token lists (re-split on whitespace) with every PERSON entity replaced
        indexes - position in 'data' of each generated case
        ys - sentiment of each generated case (unchanged by the replacement)
    Each review is joined into one string and labelled with the spaCy model
    en_core_web_sm (needs to be installed); entities tagged PERSON are exchanged
    for a random entry of the name list below.
    https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da
    '''
    if labels is None:
        labels = y_train

    nlp = en_core_web_sm.load()

    ## find reviews with persons in them, to actually impact the reviews.
    ## The entity texts are kept so the model only runs once per review.
    reviews_to_change = {}  # index -> (joined review, [PERSON entity texts])
    for i in range(min(max_scan, len(data))):  # never index past the data
        if len(reviews_to_change) >= no_instances:
            break
        sequence = ' '.join(data[i])
        doc = nlp(sequence) # generate sequence labeling and entity extraction
        persons = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
        if persons:
            reviews_to_change[i] = (sequence, persons)

    ## change persons to ridiculous/random names to confuse the model consisting purposefully of adjectives, nouns and proper nouns
    names = ['Louis Top Boy', 'Andreas Bad Energy', 'Good will Bisgaard', 'Filip Fine', 'rolling peppers', 'The Animals', 'Black Sabbath', 'Jens Jensen', 'excellent Grasshoppers', 'butter soft harlekins', 'Basil', 'filet o\' fish']
    indexes = []
    new = []
    ys = []
    for i, (sequence, persons) in reviews_to_change.items():
        for person in persons:
            # str.replace swaps every occurrence of the entity text
            sequence = sequence.replace(person, random.choice(names))
        indexes.append(i)
        new.append(sequence.split())
        ys.append(labels[i])
    return new, indexes, ys

In [64]:
ners, indexes_ners, y_ners = gen_NER(train_clean,300)

### 1.4 - Generate Vocab, Corpus

In [21]:
def get_vocab_corpus(dataset):
    '''
    Function computing vocabulary and corpus for a dataset
    -----
    Takes a cleaned dataset
        dataset - X list of token lists
    Returns
        vocab - set of unique tokens in dataset
        corpus - list of strings, one per token list; tokens are joined with
                 single spaces, except that . ! ? , ; : attach directly to
                 the preceding text
    '''
    attached = ['.','!','?',',',';',':']
    vocab = set()
    corpus = []
    for text in dataset:
        pieces = []
        for token in text:
            vocab.add(token)
            pieces.append(token if token in attached else ' ' + token)
        # the first word carries a leading space that has to go
        corpus.append(''.join(pieces).lstrip())
    return vocab, corpus

In [22]:
train_vocabulary, train_corpus = get_vocab_corpus(train_clean)
test_vocabulary, test_corpus = get_vocab_corpus(test_clean) # test vocab not used

### 1.5 - Bag of Words

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
def get_bow(vocab, corp, vectorizer=None, return_vectorizer=False):
    '''
    Function returning sparse matrix of Term Frequency — Inverse Document Frequencies
    -----
    Takes vocab and corpus, working with two lists
        vocab - set of unique words
        corpus - list of strings
        vectorizer - optional, already fitted TfidfVectorizer; when given,
                     'corp' is only transformed with it ('vocab' is ignored)
                     so the IDF weights learned earlier are reused
        return_vectorizer - when True, also return the vectorizer that was used
    Returns bag of words
        bow - 2d matrix; input to model
        (bow, vectorizer) when return_vectorizer is True

    Without 'vectorizer' a new TfidfVectorizer is fitted on 'corp' itself.
    To weight a test corpus with the training IDF values, fit once on the
    training corpus with return_vectorizer=True and pass that vectorizer in
    for the test corpus.
    '''
    if vectorizer is None:
        vectorizer = TfidfVectorizer(vocabulary= list(vocab))
        bow = vectorizer.fit_transform(corp)
    else:
        # reuse the fitted IDF weights instead of re-learning them on 'corp'
        bow = vectorizer.transform(corp)
    if return_vectorizer:
        return bow, vectorizer
    return bow 

In [25]:
# Both matrices share the training vocabulary, so their columns line up.
# NOTE(review): get_bow() calls fit_transform() on the corpus it is given, so
# the IDF weights of test_bow are learned from test_corpus rather than taken
# from the training fit — confirm this is intended
train_bow = get_bow(train_vocabulary,train_corpus)
test_bow = get_bow(train_vocabulary,test_corpus)

## 2 - Baseline Model

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

### 2.1 Fit training data

In [27]:
# Grid search over every combination of max_iter and C for logistic regression
lr = LogisticRegression()
parameters = {'max_iter':[100,500], 'C': [2,3,4,5]}
grid = GridSearchCV(lr, parameters)
grid.fit(train_bow, y_train)
# mean cross-validated score of the best parameter combination
grid.best_score_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.9334818181818182

### 2.2 Cross Validation

In [28]:
# Score the tuned model with cross validation.
# NOTE(review): this reuses the train_bow / y_train that the grid search was
# fitted on, so the scores may be optimistic — confirm
baseline_model = grid.best_estimator_
scores = cross_val_score(estimator= baseline_model,X= train_bow,y= y_train)
scores

array([0.93054545, 0.92636364, 0.93686364, 0.93572727, 0.93790909])

### 2.4 - Test Prediction and Formatting

In [29]:
baseline_predictions = baseline_model.predict(test_bow)

In [30]:
def pred_test(test, ys):
    '''
    Function to insert predictions into test data
    -----
    Takes in
        test - nested dictionary from load_data(); modified in place
        ys - encoded predictions, in the same order as the entries of 'test'
    Returns the same 'test' dictionary with every entry's 'sentiment'
    set to the decoded prediction
    '''
    for position, key in enumerate(test):
        test[key]['sentiment'] = reverse_encode(ys[position])
    return test

In [31]:
def reverse_encode(sent):
    '''
    Helper function to decode sentiment
    ------
    Takes in encoded sentiment
        'sent' - 1 or 0
    Returns 'positive' for 1, 'negative' for 0, None for any other value
    '''
    if sent == 1:
        return 'positive'
    return 'negative' if sent == 0 else None

In [32]:
finished_test_data = pred_test(test_data,baseline_predictions)
finished_test_data

{0: {'verified': True,
  'reviewTime': '10 24, 2017',
  'reviewerID': 'A2HAJB8L9NVYTZ',
  'asin': 'B007Y1AMHE',
  'reviewText': 'ok',
  'summary': 'ok',
  'unixReviewTime': 1508803200,
  'sentiment': 'negative',
  'id': 0},
 1: {'verified': True,
  'reviewTime': '04 8, 2015',
  'reviewerID': 'AD78RH9JWBDEU',
  'asin': 'B007Y1AMHE',
  'reviewText': 'Its awesome',
  'summary': 'love it',
  'unixReviewTime': 1428451200,
  'sentiment': 'positive',
  'id': 1},
 2: {'verified': True,
  'reviewTime': '03 3, 2015',
  'reviewerID': 'A5UNQFT0JQ8B',
  'asin': 'B007Y1AMHE',
  'reviewText': 'great, really good!',
  'summary': 'Five Stars',
  'unixReviewTime': 1425340800,
  'sentiment': 'positive',
  'id': 2},
 3: {'verified': True,
  'reviewTime': '05 31, 2014',
  'reviewerID': 'A1NPTQTAYO51XW',
  'asin': 'B007Y1AMHE',
  'reviewText': 'THANK THEE LORD FOR THIS YOUNG MAN WISDOM, I PRAY THAT THEE YOUNGER GENERATION WILL UNDERSTAND & LIVE AS HE!!!',
  'summary': 'LISTEN & FOLLOW!!!',
  'unixReviewTime

### 2.5 - Write predictions to file

In [33]:
def write_json(dict,filename): 
    '''
    Function appending the values of a dictionary to a file, one JSON document per line
    -----
    Takes in
        dict - dictionary whose values are JSON serialisable
        filename - file to append to (created when it does not exist)
    '''
    # serialise everything first, so a failure leaves the file untouched
    payload = ''.join(json.dumps(record) + '\n' for record in dict.values())
    with open(filename, 'a') as file:
        file.write(payload)

In [34]:
def insert_text(data, new, indexes, cat, ys, is_nots=False):
    '''
    Function building review entries for generated cases
    -----
    Takes in
        data - nested dictionary from load_data(); left untouched
        new - generated token lists
        indexes - key in 'data' of the review each token list was made from
        cat - value stored under 'category' for every returned entry
        ys - encoded sentiments aligned with 'new'; only read when is_nots
        is_nots - when True, 'sentiment' is overwritten with the decoded ys
    Returns
        d - dictionary holding copies of the selected entries only, with
            'reviewText' set to the space-joined tokens
    '''
    # copy only the selected entries instead of deep-copying the whole dataset
    wanted = set(indexes)
    d = {key: copy.deepcopy(val) for key, val in data.items() if key in wanted}
    for iidx, idx in enumerate(indexes):
        entry = d[idx]
        entry['reviewText'] = ' '.join([str(tok) for tok in new[iidx]])
        entry['category'] = cat
        if is_nots:
            entry['sentiment'] = reverse_encode(ys[iidx])
    return d

In [65]:
train_syn = insert_text(train_data,synonyms,indexes_synonyms,'synonym', y_synonyms)

In [66]:
train_nots = insert_text(train_data, nots, indexes_nots, 'negation', y_nots, is_nots=True)

In [67]:
train_ners = insert_text(train_data, ners, indexes_ners, 'ner', y_ners)

In [68]:
# write_json(train_nots,'group10.json')

In [69]:
# write_json(train_syn,'group10.json')

In [70]:
# write_json(train_ners,'group10.json')