### Second Year Project
## Natural Language Processing
Group 10 -  Fillip Due, Andreas Olsen, Louis Brandt, Emma Bisgaard



### Description

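This notebook builds a baseline sentiment classifier for music reviews: the reviews are loaded and tokenized, turned into a TF-IDF bag of words, and used to fit a logistic regression model whose predictions are inserted into the test set.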

## 1 - Data Preprocessing

In [None]:
import numpy as np
import pandas as pd
import gzip
import json

1.1 Loading the Data

In [None]:
# Location of the gzipped JSON file of each data split
PATH = {
    'train': '../data/music_reviews_train.json.gz',
    'dev': '../data/music_reviews_dev.json.gz',
    'test': '../data/music_reviews_test_masked.json.gz',
}

In [None]:
def load_data(path):
    '''
    Function to load the data from json.gz
    -----
    Takes in the argument:
        'path' - path to a gzipped file holding one JSON object per line;
                 takes the form PATH['(train, dev or test)']
    Returns
        dic - nested dictionary mapping the 0-based line number to the
              dictionary parsed from that line
    '''
    dic = {}
    # the context manager closes the file even if a line fails to parse
    with gzip.open(path) as file:
        for i, line in enumerate(file):
            # json.loads already returns a new dict, so no per-key copy is needed
            dic[i] = json.loads(line)
    return dic

In [None]:
# Load every split into its own nested dictionary (train, then dev, then test)
train_data, dev_data, test_data = [
    load_data(PATH[split]) for split in ('train', 'dev', 'test')
]

1.2 Data Cleaning

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
def sent_encode(sent):
    '''
    Helper function turning a sentiment label into its binary encoding
    ------
    Takes in
        'sent' - sentiment label, expected to be 'positive' or 'negative'
    Returns
        1 for 'positive'
        0 for 'negative'
        '_' for anything else
    '''
    for label, code in (('positive', 1), ('negative', 0)):
        if sent == label:
            return code
    # neither of the two known labels
    return '_'

In [None]:
def clean(data):
    '''
    Function to clean the data
    -----
    Takes in data set from load_data()
        'data' - nested dictionary; every entry must have a 'sentiment' and
                 may have a 'summary' and/or a 'reviewText'
    Returns two lists
        cleaned - X list; one list of tokens per entry
        ys - y list; the sentiment of each entry encoded by sent_encode()
    '''
    cleaned = []
    ys = []
    for entry in data.values():
        # some data does not have a summary and/or a review text:
        # join whatever is present, summary first ('' when both are missing)
        parts = (entry.get('summary'), entry.get('reviewText'))
        text = ' '.join(part for part in parts if part is not None)

        # note from the original code: the tokenizer splits gotta into got ta
        cleaned.append(word_tokenize(text))

        # encode sentiment
        ys.append(sent_encode(entry['sentiment']))

    return cleaned, ys

In [None]:
# Tokenize every split; each call returns (token lists, encoded sentiments)
train_clean, y_train = clean(train_data)
dev_clean, y_dev = clean(dev_data)
# the encoded sentiments of the test split are discarded
test_clean, _ = clean(test_data)

In [None]:
# Merge the dev split into the training split for cross validation.
# Both lists are extended in place, so re-running this cell appends the
# dev data a second time.
# Original author's notes (not verifiable from this notebook):
#   id for test set < 100000, id for dev set > 100000
train_clean.extend(dev_clean)
y_train.extend(y_dev)

In [None]:
# most common tokens throughout data
from collections import Counter
from itertools import chain

# Count tokens straight from the token lists of both splits; this avoids
# building a concatenated copy of the data and a flat list of every token.
c = Counter(chain.from_iterable(chain(train_clean, test_clean)))
c.most_common()

1.3 Generate Vocab, Corpus

In [None]:
def get_vocab_corpus(dataset):
    '''
    Function computing vocabulary and corpus for a dataset
    -----
    Takes a cleaned dataset
        dataset - X list; one list of tokens per entry
    Returns
        vocab - set of unique tokens in dataset
        corpus - list of strings; the tokens of each entry joined by single
                 spaces, except that . ! ? , ; : attach to the preceding text
    '''
    no_space_before = ('.', '!', '?', ',', ';', ':')
    vocab = set()
    corpus = []
    for tokens in dataset:
        pieces = []
        for tok in tokens:
            vocab.add(tok)
            pieces.append(tok if tok in no_space_before else ' ' + tok)
        # drop the space that was put in front of the first token
        corpus.append(''.join(pieces).lstrip())
    return vocab, corpus

In [None]:
# Build the vocabulary (set of tokens) and the corpus (one string per entry) of each split
train_vocabulary, train_corpus = get_vocab_corpus(train_clean)
test_vocabulary, test_corpus = get_vocab_corpus(test_clean) # test vocab not used

1.4 Bag of Words

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def get_bow(vocab, corp):
    '''
    Function building a TF-IDF (Term Frequency — Inverse Document Frequency) bag of words
    -----
    Takes vocab and corpus
        vocab - set of unique words; used as the fixed vocabulary of the vectorizer
        corp - list of strings
    Returns bag of words
        bow - 2d sparse matrix; input to model
    Note: every call fits a new vectorizer on corp, so the weights of the
    returned matrix are derived from corp itself.
    '''
    tfidf = TfidfVectorizer(vocabulary=list(vocab))
    return tfidf.fit_transform(corp)

In [None]:
# Fit the TF-IDF weights on the training corpus only and reuse the fitted
# vectorizer for the test corpus, so both matrices share the same columns
# and the same IDF weights. Calling get_bow() on the test corpus would
# instead refit the IDF statistics on the test documents.
tfidf_vectorizer = TfidfVectorizer(vocabulary=list(train_vocabulary))
train_bow = tfidf_vectorizer.fit_transform(train_corpus)
test_bow = tfidf_vectorizer.transform(test_corpus)

### 1.5 Generating Difficult Cases

In [None]:
# Phase 2 

## 2 - Baseline Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

2.1 Fit training data

In [None]:
# Grid search over the logistic regression hyperparameters C and max_iter
parameters = {'max_iter': [100, 500], 'C': [2, 3, 4, 5]}
lr = LogisticRegression()
grid = GridSearchCV(estimator=lr, param_grid=parameters)
grid.fit(train_bow, y_train)
grid.best_score_

2.3 Cross Validation

In [None]:
# Cross-validate the best estimator found by the grid search on the training data
baseline_model = grid.best_estimator_
scores = cross_val_score(baseline_model, train_bow, y_train)
scores

2.4 Test Prediction and Formatting

In [None]:
# Predict the encoded sentiment for every row of the test bag of words
baseline_predictions = baseline_model.predict(test_bow)

In [None]:
def pred_test(test, ys):
    '''
    Function to insert predictions into test data
    -----
    Takes
        'test' - nested dictionary; modified in place
        'ys' - indexable sequence of encoded predictions, read in the
               iteration order of test
    Returns
        test - the same dictionary, with the 'sentiment' of every entry set
               to reverse_encode() of its prediction
    '''
    for position, key in enumerate(test):
        test[key]['sentiment'] = reverse_encode(ys[position])
    return test

In [None]:
def reverse_encode(sent):
    '''
    Helper function turning a binary encoding back into a sentiment label
    ------
    Takes in
        'sent' - encoded sentiment
    Returns
        'positive' for 1
        'negative' for 0
        None for anything else
    '''
    for code, label in ((1, 'positive'), (0, 'negative')):
        if sent == code:
            return label
    return None

In [None]:
# pred_test() writes the predictions into test_data itself and returns that
# same dictionary, so finished_test_data and test_data refer to one object
finished_test_data = pred_test(test_data,baseline_predictions)
finished_test_data

2.5 Write predictions to file

In [None]:
# test_json=[json.dumps(i)+'\n' for i in finished_test_data.values()]
# with open ('music_reviews_test.json', 'w') as file:
#     file.writelines(test_json)