In [None]:
from google.colab import drive
import os

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
os.chdir("drive/MyDrive/data/hw09")

In [None]:
import numpy as np
import nltk
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from keras.models import Sequential 
from keras.layers import Dropout, Dense
from keras import callbacks
nltk.download('punkt')
nltk.download('stopwords')
import gensim
from gensim.models import KeyedVectors
import io
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def data_reader(filename:str):
    """Read the movie reviews with the given filename.

    The file is parsed as comma-separated values with a header row and must
    provide a ``review`` column and a ``sentiment`` column.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A tuple ``(reviews, labels)``: ``reviews`` is a list with the values
        of the ``review`` column and ``labels`` is a NumPy array holding 0
        where the sentiment equals ``"negative"`` and 1 otherwise.
    """
    df = pd.read_csv(filename, sep=",", header=0)
    # Series.apply returns a new Series instead of modifying the column, so the
    # result has to be kept; otherwise the raw string labels would be returned.
    labels = df["sentiment"].apply(lambda x: 0 if x == "negative" else 1)

    return df["review"].values.tolist(), labels.to_numpy()

In [None]:
def load_fast_text_embeddings(filename: str):
    """Loads the FastText embeddings from the file with the given filename.

    The first line of the file is a header made of two integers; every
    following line holds a token followed by its space-separated vector
    components.

    Args:
        filename: Path of the text-format embedding file.

    Returns:
        A tuple ``(data, dim)`` where ``data`` maps each token to its vector
        (a list of floats) and ``dim`` is the length of the first vector, or
        the second header value when the file contains no vectors.
    """
    data = {}
    # The context manager closes the file even if parsing fails part-way.
    with io.open(filename, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        _, header_dim = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = [float(x) for x in tokens[1:]]

    # dimension of the vector; an empty vocabulary falls back to the header value
    dim = len(next(iter(data.values()))) if data else header_dim
    return (data, dim)

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

def tokenize_sentences(sentences: [str]):
    """Tokenizes the given sentences.

    Every sentence is lower-cased and split with ``word_tokenize``; tokens
    that are listed by ``stopwords.words()`` or that are contained in
    ``string.punctuation`` are dropped.

    Args:
        sentences: The sentences (strings) to tokenize.

    Returns:
        A list with one list of remaining tokens per input sentence, in the
        same order as the input.
    """
    # Build the stop-word collection a single time: calling stopwords.words()
    # for every token is needlessly slow, and a set gives constant-time lookups
    # instead of a scan over the whole list.
    stop_words = set(stopwords.words())
    result = []
    for sen in sentences:
        tokens = word_tokenize(sen.lower())
        result.append([
            token for token in tokens
            if token not in stop_words and token not in string.punctuation
        ])
    return result

In [None]:
def map_to_vectors(tokenized_sentences: [[str]], embeddings: dict = None):
    """Maps the given tokenized sentences to their averaged word vectors.

    Args:
        tokenized_sentences: One list of tokens per sentence.
        embeddings: Optional mapping from token to vector. When omitted, the
            vectors are loaded from "embeddings/wiki-news-300d-10k.vec".

    Returns:
        A NumPy array of shape ``(len(tokenized_sentences), dim)``. Row ``i``
        is the mean of the vectors of the tokens of sentence ``i`` that are
        present in the embeddings; a sentence without any such token is
        mapped to the zero vector.
    """
    if embeddings is None:
        # The loader returns a (token -> vector, dimension) tuple; it must be
        # unpacked, indexing the tuple itself with a word raises TypeError.
        embeddings, dim = load_fast_text_embeddings("embeddings/wiki-news-300d-10k.vec")
    else:
        dim = len(next(iter(embeddings.values()))) if embeddings else 0

    result = np.zeros((len(tokenized_sentences), dim))
    for i, sentence in enumerate(tokenized_sentences):
        # Skip out-of-vocabulary tokens instead of failing with KeyError.
        known_vectors = [embeddings[word] for word in sentence if word in embeddings]
        # Averaging an empty list would yield NaN, so such rows stay all-zero.
        if known_vectors:
            result[i] = np.mean(known_vectors, axis=0)
    return result

In [None]:
# define a model
def MLP(input_shape: np.array):
    """Build the (uncompiled) multi-layer perceptron.

    The layers are stacked in this order: Dense(100, relu), Dropout(0.2),
    Dense(20, relu), Dropout(0.2), Dense(2, softmax).

    Args:
        input_shape: Shape of a single input sample; it is handed to the
            first Dense layer.

    Returns:
        The assembled ``Sequential`` model.
    """
    network = Sequential()
    layer_stack = (
        Dense(100, activation='relu', input_shape=input_shape),
        Dropout(0.2),
        Dense(20, activation='relu'),
        Dropout(0.2),
        Dense(2, activation='softmax'),
    )
    for layer in layer_stack:
        network.add(layer)
    return network

In [None]:
# Load the reviews together with their sentiment labels.
reviews,sentiments=data_reader('data/IMDB.csv')
# convert the labels to one hot vectors
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so try the new name first and fall back for older releases.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)
# The encoder expects a 2-D input: one row per sample, one column per feature.
sentiments=sentiments.reshape(len(sentiments),1)
sentiments = onehot_encoder.fit_transform(sentiments)

In [None]:
# Load the embeddings: token2vector maps token -> vector, dimensions is the vector length.
token2vector, dimensions = load_fast_text_embeddings("embeddings/wiki-news-300d-10k.vec")

In [None]:
tokenized_reviews = tokenize_sentences(reviews)
# Visual inspection for question (c): print a few tokenized reviews to judge
# whether the tokenization looks reasonable.
for heading, review_idx in (('First Review', 0), ('Second Review', 5), ('Third Review', 100)):
    print(heading)
    print(tokenized_reviews[review_idx])

(1 Point) point for implementing tokenize_sentences

In [None]:
# should be written by tutors
point_for_c = 

In [None]:
embedded_reviews = map_to_vectors(tokenized_reviews)
#TODO
# split the vectorized reviews into train-, and testset
# The ratio gets its own name: calling it `train_test_split` would overwrite the
# scikit-learn function of the same name and make the call below fail.
train_fraction = 0.8
# train_test_split returns the parts grouped per input array, i.e.
# (train_x, test_x, train_y, test_y) -- not (train_x, train_y, test_x, test_y).
train_x,test_x,train_y,test_y = train_test_split(embedded_reviews,sentiments, test_size = 1-train_fraction)
#ENDTODO


best_model = "model/best_model.ckpt"
# train the model and save the best one on dev set
# metrics=['accuracy'] is logged as 'accuracy'/'val_accuracy' by current Keras
# releases; monitoring the old 'val_acc' name would make the callback skip saving.
cp_callback = callbacks.ModelCheckpoint(filepath=best_model, verbose=1, save_weights_only=True, monitor='val_accuracy', save_best_only=True)
model = MLP(input_shape=(dimensions,))
# use the binary cross entropy to measure the errors
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.fit(train_x, train_y, epochs=20, batch_size=20, callbacks=[cp_callback], validation_split=0.2)

# restore the weights of the best checkpoint before evaluating
model.load_weights(best_model)
# calculate the accuracy on test set
_,acc = model.evaluate(test_x,test_y)
print('accuracy on test set:', acc)

predictions=model.predict(test_x)
# convert predictions to one hot vectors
predictions_oneHot = np.where(predictions > 0.5, 1, 0)
print(predictions_oneHot)

(e) (4 Points) Check the function `MLP(input_shape: np.array)` that defines our multi-layer perceptron (MLP).

(i)(1 Point) How many layers does this model have (including the input and the output layer)?
Answer:

In [None]:
# should be written by tutors
point_for_e_i = 3

(ii) (1 Point) What is the size of the matrix that connects the input layer and the first hidden layer? 
Answer: [100,300]


In [None]:
# should be written by tutors
point_for_e_ii = 

(iii) (1 Point) How many units/neurons are in the output layer? Why? 
Answer: 2

In [None]:
# should be written by tutors
point_for_e_iii = 

(iv) (1 Point)  What is the meaning of Dropout? 
Answer: Dropout refers to ignoring a randomly chosen set of units (neurons) during the training phase.

In [None]:
# should be written by tutors
point_for_e_iv = 

In [None]:
# the following codes are used for grading, students can simply ignore them, but please don't change/delete them
# any modifications to the code are seen to be cheating
import csv
assertions = dict()
with open('../assertions.csv',newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        assertions[row['assertion sequence']] = row['content']

total_points = 0
#==========================================================
# sanity check for question (a)
#==========================================================
try:
    assert(assertions['a_1'] in reviews)
    assert(sentiments[reviews.index(assertions['a_1'])][1]==1)
    total_points += 1
except AssertionError:
    print('errors in your implementation for question (a)')

try:
    assert(total_points <= 1)
except AssertionError:
    print('errors in calculating the points for question (a)')

#==========================================================
# sanity check for question (b)
#==========================================================
try:
    assert(dimensions == int(assertions['b_1']))
    assert(token2vector['article'][0] == float(assertions['b_2']))
    assert(token2vector['state'][2] == float(assertions['b_3']))
    total_points += 1
except AssertionError:
    print('errors in your implementation for question (b)')


try:
    assert(total_points <= 2)
except AssertionError:
    print('errors in calculating the points for question (b)')
    
#==========================================================
# sanity check for question (c)
#==========================================================
total_points += point_for_c 
try:
    assert(total_points <= 3)
except AssertionError:
    print('errors in calculating the points for question (c)')
    
#==========================================================
# sanity check for question (d)
#==========================================================
try:
    assert(len(embedded_reviews[0]) == int(assertions['d_1']))
    assert(len(embedded_reviews[100]) == int(assertions['d_2']))
    assert(len(embedded_reviews[600]) == int(assertions['d_3']))
    total_points += 1
except AssertionError:
    print('errors in your implementation for question (d)')

try:
    assert(total_points <= 4)
except AssertionError:
    print('errors in calculating the points for question (d)')

points_for_first_four_questions = total_points

#==========================================================
# sanity check for question e(i)
#==========================================================

total_points += point_for_e_i 
try:
    assert(total_points <= 5)
except AssertionError:
    print('errors in calculating the points for question e(i)')

#==========================================================
# sanity check for question e(ii)
#==========================================================
total_points += point_for_e_ii
try:
    assert(total_points <= 6)
except AssertionError:
    print('errors in calculating the points for question e(ii)')

#==========================================================
# sanity check for question e(iii)
#==========================================================
total_points += point_for_e_iii 
try:
    assert(total_points <= 7)
except AssertionError:
    print('errors in calculating the points for question e(iii)')

#==========================================================
# sanity check for question e(iv)
#==========================================================    
total_points += point_for_e_iv 
try:
    assert(total_points <= 8)
except AssertionError:
    print('errors in calculating the points for question e(iv)')


#==========================================================
# sanity check for question f
#==========================================================
point_for_f = 0
try:
    assert(len(train_x) > len(test_x))
    assert(len(train_y) > len(test_y))
    assert(len(train_x)+len(test_x)==len(embedded_reviews))
    assert(len(train_y)+len(test_y)==len(sentiments))
    point_for_f = 2
    total_points += point_for_f
except AssertionError:
    print('errors in your implementation for question (f)')

try:
    assert(total_points <= 10)
except AssertionError:
    print('errors in calculating the points for question (f)')

  

In [None]:
# Report the points per question group and the total accumulated by the grading cell above.
print("points for the first four questions: ",points_for_first_four_questions)
print("point for e(i): ", point_for_e_i)
print("point for e(ii): ", point_for_e_ii)
print("point for e(iii): ", point_for_e_iii)
print("point for e(iv): ", point_for_e_iv)
print("point for f: ", point_for_f)
print("total points: ", total_points)

In [None]:
# Run the following cells if you want to save the grading result to a .csv file.
# The name of the current working directory is used as `filename` (it is later
# split into name and matriculation number), so on Colab first change into the
# directory of this .ipynb file (cd xxx/xxx).
import os
# os.path.basename uses the path separator of the running platform, so the same
# line works on Windows ('\\') and on Linux/Colab ('/') without editing the cell.
filename = os.path.basename(os.getcwd())

In [None]:
# filename = 

In [None]:
# do you have any feedback
feedback = ""

In [None]:
# `filename` is split on '_': the first part is the name, the second the matriculation number.
filename_parts = filename.split('_')
name = filename_parts[0]
matrikelnr = filename_parts[1]
#feedback += ' '.join([item for item in comments if len(item.strip())>0])
# Append one row for this submission to the grading sheet.
with open('../grading_for_HW9.csv', 'a+', newline='') as csvfile:
    csv.writer(csvfile).writerow([name, matrikelnr, total_points, feedback])