# Bag of Words with SVM Model
This notebook documents the implementation of the Bag-of-Words Model and Support Vector Machine for the English-German task.

## 1. Importing libraries
In this section, we import nltk, numpy, re and heapq for use in the implementation below, and load the English and German stop-word lists from nltk.

In [0]:
import nltk 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize
import numpy as np
import re
import heapq
# Stop-word sets for both languages, built once so that membership checks
# during pre-processing are fast set lookups
stop_words_en, stop_words_de = (
    set(stopwords.words(language)) for language in ('english', 'german')
)

## 2. Defining functions


### 2.1 Read File
This function takes a file path as input and returns the contents of the file as a list of lines.

In [0]:
def read_file(file):
    """Read a UTF-8 encoded text file and return its lines.

    Parameters
    ----------
    file : str or path-like
        Path of the file to read.

    Returns
    -------
    list of str
        One entry per line, each keeping its trailing newline character
        (as produced by ``readlines``).
    """
    # The context manager closes the handle even if reading raises
    with open(file, encoding="utf8") as f:
        return f.readlines()

### 2.2 Pre-Processing
In this section, we do 3 types of pre-processing on the text:


1.   Removal of stop words
2.   Removal of punctuation
3.   Conversion to lowercase





In [0]:
def process_text(file,lang):
    """Read a text file and return its lines as lists of cleaned tokens.

    Each line is lower-cased, every non-word character is replaced by a
    space, runs of whitespace are collapsed, and the result is tokenised
    with ``nltk.word_tokenize``. Stop words of the requested language are
    then dropped.

    Parameters
    ----------
    file : str or path-like
        Path of the text file, read with ``read_file``.
    lang : str
        ``'en'`` filters English stop words, ``'de'`` filters German stop
        words. Any other value applies no stop-word filtering, but the
        line is still returned as a token list so the return type is the
        same for every language code.

    Returns
    -------
    list of list of str
        One token list per line of the file.
    """
    # Unknown language codes map to an empty set, i.e. nothing is filtered
    stop_words = {'en': stop_words_en, 'de': stop_words_de}.get(lang, frozenset())

    cleaned_text = []
    for line in read_file(file):
        # Conversion to lowercase
        line = line.lower()
        # Removal of punctuation, then collapse the whitespace left behind
        line = re.sub(r'\W',' ',line)
        line = re.sub(r'\s+',' ',line)
        tokens = nltk.word_tokenize(line)
        # Removal of stop words
        cleaned_text.append([w for w in tokens if w not in stop_words])
    return cleaned_text

### 2.3 Build Vocabulary of Known Words
This function builds the vocabulary of known words needed for the Bag-of-Words model: the 400 most frequent tokens in the pre-processed corpus.

In [0]:
def build_vocab(file, lang, vocab_size=400):
    """Build the vocabulary of the most frequent tokens in a corpus file.

    Parameters
    ----------
    file : str or path-like
        Path of the corpus file; it is pre-processed with ``process_text``.
    lang : str
        Language code passed through to ``process_text``.
    vocab_size : int, optional
        Maximum number of tokens to keep. Defaults to 400.

    Returns
    -------
    list of str
        Up to ``vocab_size`` tokens, ordered from most to least frequent.
    """
    cleaned_text = process_text(file,lang)

    # Count how often every token occurs across all sentences
    wordfreq = {}
    for sentence in cleaned_text:
        for token in sentence:
            wordfreq[token] = wordfreq.get(token, 0) + 1

    # Iterating the dict yields tokens; their counts are the ranking key
    return heapq.nlargest(vocab_size, wordfreq, key=wordfreq.get)

### 2.4 Vectorise Sentence
This function converts each sentence into a binary vector over the vocabulary of known words: an entry is 1 if the corresponding word occurs in the sentence and 0 otherwise.

In [0]:
def convert_to_vec(file,wordfreqlist,lang):
    """Turn every line of a file into a binary bag-of-words vector.

    Parameters
    ----------
    file : str or path-like
        Path of the text file; it is pre-processed with ``process_text``.
    wordfreqlist : iterable of str
        Vocabulary; position ``j`` of each vector refers to its ``j``-th word.
    lang : str
        Language code passed through to ``process_text``.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(number of lines, vocabulary size)`` whose
        entry ``[i, j]`` is 1 if word ``j`` occurs in line ``i`` and 0
        otherwise.
    """
    cleaned_text = process_text(file, lang)
    vocab = list(wordfreqlist)

    sentence_vectors = [
        [1 if token in sentence else 0 for token in vocab]
        for sentence in cleaned_text
    ]

    # The explicit reshape keeps the result two-dimensional even when the
    # file (or the vocabulary) is empty, so it can still be concatenated
    # column-wise with other feature matrices
    return np.asarray(sentence_vectors, dtype=int).reshape(len(cleaned_text), len(vocab))

## 3. Getting Bag-of-Words

In [0]:
# The stop-word sets stop_words_en / stop_words_de are already built in the
# imports cell (Section 1), so they are not rebuilt here.

# Get bag of words for English (vocabulary of the source sentences)
bow_en = build_vocab("./train.ende.src", "en")

# Get bag of words for German (vocabulary of the machine translations)
bow_de = build_vocab("./train.ende.mt" , "de")

## 4. Getting Training and Validation Sets

In [0]:
# Each feature row is the English source vector followed by the German
# translation vector, joined column-wise.

# --- Training split ---
de_train_src = convert_to_vec("./train.ende.src", bow_en, "en")
de_train_mt = convert_to_vec("./train.ende.mt", bow_de, "de")
X_train_de = np.concatenate([de_train_src, de_train_mt], axis=1)

de_train_scores = read_file("./train.ende.scores")
y_train_de = train_scores = np.asarray(de_train_scores).astype(float)

# --- Validation split ---
de_val_src = convert_to_vec("./dev.ende.src", bow_en, "en")
de_val_mt = convert_to_vec("./dev.ende.mt", bow_de, "de")
X_val_de = np.concatenate([de_val_src, de_val_mt], axis=1)

de_val_scores = read_file("./dev.ende.scores")
y_val_de = val_scores = np.asarray(de_val_scores).astype(float)

# --- Training and validation stacked row-wise (used for cross-validation) ---
X = np.concatenate([X_train_de, X_val_de], axis=0)
y = np.concatenate([y_train_de, y_val_de], axis=0)

## 5. Training the Regressor
First, define the RMSE function

In [0]:
def rmse(predictions, targets):
    """Return the root-mean-square error between predictions and targets.

    Parameters
    ----------
    predictions, targets : array-like of numbers
        Values to compare element-wise. Plain Python sequences are accepted
        as well as NumPy arrays; both are converted to float arrays first.

    Returns
    -------
    float
        Square root of the mean squared difference.
    """
    errors = np.asarray(predictions, dtype=float) - np.asarray(targets, dtype=float)
    return np.sqrt(np.mean(errors ** 2))

### 5.1 SVM
The combined training and validation data is split into 8 folds for cross-validation. For each SVM kernel, a model is trained on 7 of the folds and evaluated on the held-out fold, once per fold. The kernel with the highest average Pearson correlation across the folds is considered the best type of SVM.

In [0]:
from sklearn.svm import SVR
# pearsonr is imported from scipy.stats; the scipy.stats.stats path is deprecated
from scipy.stats import pearsonr
from sklearn.model_selection import KFold
import pickle

# model[i] ends up holding, for the i-th kernel, the fitted SVR of the fold
# on which that kernel reached its highest Pearson correlation
model = [0,0,0,0]

# One seeded 8-fold splitter: every kernel is evaluated on the same folds
kf = KFold(n_splits=8, shuffle=True, random_state=42)

for i, k in enumerate(['linear','poly','rbf','sigmoid']):
    pearson_arr = []
    print(k)
    LOCAL_MAX = -np.inf
    for train_index, val_index in kf.split(X):
        # Fold-local names, so the train/validation arrays built in
        # Section 4 are not overwritten by the last fold
        X_fold_train, X_fold_val = X[train_index], X[val_index]
        y_fold_train, y_fold_val = y[train_index], y[val_index]
        clf_t = SVR(kernel=k,gamma='auto')
        clf_t.fit(X_fold_train, y_fold_train)
        predictions = clf_t.predict(X_fold_val)
        pearson = pearsonr(y_fold_val, predictions)
        # A NaN correlation is ranked lowest so it cannot block later folds
        score = -np.inf if np.isnan(pearson[0]) else pearson[0]
        # Keep the model of the best fold; the first fold always populates
        # model[i] so it is never left as the placeholder 0
        if not isinstance(model[i], SVR) or score > LOCAL_MAX:
            model[i] = clf_t
            LOCAL_MAX = score
        pearson_arr.append(pearson[0])
        print(f'RMSE: {rmse(predictions,y_fold_val)} Pearson {pearson[0]}')

    print(f'Average Pearson: {np.mean(pearson_arr)}')
    print()

## 6. Getting the test set predictions
Based on the cross-validation results, the linear kernel achieved the highest Pearson correlation, so we use the linear model.

In [0]:
# Vectorise the test sentences with the vocabularies built from the training files
de_test_src = convert_to_vec("./test.ende.src", bow_en, 'en')
de_test_mt = convert_to_vec("./test.ende.mt", bow_de, 'de')

# Source and translation vectors joined column-wise, as for the training data
X_test_de = np.concatenate([de_test_src, de_test_mt], axis=1)

# model[0] is the SVR kept for the 'linear' kernel
clf_de = model[0]
predictions_de = clf_de.predict(X_test_de)

## 7. Writing to File

In [0]:
import os

def writeScores(method_name,scores,fn="predictions.txt"):
  """Write one score per line to a text file.

  Parameters
  ----------
  method_name : str
      Not used by the function; kept so existing calls keep working.
  scores : iterable
      Values to write; each is formatted with ``str`` via an f-string.
  fn : str or path-like, optional
      Output path. Defaults to ``"predictions.txt"`` in the working
      directory. An existing file is overwritten.
  """
  print("")
  with open(fn, 'w') as output_file:
    output_file.writelines(f"{x}\n" for x in scores)

In [0]:
# Save the test-set predictions to a text file, one score per line
writeScores(method_name="SVR", scores=predictions_de)