# **Module 5: Natural language processing**
## DAT410

### Group 29 
### David Laessker, 980511-5012, laessker@chalmers.se

### Oskar Palmgren, 010529-4714, oskarpal@chalmers.se



We hereby declare that we have both actively participated in solving every exercise. All solutions are entirely our own work, without having taken part of other solutions.

___


## 1) Reading and reflection

a) Like speech recognition and image recognition?

b) Systems that are rule-based explicitly use linguistic rules and dictionaries, while neural systems learn these linguistic patterns from large datasets. Both approaches aim to accurately translate languages by mapping structures and meanings, but through different means.

c) One case is translation with small datasets. When data is scarce, modern neural systems may fail to capture the grammatical patterns of the language, whereas a rule-based system offers more predictable and interpretable results.

## 2) Implementation

In [81]:
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict

In [92]:
# Paths to the lower-cased Europarl v7 parallel corpora. File names follow the
# pattern europarl-v7.<pair>.lc.<language>, i.e. one file per side of a pair.
# Forward slashes are used throughout: they work on every platform, whereas a
# backslash separator only works on Windows and '\e' is an invalid string
# escape sequence.
swe_eng_file_path = 'data/europarl-v7.sv-en.lc.sv'
eng_swe_file_path = 'data/europarl-v7.sv-en.lc.en'

ger_eng_file_path = 'data/europarl-v7.de-en.lc.de'
eng_ger_file_path = 'data/europarl-v7.de-en.lc.en'

fre_eng_file_path = 'data/europarl-v7.fr-en.lc.fr'
eng_fre_file_path = 'data/europarl-v7.fr-en.lc.en'

### (a) Warmup

In [114]:
def word_frequency(file):
    """Count how often each whitespace-separated token occurs in a text file.

    Args:
        file: Path of the text file to read.

    Returns:
        A ``Counter`` mapping every token to its number of occurrences.
        Punctuation tokens are counted like any other token.
    """
    word_counter = Counter()

    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which would mangle non-ASCII words on some systems.
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            # split() without arguments already ignores leading/trailing
            # whitespace, including the newline.
            word_counter.update(line.split())

    return word_counter

In [115]:
# Token counts for the Swedish side of the sv-en corpus.
swe_frequency = word_frequency(swe_eng_file_path)

# The ten most frequent tokens together with their counts.
swe_top_10 = swe_frequency.most_common(10)

# Bare expression so the notebook displays the result.
swe_top_10


[('.', 9648),
 ('att', 9181),
 (',', 8876),
 ('och', 7038),
 ('i', 5949),
 ('det', 5687),
 ('som', 5028),
 ('för', 4959),
 ('av', 4013),
 ('är', 3840)]

In [116]:
# Token counts for the English side of each of the three language pairs.
eng_frequency1 = word_frequency(eng_swe_file_path)
eng_frequency2 = word_frequency(eng_ger_file_path)
eng_frequency3 = word_frequency(eng_fre_file_path)

# Adding Counters sums the counts token by token.
eng_total_frequency = eng_frequency1 + eng_frequency2 + eng_frequency3

# The ten most frequent English tokens together with their counts.
eng_top_10 = eng_total_frequency.most_common(10)

# Bare expression so the notebook displays the result.
eng_top_10


[('the', 58790),
 (',', 42043),
 ('.', 29542),
 ('of', 28406),
 ('to', 26842),
 ('and', 21459),
 ('in', 18485),
 ('is', 13331),
 ('that', 13219),
 ('a', 13090)]

In [117]:
# Token counts for the German side of the de-en corpus.
ger_frequency = word_frequency(ger_eng_file_path)

# The ten most frequent tokens together with their counts.
ger_top_10 = ger_frequency.most_common(10)

# Bare expression so the notebook displays the result.
ger_top_10

[(',', 18549),
 ('die', 10521),
 ('.', 9733),
 ('der', 9374),
 ('und', 7028),
 ('in', 4175),
 ('zu', 3168),
 ('den', 2976),
 ('wir', 2863),
 ('daß', 2738)]

In [118]:
# Token counts for the French side of the fr-en corpus.
fre_frequency = word_frequency(fre_eng_file_path)

# The ten most frequent tokens together with their counts.
fre_top_10 = fre_frequency.most_common(10)

# Bare expression so the notebook displays the result.
fre_top_10

[('&apos;', 16729),
 (',', 15402),
 ('de', 14520),
 ('la', 9746),
 ('.', 9734),
 ('et', 6619),
 ('l', 6536),
 ('le', 6174),
 ('les', 5585),
 ('à', 5500)]

**Do we need to remove punctuation and other symbols such as apostrophes?**

In [127]:
# Pool the token counts of all four languages into a single Counter.
eur_parl_frequency = swe_frequency + fre_frequency + ger_frequency + eng_total_frequency

# Total number of tokens in the pooled counts; punctuation tokens are included
# because word_frequency does not filter them out.
words_amount = sum(eur_parl_frequency.values())

# Unsmoothed relative frequency of each word, expressed as a percentage.
# A Counter returns 0 for a token it has never seen, so an unseen word gets
# probability 0.

speaker_probability = (eur_parl_frequency['speaker'] / words_amount ) * 100
zebra_probability = (eur_parl_frequency['zebra'] / words_amount ) * 100

# Report the estimated probabilities for "speaker" and "zebra".
print(f'Probability of speaker: {speaker_probability:.5f} %')
print(f'Probability of zebra: {zebra_probability} %')


Probability of speaker: 0.00193 %
Probability of zebra: 0.0 %


### (b) Language modeling

In [120]:
def read_file(file):
    '''
    Load a UTF-8 text file as tokenised sentences.

    Every line of the file is treated as one sentence and split on whitespace.
    Returns a list with one list of tokens per line; a blank line yields an
    empty list.
    '''
    with open(file, 'r', encoding='utf-8') as handle:
        return [raw_line.strip().split() for raw_line in handle]


In [69]:
import random
from collections import defaultdict, Counter

class BigramModel:
    """Bigram language model built from raw successor counts.

    After training, ``bigram_counts[w1][w2]`` holds how many times ``w2``
    directly followed ``w1``, and ``starting_words`` holds the first token of
    every non-empty training sentence.
    """

    def __init__(self):
        # Maps a word to a Counter of the words observed directly after it.
        self.bigram_counts = defaultdict(Counter)
        # First token of each training sentence. Duplicates are kept so that
        # random.choice picks a start word in proportion to how often it
        # opened a sentence.
        self.starting_words = []

    def train(self, sentence_list):
        """Accumulate bigram counts from tokenised sentences.

        Args:
            sentence_list: Iterable of sentences, each a list of tokens.
                Empty sentences (e.g. from blank lines) are skipped.
        """
        for sentence in sentence_list:
            # A blank line produces an empty token list; it has no first
            # word and no bigrams, so there is nothing to record.
            if not sentence:
                continue

            self.starting_words.append(sentence[0])

            # Count every pair of adjacent tokens in the sentence.
            for current_word, next_word in zip(sentence, sentence[1:]):
                self.bigram_counts[current_word][next_word] += 1

    def predict_next_word(self, word):
        """Sample a successor of ``word`` in proportion to its bigram count.

        Args:
            word: The word whose successor should be sampled.

        Returns:
            A sampled next word, or ``None`` if ``word`` was never seen with
            a successor during training.
        """
        # Test membership before indexing: indexing the defaultdict would
        # silently insert an empty Counter for an unseen word.
        if word not in self.bigram_counts:
            return None

        followers = self.bigram_counts[word]
        # random.choices accepts relative weights, so the raw counts can be
        # used directly without normalising them to probabilities.
        return random.choices(list(followers), weights=list(followers.values()))[0]

    def generate_text(self, start_word, length=10):
        """Generate up to ``length`` space-separated words.

        Args:
            start_word: Preferred first word; it is lower-cased before lookup.
                If it has no known successor, a random sentence-starting word
                from the training data is used instead.
            length: Maximum number of words to generate. At least one word is
                always returned.

        Returns:
            The generated text, or an explanatory message when the start word
            is unknown and the model has no starting words to fall back on.
        """
        start = start_word.lower()

        if start in self.bigram_counts:
            current_word = start
        elif self.starting_words:
            current_word = random.choice(self.starting_words)
        else:
            return "Model not trained or start word not in corpus."

        generated_text = [current_word]

        for _ in range(length - 1):
            next_word = self.predict_next_word(current_word)
            if next_word is None:
                break  # The current word was never followed by anything.
            generated_text.append(next_word)
            current_word = next_word

        return ' '.join(generated_text)





In [123]:
# Load the Swedish corpus as a list of tokenised sentences.
swe_sentences = read_file(swe_eng_file_path)

# Train a bigram model on the Swedish sentences.
model = BigramModel()
model.train(swe_sentences)

# Generate up to ten words, starting from "jag".
start_word = "jag"
generated_text = model.generate_text(start_word, 10)
print(generated_text)


jag väntar också gör att skapa en fråga som orienterar


In [130]:


def createBigram(data):
    """Collect bigrams and their counts from tokenised sentences.

    For every pair of adjacent tokens in a sentence, the pair is recorded only
    when the second token satisfies ``str.islower()``. The first token of every
    adjacent pair is counted regardless of that filter, so the final token of
    a sentence is not counted in that position.

    Args:
        data: Iterable of sentences, each a list of tokens.

    Returns:
        A tuple ``(bigrams, first_token_counts, bigram_counts)``: the list of
        recorded bigrams in order of occurrence (with repeats), a dict of
        counts for tokens in the first position of a pair, and a dict of
        counts per recorded bigram.
    """
    bigram_list = []
    pair_counts = {}
    first_counts = {}

    for tokens in data:
        for first, second in zip(tokens, tokens[1:]):
            if second.islower():
                pair = (first, second)
                bigram_list.append(pair)
                pair_counts[pair] = pair_counts.get(pair, 0) + 1

            first_counts[first] = first_counts.get(first, 0) + 1

    return bigram_list, first_counts, pair_counts


def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):
    """Compute a relative-frequency probability for each listed bigram.

    Args:
        listOfBigrams: Iterable of ``(word1, word2)`` tuples; repeats are
            allowed and collapse to a single entry.
        unigramCounts: Dict mapping a word to its count.
        bigramCounts: Dict mapping a bigram tuple to its count.

    Returns:
        A dict mapping each bigram to ``bigramCounts[bigram]`` divided by
        ``unigramCounts[word1]``.
    """
    return {
        pair: bigramCounts.get(pair) / unigramCounts.get(pair[0])
        for pair in listOfBigrams
    }
