In [1]:
import numpy as np
import pandas as pd
import matplotlib as matplot
import nltk
import sklearn

In [2]:
en_df = pd.read_csv('data/CONcreTEXT_trial_EN.tsv', sep='\t') # load data files
it_df = pd.read_csv('data/CONcreTEXT_trial_IT.tsv', sep='\t')

## Question 1

In [3]:
tokenizer = nltk.RegexpTokenizer(r"\w+")
en_df['TOKENS'] = en_df['TEXT'].apply(lambda x: x.lower()).apply(tokenizer.tokenize)
it_df['TOKENS'] = it_df['TEXT'].apply(lambda x: x.lower()).apply(tokenizer.tokenize)

en_words = [word for sentence in en_df['TOKENS'] for word in sentence]
it_words = [word for sentence in it_df['TOKENS'] for word in sentence]

print('English Dictionary Size: %s\nItalian Dictionary Size: %d' % (len(en_words), len(it_words)))

English Dictionary Size: 1314
Italian Dictionary Size: 1306


In [4]:
en_train = []
en_test = []

it_train = []
it_test = []

state = 4111 # consistent state for testing, used 3951, 4111, 0, 42 for different states.
en_train, en_test = sklearn.model_selection.train_test_split(en_words, train_size=0.8, test_size=0.2, random_state=state)
it_train, it_test = sklearn.model_selection.train_test_split(it_words, train_size=0.8, test_size=0.2, random_state=state)

## Question 2

*Notes on processing*: All words have been converted to lowercase, and I think that minimizing the "randomness" of having some words be capitalized and others be lowercase will help accuracy. I also did not use START and STOP sequences as each letter is being analyzed individually, so START and STOP would have no impact.

In [5]:
# English unigrams
en_unigrams = []
for word in en_train:
    unigram = [character for character in word]
    en_unigrams.append(unigram)
    
# Italian unigrams
it_unigrams = []
for word in it_train:
    unigram = [character for character in word]
    it_unigrams.append(unigram)

In [6]:
# calculate distribution of letters
en_unifreqdist = nltk.FreqDist()
for unigram in en_unigrams:
    en_unifreqdist += nltk.FreqDist(unigram)
    
it_unifreqdist = nltk.FreqDist()
for unigram in it_unigrams:
    it_unifreqdist += nltk.FreqDist(unigram)

In [7]:
en_test_df = pd.DataFrame(en_test, columns=['Word'])

In [8]:
en_probabilities = []
it_probabilities = []
for index in en_test_df.index:
    characters = [characters for characters in en_test_df.loc[index]['Word']]
    
    en_probability = 1
    it_probability = 1
    
    for character in characters:
        en_probability *= en_unifreqdist.freq(character)
        it_probability *= it_unifreqdist.freq(character)

    en_probabilities.append(en_probability)
    it_probabilities.append(it_probability)
    
en_test_df['English Unigram'] = en_probabilities
en_test_df['Italian Unigram'] = it_probabilities

In [9]:
unicorrect = 0
classification = []
for index in en_test_df.index:
    if en_test_df.loc[index]['English Unigram'] > en_test_df.loc[index]['Italian Unigram']:
        unicorrect += 1
        classification.append("English")
    else:
        classification.append("Italian")

en_test_df['Unigram'] = classification
print("Accuracy: {:.3%} over a test size of {}".format(unicorrect/len(en_test_df), len(en_test_df)))
unigram_accuracy = unicorrect/len(en_test_df)

Accuracy: 60.837% over a test size of 263


## Question 3

When constructing bigrams, I chose to use the nltk.bigrams() function as a helper tool. I didn't want to reinvent the wheel. I inserted START and STOP codes at the appropriate places. Additionally, we only construct bigrams for words that are longer than length 1.

In [10]:
# Start and stop sequences
START = "START"
STOP = "STOP"

In [11]:
en_bigrams = []
for word in en_train:
    if len(word) > 1:
        bg = list(nltk.bigrams(word))
        bg.insert(0, (START, bg[0][0]))
        bg.append((bg[len(bg)-1][1], STOP))

        for bigram in bg:
            en_bigrams.append(bigram)

            
it_bigrams = []
for word in it_train:
    if len(word) > 1:
        bg = list(nltk.bigrams(word))
        bg.insert(0, (START, bg[0][0]))
        bg.append((bg[len(bg)-1][1], STOP))

        for bigram in bg:
            it_bigrams.append(bigram)


In [12]:
# calculate distribution of letters
en_conditional_freqdist = nltk.ConditionalFreqDist( (preceeding, word) for (preceeding, word) in en_bigrams)
it_conditional_freqdist = nltk.ConditionalFreqDist( (preceeding, word) for (preceeding, word) in it_bigrams)

In [13]:
en_probabilities = []
it_probabilities = []
for index in en_test_df.index:
    
    en_probability = 0
    it_probability = 0
    
    if len(en_test_df.loc[index]['Word']) > 1:
        bigrams = list(nltk.bigrams(en_test_df.loc[index]['Word']))

        en_probability = 1
        it_probability = 1
        
        for bigram in bigrams:
            (preceeding, word) = bigram
            en_probability *= en_conditional_freqdist[preceeding].freq(word)
            it_probability *= it_conditional_freqdist[preceeding].freq(word)

    else:
        en_probability = -1
        it_probability = -1
    
    en_probabilities.append(en_probability)
    it_probabilities.append(it_probability)


en_test_df['English Bigram'] = en_probabilities
en_test_df['Italian Bigram'] = it_probabilities

In [14]:
bicorrect = 0
classification = []
for index in en_test_df.index:
    if en_test_df.loc[index]['English Bigram'] > en_test_df.loc[index]['Italian Bigram']:
        bicorrect += 1
        classification.append("English")
    else:
        classification.append("Italian")

en_test_df['Bigram'] = classification
print("Accuracy: {:.3%} over a test size of {}".format(bicorrect/len(en_test_df), len(en_test_df)))
bigram_accuracy = bicorrect/len(en_test_df)

Accuracy: 77.186% over a test size of 263


### Question 3 Discussion

The accuracy for a bigram model was SIGNIFICANTLY higher than a unigram model. Refer to comparison below. For some random states when splitting the train/test groups, we only see a ~5-7% increase in accuracy, but for a random state of 4111 (course code!) we see a whopping ~17% increase.

Therefore, we can say that a bigram character-level language model is better at distinguishing langage than a unigram character-level language model. 

This is pretty clear why. Having more context for the relationship of the letter to it's predecessor means that we can see if a word contains a certain combination that is unlikely to appear in Italian, but very common in English. This can be attributed to the evolution of languages, as over time languages diverge and develop seperately, and the bigram model (and possibly trigram model) allows for us to see some differences.

In a unigram model, we're reaching into a scrabble bag and just picking letters. This only lets us look at raw frequency. It is not sufficient to determine language.

In [15]:
print("Accuracy for unigram model:\t{:.3%}\nAccuracy for bigram model:\t{:.3%}".format(unigram_accuracy, bigram_accuracy))

Accuracy for unigram model:	60.837%
Accuracy for bigram model:	77.186%
