In [1]:
import numpy as np
import pandas as pd
import matplotlib as matplot
import nltk

# Question 1

In [2]:
# Load the English and Italian CONcreTEXT trial sets; read_table parses
# tab-separated files by default.
en_df = pd.read_table('data/CONcreTEXT_trial_EN.tsv')
it_df = pd.read_table('data/CONcreTEXT_trial_IT.tsv')

In [3]:
# Label every row with its source language; pandas broadcasts the scalar
# to all rows of the frame.
en_df['LANGUAGE'] = 'ENGLISH'
it_df['LANGUAGE'] = 'ITALIAN'

In [4]:
# Stack the two corpora into one frame. reset_index(level=0) gives the combined
# frame a fresh default row index and keeps each row's original per-file
# position as an "index" column.
df = pd.concat([en_df, it_df]).reset_index(level=0)
df

Unnamed: 0,index,TARGET,POS,INDEX,TEXT,MEAN,LANGUAGE
0,0,achievement,N,3,"Bring up academic achievements , awards , and ...",3.06,ENGLISH
1,1,achievement,N,9,"Please list people you have helped , your pers...",3.03,ENGLISH
2,2,activate,V,1,Add activated carbon straight to your vodka .,3.83,ENGLISH
3,3,activate,V,15,"Place sensors around your garden , and when a ...",5.51,ENGLISH
4,4,adventure,N,9,Look for a partner that shares your level of a...,2.03,ENGLISH
...,...,...,...,...,...,...,...
195,95,verità,N,8,"In un modo o nell' altro , la verità viene sem...",2.53,ITALIAN
196,96,viaggio,N,2,Organizza dei viaggi nel fine settimana quando...,5.03,ITALIAN
197,97,viaggio,N,6,Pesa le tue valigie prima del viaggio per evit...,4.84,ITALIAN
198,98,vista,N,6,è molto importante non perdere di vista la pro...,2.22,ITALIAN


# Question 2

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words feature extractor, constructed with scikit-learn's default settings.
count_vectorizer = CountVectorizer()

In [6]:
# Fit the vectorizer on the sentences of both languages and build the
# document-term count matrix in one step.
counts = count_vectorizer.fit_transform(df['TEXT'])
# Display the matrix dimensions as a sanity check.
counts.shape

(200, 1330)

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer

# use_idf=False switches off inverse-document-frequency weighting, so despite
# the class name this transformer produces term-frequency features only.
tf_transformer = TfidfTransformer(use_idf=False).fit(counts)

In [8]:
# Convert the raw counts into term-frequency features with the fitted transformer.
tf = tf_transformer.transform(counts)
# Display the dimensions of the transformed matrix.
tf.shape

(200, 1330)

# Question 3 & 4

In [9]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial naive Bayes classifier: features are the term-frequency
# rows in `tf`, labels are the LANGUAGE column of the combined frame.
clf = MultinomialNB().fit(tf, df['LANGUAGE'])

In [10]:
# Two unseen sentences, one English and one Italian, for a quick sanity check.
docs_new = [
    'Why does a rose smell sweet?',
    'Pensa ai tuoi sentimenti di amore.',
]

In [11]:
# Encode the new sentences with the already-fitted vectorizer and transformer
# (transform, not fit_transform) so they share the training vocabulary.
docs_counts = count_vectorizer.transform(docs_new)
# NOTE: named *_tfidf, but tf_transformer was built with use_idf=False.
docs_tfidf = tf_transformer.transform(docs_counts)

In [12]:
# Classify the encoded sentences.
predictions = clf.predict(docs_tfidf)

# Print each sentence next to the language label the classifier assigned.
for sentence, language in zip(docs_new, predictions):
    print(f"{sentence} => {language}")

Why does a rose smell sweet? => ENGLISH
Pensa ai tuoi sentimenti di amore. => ITALIAN


# Question 5

In [13]:
# Hand-written evaluation sentences mixing English and Italian.
sentences = [
    'Leonardo DaVinci was a prolific artist.',  # English
    'Amo il cibo italiano.',  # Italian
    'Quando mangeremo',  # Italian
    'I enjoy leisurely strolls around Rome',  # English
    'Grazie per la buona recensione'  # Italian
]

In [14]:
# Encode the evaluation sentences with the fitted vectorizer and transformer;
# nothing is re-fitted here.
sentences_counts = count_vectorizer.transform(sentences)
# NOTE: named *_tfidf, but tf_transformer was built with use_idf=False.
sentences_tfidf = tf_transformer.transform(sentences_counts)

In [15]:
# Classify the encoded evaluation sentences.
predictions = clf.predict(sentences_tfidf)

# Print each sentence next to the language label the classifier assigned.
for sentence, language in zip(sentences, predictions):
    print(f"{sentence} => {language}")

Leonardo DaVinci was a prolific artist. => ENGLISH
Amo il cibo italiano. => ITALIAN
Quando mangeremo => ITALIAN
I enjoy leisurely strolls around Rome => ENGLISH
Grazie per la buona recensione => ITALIAN


# Extra Credit
### Strategy
Cognates are words that are similar between two languages. By exploiting these words, we can write a sentence that is valid only in language X but that our model mistakes for language Y. I believe the best way to do this is to choose cognates whose spellings are at most one edit apart between the two languages.

In [16]:
# Adversarial input: an Italian phrase built from cognates of English words.
cognate_sentences = [ 
    'Individuo credibile', # Italian, but both words closely resemble English ones ("individual", "credible")
]

In [17]:
# Encode the cognate sentence with the fitted vectorizer and transformer;
# nothing is re-fitted here.
cog_counts = count_vectorizer.transform(cognate_sentences)
# NOTE: named *_tfidf, but tf_transformer was built with use_idf=False.
cog_tfidf = tf_transformer.transform(cog_counts)

In [18]:
# Classify the encoded cognate sentence.
predictions = clf.predict(cog_tfidf)

# Print each sentence next to the language label the classifier assigned.
for sentence, language in zip(cognate_sentences, predictions):
    print(f"{sentence} => {language}")

Individuo credibile => ENGLISH
