In [None]:
%%HTML
<!-- Render table cells, table rows and list items at 18px in this notebook's output. -->
<style> td {font-size: 18px} </style>
<style> tr {font-size: 18px} </style>
<style> li {font-size: 18px} </style>

In [None]:
from IPython.core.display import display, HTML
from utils import load_data, print_data_stats, subset_data
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# Introduction to the dataset
- What is Sentiment Analysis?
    - "I like this movie" --> positive
    - "I hate this movie" --> negative
- [ABSA multilingual SA dataset](http://alt.qcri.org/semeval2016/task5/)
- What kind of preprocessing can you think of from the examples below?

In [None]:
# Two-letter codes of the languages covered by this notebook.
LANGS = ["ar", "en", "es", "ru", "zh"]

# Language code -> full language name (the form passed to the NLTK helpers
# used later in this notebook).
LANGS_MAPPING = {
    "en": "english",
    "es": "spanish",
    "ru": "russian",
    "ar": "arabic",
    "zh": "chinese",
}

# Loaded through utils.load_data; later cells index it as
# data[lang]["train"] / data[lang]["test"].
data = load_data()

In [None]:
# Print summary statistics for the loaded data (max_len=40 is forwarded to print_data_stats).
print_data_stats(data, max_len=40)

- Re-sample the data so that all languages have the same number of training examples

In [None]:
# Subsample the data with utils.subset_data, then print the same statistics for the subset.
data_sampled = subset_data(data)
print_data_stats(data_sampled,40)

# Activity: build your own dataset (20 for train, 5 for test)
- try to use similar words as much as possible
- try to make some word overlaps between examples

In [None]:
# Placeholder code for the language you are adding; replace "?" with your own.
NEW_LANG = "?"

# Hand-written example sentences, grouped by split and sentiment label.
train_pos_sents = ["I like this movie","Ihe nkiri a masịrị m", "Ninapenda sinema hii"]
train_neg_sents = ["I hate this movie"]
test_pos_sents = ["I enjoyed the movie"]
test_neg_sents = ["Never watch it"]

# Register the new language in the same layout the other languages use:
# data[lang][split] is a list of (sentence, label) pairs.
# The test split must be built from the *test* sentences, otherwise the model
# is evaluated on exactly the examples it was trained on.
data[NEW_LANG] = {
    "train": [(sent, "pos") for sent in train_pos_sents] + [(sent, "neg") for sent in train_neg_sents],
    "test": [(sent, "pos") for sent in test_pos_sents] + [(sent, "neg") for sent in test_neg_sents],
}

# Load stemmers, word_tokenizers, stopword_filters
- **stemming/lemmatization**: reducing inflected (or sometimes derived) words to their word stem
- **word segmentation (tokenization)**: dividing a string of written language into its component words
- **stopwords**: a set of commonly used words

In [None]:
import Stemmer
import stopwordsiso as stopwordsiso
import jieba
from pyarabic import araby 
from nltk.tokenize import word_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import stopwordsiso

class MultiStopword:
    """Stopword membership checks for several languages, keyed by language code."""

    def __init__(self):
        """Build the language-code -> stopword-collection table."""
        table = {}

        # Lists taken from the NLTK stopwords corpus, addressed by full language name.
        for code in ("en", "es", "ar", "ru"):
            table[code] = set(stopwords.words(LANGS_MAPPING[code]))

        # Lists taken from stopwordsiso, addressed directly by language code.
        for code in ("zh",):
            table[code] = stopwordsiso.stopwords(code)

        ## TODO
        table[NEW_LANG] = {""}

        self.stopwords = table

    def is_stopword(self, word, lang):
        """Return True if ``word`` is in the stopword collection for ``lang``.

        The comparison is an exact (case-sensitive) membership test.

        Raises:
            NotImplementedError: if no stopword collection is registered for ``lang``.
        """
        if lang not in self.stopwords:
            raise NotImplementedError
        return word in self.stopwords[lang]

class MultiWordSegmenter:
    """Splits text into tokens with a tokenizer chosen by language code."""

    def __init__(self):
        # Only the Russian branch needs a tokenizer object created up front;
        # the other branches call module-level functions.
        self.tokenizer = {}
        self.tokenizer["ru"] = ToktokTokenizer()

    def segment(self, text, lang):
        """Tokenize ``text`` according to ``lang``.

        Args:
            text: The string to split into tokens.
            lang: Language code ("en", "es", "zh", "ru", "ar" or ``NEW_LANG``).

        Returns:
            The tokens produced by the tokenizer for ``lang``. The Chinese
            branch is materialized into a list so the result can be iterated
            more than once and supports ``len()``.

        Raises:
            NotImplementedError: if ``lang`` has no tokenizer configured.
        """
        if lang in ["en","es"]:
            return word_tokenize(text, language=LANGS_MAPPING[lang])
        elif lang == "zh":
            # jieba.cut yields tokens lazily; a bare generator would be
            # exhausted after a single pass, so hand back a list instead.
            return list(jieba.cut(text))
        elif lang == "ru":
            return self.tokenizer["ru"].tokenize(text)
        elif lang == "ar":
            return araby.tokenize(text)
        ## TODO
        elif lang == NEW_LANG:
            return word_tokenize(text)
        else:
            raise NotImplementedError

class MultiWordStemmers:
    """Routes word stemming to a per-language ``Stemmer.Stemmer`` instance."""

    def __init__(self):
        """Create one stemmer for each language code that has an algorithm here."""
        algorithm_by_code = {
            "en": "english",
            "ar": "arabic",
            "ru": "russian",
            "es": "spanish",
        }
        self.stemmers = {
            code: Stemmer.Stemmer(algorithm)
            for code, algorithm in algorithm_by_code.items()
        }

    def stem(self, word, lang):
        """Return the stem of ``word`` for ``lang``.

        Languages with a registered stemmer use its ``stemWord``; "zh" and
        ``NEW_LANG`` return the word unchanged.

        Raises:
            NotImplementedError: for any other language code.
        """
        if lang in self.stemmers:
            return self.stemmers[lang].stemWord(word)
        if lang == "zh":
            # No stemmer is applied to Chinese tokens.
            return word
        if lang == NEW_LANG:
            ## TODO
            return word
        raise NotImplementedError

# Shared instances used by the preprocessing helpers in the cells below.
stopword_checkers = MultiStopword()
word_segmenters = MultiWordSegmenter()
stemmers = MultiWordStemmers()

- **Examples:**

In [None]:
# Stem three related English word forms to compare the results.
for word in ("friend", "friends", "friended"):
    print(stemmers.stem(word, "en"))

In [None]:
# Russian forms of MUST, one per gender/number.
russian_must_forms = (
    "должен",  # Male
    "должна",  # Female
    "должно",  # Neutral
    "должны",  # Plural
)
for form in russian_must_forms:
    print(stemmers.stem(form, "ru"))

In [None]:
def preprocessing_example(sentence, lang):
    """Print a token-by-token preprocessing trace for one sentence.

    Output, in order:
      1. one ``(stem, is_stopword)`` tuple per token, one per line;
      2. the original tokens with stopwords removed, joined by single spaces.

    Args:
        sentence: Text to segment.
        lang: Language code understood by the notebook-level
            ``word_segmenters``, ``stemmers`` and ``stopword_checkers`` objects.
    """
    # Segment once and materialize the result: the tokens are needed for both
    # printed views, and segment() may return a lazy iterator for some languages.
    tokens = list(word_segmenters.segment(sentence, lang))
    # Check each token against the stopword list once and reuse the answer.
    flagged = [(tok, stopword_checkers.is_stopword(tok, lang)) for tok in tokens]

    print("\n".join(str((stemmers.stem(tok, lang), is_stop)) for tok, is_stop in flagged))
    print(" ".join(tok for tok, is_stop in flagged if not is_stop))

# English example containing an abbreviation, a decimal number and punctuation.
ex_sentence = "Mr.Brown measured the cat this morning, and it was 14.5 pounds!"
preprocessing_example(ex_sentence, "en")

# Activity: add stemmers, word_tokenizers, stopwords for your language
- find and edit `## TODO`

In [None]:
# and test out! Replace the placeholder below with a sentence in your language.
ex_sentence = "your sentence!"
preprocessing_example(ex_sentence, NEW_LANG)

# Activity 2: reduce the number of features (# of unigrams)

In [None]:
# Baseline: number of distinct unigrams per language when the training
# sentences are vectorized as-is (no lowercasing, no extra preprocessing).
baseline = {}
vectorizer = CountVectorizer(ngram_range=(1, 1), lowercase=False)

for lang in data_sampled.keys():
    sentences_train, y_train = zip(*data_sampled[lang]["train"])
    vectorizer.fit(sentences_train)
    # vocabulary_ (term -> column index) has one entry per feature and exists in
    # every scikit-learn release; get_feature_names() was removed in 1.2.
    num_unigram = len(vectorizer.vocabulary_)
    baseline[lang] = num_unigram
    print(lang, num_unigram)

In [None]:
def preprocess(sentence_list:list, lang:str) -> list:
    """Run ``preprocess_sentence`` over every sentence, preserving order.

    Args:
        sentence_list: Iterable of raw sentences.
        lang: Language code forwarded to ``preprocess_sentence``.

    Returns:
        A list with one preprocessed string per input sentence.
    """
    processed = []
    for sentence in sentence_list:
        processed.append(preprocess_sentence(sentence, lang))
    return processed

def preprocess_sentence(text:str, lang:str) -> str:
    """Lowercase ``text`` and collapse whitespace runs into single spaces.

    ``lang`` is accepted but not used by this starter version.

    Returns:
        The lowercased whitespace-separated words joined by single spaces.
    """
    ## TODO
    # Activity hint: replace the plain whitespace split with language-aware steps,
    # e.g. word_segmenters.segment(text, lang), stemmers.stem(w, lang), and dropping
    # words for which stopword_checkers.is_stopword(w, lang) is true.
    words = text.lower().split()
    return " ".join(words)

# Re-count unigrams per language after preprocessing and report the reduction
# relative to the baseline computed earlier.
vectorizer = CountVectorizer(ngram_range=(1, 1), lowercase=False)
for lang in LANGS:
    sentences_train, y_train = zip(*data_sampled[lang]["train"])
    sentences_train = preprocess(sentences_train, lang)
    vectorizer.fit(sentences_train)
    # vocabulary_ has one entry per feature and exists in every scikit-learn
    # release; get_feature_names() was removed in 1.2.
    num_unigram = len(vectorizer.vocabulary_)
    print(f"{lang}: {num_unigram:<5}({baseline[lang]-num_unigram}\u2193)")

# Train Naive Bayes models

In [None]:
def preprocess(sentence_list:list, lang:str, bool_lowercase=True, bool_segment=True, bool_stem=True, filter_stopwords=False) -> list:
    """Preprocess every sentence with the same set of switches.

    Args:
        sentence_list: Iterable of raw sentences.
        lang: Language code forwarded to ``preprocess_sentence``.
        bool_lowercase, bool_segment, bool_stem, filter_stopwords: Switches
            forwarded unchanged, in this order, to ``preprocess_sentence``.

    Returns:
        A list with one preprocessed string per input sentence, in input order.
    """
    switches = (bool_lowercase, bool_segment, bool_stem, filter_stopwords)
    processed = []
    for sentence in sentence_list:
        processed.append(preprocess_sentence(sentence, lang, *switches))
    return processed

def preprocess_sentence(text:str, lang:str, bool_lowercase, bool_segment, bool_stem, filter_stopwords) -> str:
    """Preprocess one sentence and return it as a space-joined token string.

    Steps, each controlled by its flag:
      1. ``bool_lowercase``: lowercase the text.
      2. ``bool_segment``: tokenize with ``word_segmenters.segment``; otherwise
         split on whitespace.
      3. ``filter_stopwords``: drop tokens flagged by ``stopword_checkers``.
      4. ``bool_stem``: replace each remaining token by ``stemmers.stem``.

    Args:
        text: Raw sentence.
        lang: Language code passed to the segmenter, stopword checker and stemmer.

    Returns:
        The resulting tokens joined by single spaces.
    """
    if bool_lowercase:
        text = text.lower()

    if bool_segment:
        words = word_segmenters.segment(text, lang)
    else:
        words = text.split()

    # Filter before stemming: the stopword check is an exact match against
    # unstemmed word forms, so a token that has already been stemmed may no
    # longer match its stopword entry and would slip through.
    if filter_stopwords:
        words = [w for w in words if not stopword_checkers.is_stopword(w, lang)]

    if bool_stem:
        words = [stemmers.stem(w, lang) for w in words]

    return " ".join(words)

def train_and_evaluate_nb(data:dict, lang:str, max_feat=100) -> dict:
    """Train and score a multinomial Naive Bayes classifier for one language.

    The train and test sentences of ``data[lang]`` are run through
    ``preprocess`` with its default switches, vectorized as unigram counts,
    and used to fit and score a ``MultinomialNB`` model. The score is printed
    as ``"<lang>: <score>"`` with two decimals.

    Args:
        data: Mapping ``lang -> {"train": [...], "test": [...]}`` where each
            split is a sequence of ``(sentence, label)`` pairs.
        lang: Key of the language to train on.
        max_feat: Passed to ``CountVectorizer`` as ``max_features``.

    Returns:
        A dict with the fitted objects: ``{"model": ..., "vectorizer": ...}``.
    """
    sentences_train, y_train = zip(*data[lang]["train"])
    sentences_test, y_test = zip(*data[lang]["test"])

    sentences_train, sentences_test = preprocess(sentences_train, lang), preprocess(sentences_test, lang)
    # lowercase=False: case handling is left to preprocess().
    vectorizer = CountVectorizer(ngram_range=(1, 1), max_features=max_feat, lowercase=False)
    x_train = vectorizer.fit_transform(sentences_train)
    # Reuse the vocabulary fitted on the training split for the test split.
    x_test = vectorizer.transform(sentences_test)

    model = MultinomialNB()
    model.fit(x_train, y_train)
    acc = model.score(x_test, y_test)
    print(f"{lang}: {acc:.2f}")
    return {"model":model, "vectorizer":vectorizer}

def predict(models, lang, sents):
    """Print predicted labels for one or more sentences.

    Args:
        models: Mapping ``lang -> {"model": ..., "vectorizer": ...}`` as
            produced by ``train_and_evaluate_nb``.
        lang: Language key selecting the model/vectorizer pair.
        sents: A single sentence (any ``str``) or an iterable of sentences.

    Prints:
        A list of ``(preprocessed_sentence, predicted_label)`` pairs.
    """
    model, vectorizer = models[lang]["model"], models[lang]["vectorizer"]
    # isinstance also covers str subclasses, which would otherwise be
    # iterated character by character as if each character were a sentence.
    if isinstance(sents, str):
        sents = [sents]
    sents = preprocess(sents, lang)
    x = vectorizer.transform(sents)
    pred = model.predict(x)
    print(list(zip(sents,pred)))

In [None]:
# Train one classifier per language key in `data`; each call prints its score.
models = {lang: train_and_evaluate_nb(data, lang) for lang in data}

In [None]:
# Two sentences that differ only by the word "not".
ex_sents = ["will watch it again","will not watch it again"]
# Count vectors of the raw sentences under the English vocabulary.
# NOTE(review): these are not passed through preprocess(), unlike in predict() — confirm this is intended.
print(models["en"]["vectorizer"].transform(ex_sents))
predict(models,"en",ex_sents)

# Activity 3: Fill out the following table
- change boolean arguments in `preprocess()`

|                          	| Ar 	| En 	| Es 	| Ru 	| Zh 	| NEW_LANG 	|
|--------------------------	|----	|:--:	|:--:	|----	|----	|----------	|
| Baseline                 	|    	|    	|    	|    	|    	|          	|
| All                      	|    	|    	|    	|    	|    	|          	|
| All - segmentation       	|    	|    	|    	|    	|    	|          	|
| All - stemmer            	|    	|    	|    	|    	|    	|          	|
| All - stopword_filtering 	|    	|    	|    	|    	|    	|          	|

# Activity 4: Explain your own observations

- observation 1:
- observation 2: