In [None]:
#create category labels from the category_fname file. 
#Returns the id-to-category mapping (column 2 -> column 1) for the ids used in processed_tweets.json
#used for BASELINE only
def create_category_labels(category_fname, data_dir_path):
    """Read the tab-separated category file and build an id -> category map.

    Each row is read as ``category <TAB> id``: column 0 is the category and
    column 1 is the id. Any category not yet present in the module-level
    ``category_labels`` list is appended to it, so calling this function
    mutates that list as a side effect.

    Args:
        category_fname: Name of the category file inside ``data_dir_path``.
        data_dir_path: Directory that contains the category file.

    Returns:
        dict mapping each id (column 1) to its category (column 0). When an
        id occurs more than once, the last row wins.

    NOTE(review): the file is opened in binary mode, which is what the
    Python 2 ``csv`` module expects; under Python 3 ``csv.reader`` needs a
    text-mode file -- confirm which interpreter this notebook targets.
    """
    tsv_path = os.path.abspath(os.path.join(data_dir_path, category_fname))

    # Categories discovered by this call only; kept for ad-hoc debugging.
    newly_seen = []
    id_category_map = {}

    with open(tsv_path, 'rb') as handle:
        for record in csv.reader(handle, delimiter='\t'):
            category = record[0]
            if category not in category_labels:
                category_labels.append(category)
                newly_seen.append(category)
            id_category_map[record[1]] = category

    print("Total number of categories now is:", len(category_labels))
    return id_category_map

#returns a dict to store the corpus for every language <lang_code, entire_text_corpus>
#used for BASELINE only
def create_corpus_for_languages(data_frame):
    """Concatenate all documents of each language into one corpus string.

    Args:
        data_frame: pandas DataFrame with a ``"label"`` column (language
            code) and a ``"content"`` column (document text, must be str).

    Returns:
        dict mapping each label to the space-joined text of all its rows.
        Keys appear in order of first occurrence in ``data_frame`` and the
        texts are joined in row order.
    """
    # Collect the pieces per label and join once at the end; re-concatenating
    # the growing corpus string for every row is quadratic in corpus size.
    parts_by_label = {}
    for label, content in zip(data_frame["label"], data_frame["content"]):
        parts_by_label.setdefault(label, []).append(content)

    return {label: " ".join(parts) for label, parts in parts_by_label.items()}

In [None]:
'''
Step 3: Baseline
'''
#BASELINE 1: classify the language if it contains the most frequent word for a language
#If it contains words from multiple languages, 
#then use a tie breaking mechanism to classify it into that language for which 
#it has the highest frequency of the most freq word
def _kth_or_last_word(words, k):
    """Return the k-th (1-based) word of *words*, or the last one if it is shorter."""
    if not words:
        raise ValueError("cannot pick a feature word from an empty corpus")
    return words[k - 1] if len(words) >= k else words[-1]


def baseline_1(data_frame, language_corpus_map, category_labels):
    """Classify each document by the signature word it contains most often.

    Every language gets one signature word: its most frequent word. When two
    languages would share a word, both fall back to their 2nd, 3rd, ...
    most frequent word until they differ. A document is then assigned to the
    language whose signature word has the highest count in it. Accuracy and
    micro-averaged F1 are printed; nothing is returned.

    Args:
        data_frame: DataFrame with ``"content"`` (text) and ``"label"``
            (true language) columns.
        language_corpus_map: dict mapping language label -> full corpus text.
        category_labels: label list handed to ``f1_score`` as ``labels``.

    Raises:
        ValueError: if a corpus yields no words, or if two languages cannot
            be told apart because both ran out of distinct words.
    """
    # Feature i belongs to the i-th key of language_corpus_map, so indices
    # are translated back through this list rather than through
    # category_labels, whose order is not guaranteed to match.
    languages = list(language_corpus_map)

    # One plain string per language. Keeping the entries as strings (not
    # one-element lists) means no flattening step is needed, and a fallback
    # word can never be split into single characters by such a step.
    feature_map = []
    for key in languages:
        new_feature = _kth_or_last_word(
            get_topk_freq_words(language_corpus_map[key], 1), 1)
        k = 2
        while new_feature in feature_map:
            prev_lang_index = feature_map.index(new_feature)
            prev_lang = languages[prev_lang_index]
            print(new_feature," already in feature map for k= ",k, " key=",key," old_lang=",prev_lang)

            features_prev_lang = get_topk_freq_words(language_corpus_map[prev_lang], k)
            features_new_lang = get_topk_freq_words(language_corpus_map[key], k)
            if len(features_prev_lang) < k and len(features_new_lang) < k:
                # Neither side has a k-th word left: retrying would loop forever.
                raise ValueError(
                    "languages %r and %r have no distinguishing frequent word"
                    % (prev_lang, key))

            # Both languages step down to their k-th most frequent word.
            # NOTE(review): the replacement chosen for prev_lang is not
            # re-checked against the other entries already in feature_map.
            feature_map[prev_lang_index] = _kth_or_last_word(features_prev_lang, k)
            new_feature = _kth_or_last_word(features_new_lang, k)

            k = k+1
        feature_map.append(new_feature)

    print(type(feature_map))
    print(feature_map)

    # Fit the documents onto the signature words.
    # NOTE(review): presumably the helper returns a vectorizer restricted to
    # feature_map as a fixed vocabulary, so column i counts feature_map[i] --
    # confirm against get_ngram_word_feature_extractor.
    count_vectorizer = get_ngram_word_feature_extractor(1, 1, None, feature_map)
    x = count_vectorizer.fit_transform(data_frame["content"].values)
    y = data_frame["label"].values
    predicted_y = [languages[predict_class(row)] for row in x]

    # Report the scores.
    acc_score = accuracy_score(y, predicted_y)
    score = f1_score(y, predicted_y, labels=category_labels, average='micro')
    print('Total tweets classified:', len(data_frame))
    print('Accuracy Score:', acc_score)
    print('F1 Score:', score)

#returns the top k words from the corpus
def get_topk_freq_words(corpus, k):
    """Return up to ``k`` most frequent tokens of ``corpus``, most common first.

    The corpus is tokenized with the analyzer of the extractor built by
    ``get_ngram_word_feature_extractor(1, 1)`` and counted with ``FreqDist``.

    Args:
        corpus: Text to analyze.
        k: Maximum number of tokens to return.

    Returns:
        list of tokens, at most ``k`` long.
    """
    tokenize = get_ngram_word_feature_extractor(1, 1).build_analyzer()
    frequencies = FreqDist(tokenize(corpus))
    return [token for token, _count in frequencies.most_common(k)]

#get the k-th most common word
#used when 2 different languages have the same most frequent word
def get_kth_freq_word(corpus, k):
    """Return the ``k``-th most frequent token of ``corpus`` (1-based).

    If the corpus has fewer than ``k`` distinct tokens, the least frequent
    token is returned instead.

    Args:
        corpus: Text to analyze.
        k: 1-based rank of the wanted token.

    Returns:
        The selected token itself, never a ``(token, count)`` pair.

    Raises:
        IndexError: if no token could be ranked (e.g. the corpus is empty).
    """
    count_vectorizer = get_ngram_word_feature_extractor(1,1)
    analyzer = count_vectorizer.build_analyzer()
    listNgramQuery = analyzer(corpus)
    fdist = FreqDist(listNgramQuery)
    top_k = fdist.most_common(k)
    print("in kth freq word")
    if not top_k:
        raise IndexError("corpus has no words to rank")
    # most_common yields (token, count) pairs; both branches must unwrap the
    # token so callers always receive the same type.
    word, _count = top_k[k - 1] if len(top_k) == k else top_k[-1]
    return word

#returns, for each row of x, the index of the column holding the largest value
def predict_class(x):
    """Return the column index of the maximum along axis 1 of ``x``.

    Args:
        x: Two-dimensional array-like of per-class scores.

    Returns:
        Whatever ``np.argmax(x, axis=1)`` yields for ``x``: one index per row.
    """
    winning_columns = np.argmax(x, axis=1)
    return winning_columns