In [None]:
import json, os, csv
import pandas as pd
import numpy as np
import nltk, itertools
from nltk.probability import FreqDist
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import KFold
from sklearn.metrics import f1_score, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split

'''
all constants
'''

# Input files are looked up in an "all_data" directory that sits next to the
# current working directory (resolved once, when this cell runs).
data_dir_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'all_data'))
# JSON file read by build_data_frame (items with "id" and "content" keys)
input_fname = "processed_data.json"
# Tab-separated file read by create_category_labels (label, id per row)
category_fname = "uniformly_sampled.tsv"

#category labels according to the website
'''
category_labels = ['am', 'ar', 'bg', 'bn', 'bo', 'bs', 'ca', 'ckb', 
                   'cs', 'cy', 'da', 'de', 'dv', 'el', 'en', 'es', 'et', 'eu', 'fa', 'fi', 'fr', 
                   'gu', 'he', 'hi', 'hi-Latn', 'hr', 'ht', 'hu', 'hy', 'id', 'is', 'it', 'ja', 
                   'ka', 'km', 'kn', 'ko', 'lo', 'lt', 'lv', 'ml', 'mr', 'ms', 'my', 'ne', 
                   'nl', 'no', 'pa', 'pl', 'ps', 'pt', 'ro', 'ru', 'sd', 'si', 'sk', 'sl', 
                   'sr', 'sv', 'ta', 'te', 'th', 'tl', 'tr', 'ug', 'uk', 'ur', 'vi', 'zh-CN', 'zh-TW']
'''
# Starts empty; create_category_labels appends every label it finds in the
# category file (in place). Also passed as `labels` to f1_score.
category_labels = []
#constants related to n-grams
# Inclusive lower/upper n-gram lengths, used as ngram_range for CountVectorizer
min_ngram_value = 2
max_ngram_value = 6
# NOTE(review): k is not read by any code visible in this notebook
k = 5                  #store only top k most frequent n grams for future

#constants for testing
# Number of folds used by do_k_fold_testing
n_folds = 6

In [None]:
'''
Step 1: Load the processed data
Creates the DataFrame along with the Category labels
'''
#create category labels from the category_fname file.
#Returns the id-to-category mapping used to label the items loaded from the input_fname JSON file
def create_category_labels(category_fname, data_dir_path):
    """Read the category TSV file and build the id-to-category mapping.

    Every row is expected to hold the category label in its first column and
    the item id in its second column. Labels that are not yet present in the
    module-level ``category_labels`` list are appended to it in place, so
    calling this function repeatedly does not create duplicates.

    Args:
        category_fname: name of the tab-separated category file.
        data_dir_path: directory that contains ``category_fname``.

    Returns:
        dict mapping item id (second column) to category label (first column).
        If an id occurs more than once, the last row wins.
    """
    category_fname_path = os.path.abspath(os.path.join(data_dir_path, category_fname))
    id_category_map = {}
    # Set mirror of category_labels so the membership test is O(1) per row
    known_labels = set(category_labels)

    # csv.reader needs a text-mode file opened with newline=''; a binary
    # handle yields bytes and fails on Python 3.
    # assumes the file is UTF-8 encoded -- TODO confirm
    with open(category_fname_path, 'r', newline='', encoding='utf-8') as tsvfile:
        for row in csv.reader(tsvfile, delimiter='\t'):
            if len(row) < 2:
                # blank or truncated line: nothing to map
                continue
            label, item_id = row[0], row[1]
            if label not in known_labels:
                known_labels.add(label)
                category_labels.append(label)
            id_category_map[item_id] = label

    #print category statistics
    print("Total number of categories now is:", len(category_labels))
    return id_category_map

#returns a dict to store the corpus for every language <lang_code, entire_text_corpus>
#used for BASELINE only
def create_corpus_for_languages(data_frame):
    """Concatenate all documents of each language into one corpus string.

    Args:
        data_frame: DataFrame with a "class" column (language code) and a
            "content" column (document text).

    Returns:
        dict mapping each language code to the texts of its rows, joined with
        a single space in row order. Keys appear in order of first occurrence.
        An empty frame yields an empty dict.
    """
    if len(data_frame) == 0:
        return {}

    # Collect the pieces per language and join once at the end; growing the
    # corpus with `corpus + " " + text` re-copies the whole string on every row.
    texts_by_language = {}
    for language, text in zip(data_frame["class"].values, data_frame["content"].values):
        texts_by_language.setdefault(language, []).append(text)

    return {language: " ".join(texts) for language, texts in texts_by_language.items()}
        
    
def build_data_frame(input_fname, data_dir_path, id_category_map):
    """Load the processed JSON file into a labelled DataFrame.

    Args:
        input_fname: name of the JSON file; it must contain a list of objects
            that each have an "id" and a "content" key.
        data_dir_path: directory that contains ``input_fname``.
        id_category_map: dict mapping item id to category label.

    Returns:
        DataFrame with the columns "content" and "class", one row per item,
        indexed 0..n-1. The columns are present even when the file is empty.

    Raises:
        KeyError: if an item's id is missing from ``id_category_map``.
    """
    input_fname_path = os.path.abspath(os.path.join(data_dir_path, input_fname))
    with open(input_fname_path, 'rb') as data_file:
        data = json.load(data_file)

    rows = [{'content': item["content"], 'class': id_category_map[item["id"]]}
            for item in data]
    # Naming the columns keeps them available for an empty input; the default
    # index already numbers the rows 0..n-1.
    return pd.DataFrame(rows, columns=['content', 'class'])

def load_data():
    """Load the labelled data set described by the module-level constants.

    Reads the category file first (which also fills ``category_labels``) and
    then builds the DataFrame from the processed JSON file.

    Returns:
        The DataFrame produced by ``build_data_frame``.
    """
    return build_data_frame(
        input_fname,
        data_dir_path,
        create_category_labels(category_fname, data_dir_path),
    )

In [None]:
'''
Step 2: Extract features
'''
def extract_features(feature_extractor, data):
    """Fit ``feature_extractor`` on ``data`` and return the transformed data.

    Args:
        feature_extractor: object exposing ``fit_transform``.
        data: the documents handed to ``fit_transform``.

    Returns:
        Whatever ``feature_extractor.fit_transform(data)`` returns.
    """
    return feature_extractor.fit_transform(data)

#extract character n-grams from the data
def get_ngram_character_feature_extractor():
    """Create a CountVectorizer over character n-grams.

    The n-gram lengths span ``min_ngram_value`` to ``max_ngram_value``.
    """
    ngram_bounds = (min_ngram_value, max_ngram_value)
    return CountVectorizer(analyzer='char', ngram_range=ngram_bounds)

def get_ngram_word_feature_extractor(min_ngram=min_ngram_value, max_ngram=max_ngram_value, 
                                     max_features=None, vocabulary=None):
    """Create a CountVectorizer over word n-grams.

    Args:
        min_ngram: smallest n-gram length (defaults to ``min_ngram_value``).
        max_ngram: largest n-gram length (defaults to ``max_ngram_value``).
        max_features: forwarded to CountVectorizer unchanged.
        vocabulary: forwarded to CountVectorizer unchanged.

    Returns:
        An unfitted CountVectorizer using the 'word' analyzer.
    """
    vectorizer_options = {
        'analyzer': 'word',
        'ngram_range': (min_ngram, max_ngram),
        'max_features': max_features,
        'vocabulary': vocabulary,
    }
    return CountVectorizer(**vectorizer_options)

#TODO: extract top-k character n-grams from data


In [None]:
'''
Step 3: Baseline
'''
#BASELINE 1: classify the language if it contains the most frequent word for a language
#If it contains words from multiple languages, 
#then use a tie breaking mechanism to classify it into that language for which 
#it has the highest frequency of the most freq word
def baseline_1(data_frame, language_corpus_map):
    """Baseline: label a document with the language whose marker word it uses most.

    One marker word is chosen per language: its most frequent word that no
    earlier language has already claimed. Every document is then assigned the
    language whose marker word occurs most often in it, and accuracy and
    micro-averaged F1 are printed.

    Args:
        data_frame: DataFrame with "content" (text) and "class" (true label).
        language_corpus_map: dict mapping language code to its full corpus,
            as returned by ``create_corpus_for_languages``.

    Raises:
        ValueError: if no marker word can be extracted from any corpus.
    """
    # languages[i] is the language represented by feature_map[i]; keeping the
    # two lists aligned lets a column index be decoded back to its language.
    languages = []
    feature_map = []
    for language, corpus in language_corpus_map.items():
        # Among the top len(feature_map) + 1 words at least one is unclaimed
        # (if the corpus has that many distinct words), so two languages never
        # share a marker -- CountVectorizer rejects duplicate vocabulary terms.
        candidates = get_most_freq_words(corpus, len(feature_map) + 1)
        marker = next((word for word in candidates if word not in feature_map), None)
        if marker is None:
            # no usable word in this corpus: the language cannot be predicted
            continue
        languages.append(language)
        feature_map.append(marker)

    if not feature_map:
        raise ValueError("no marker words could be extracted from language_corpus_map")
    print(feature_map)

    #fit documents into the new feature map
    count_vectorizer = get_ngram_word_feature_extractor(1, 1, None, feature_map)
    x = count_vectorizer.fit_transform(data_frame["content"].values)
    y = data_frame["class"].values
    # Column i of x counts feature_map[i]. argmax picks the first maximum, so
    # ties (including documents containing no marker word at all) fall to the
    # language that comes first in `languages`.
    predicted_y = [languages[column] for column in x.toarray().argmax(axis=1)]

    #calculate the accuracies
    acc_score = accuracy_score(y, predicted_y)
    score = f1_score(y, predicted_y, labels=category_labels, average='micro')
    print('Total tweets classified:', len(data_frame))
    print('Accuracy Score:', acc_score)
    print('F1 Score:', score)

#returns the top k words from the corpus
def get_most_freq_words(corpus, k):
    """Return the ``k`` most frequent words of ``corpus``, most frequent first.

    The corpus is split into words with the analyzer of a unigram word
    CountVectorizer; the frequencies are counted with nltk's FreqDist.
    """
    tokenize = get_ngram_word_feature_extractor(1, 1).build_analyzer()
    word_frequencies = FreqDist(tokenize(corpus))
    return [word for word, _count in word_frequencies.most_common(k)]

#get the k-th most common word
#used when 2 different languages have the same most frequent word

#returns, for each row of x, the index of the column holding the largest value
def predict_class(x):
    """Return the index of the largest entry along axis 1 of ``x``."""
    best_columns = np.argmax(x, axis=1)
    return best_columns

In [None]:
'''
Step 4: Classifier
Set up the classification algorithm 
'''
def get_bayes_classifier():
    """Create a new, unfitted MultinomialNB classifier with default settings."""
    classifier = MultinomialNB()
    return classifier

def get_logistic_regression_classifier():
    """Create a new, unfitted LogisticRegression classifier with default settings."""
    classifier = LogisticRegression()
    return classifier

def get_baseline_classifier(data):
    """Placeholder for a baseline classifier; not implemented yet.

    Args:
        data: currently unused.

    Returns:
        None. Python has no ``null``; the previous ``return null`` raised
        NameError as soon as the function was called.
    """
    # TODO: build the classifier by iterating `for instance in data:`
    return None
        

In [None]:
'''
Step 5: Training
Run the classification algorithm  
'''
def train_data(pipeline, data):
    """Fit ``pipeline`` on the whole data set and hand it back.

    Args:
        pipeline: object exposing ``fit(x, y)``.
        data: DataFrame with a "content" column (inputs) and a "class"
            column (targets).

    Returns:
        The same ``pipeline`` object, after fitting.
    """
    documents, labels = data['content'].values, data['class'].values
    pipeline.fit(documents, labels)
    return pipeline

# Smoke test: train the naive Bayes pipeline on the full data set and
# classify one hand-written document.
print("using NBC classifier")
data = load_data()
# single made-up document to classify after training
test_x = ['hey test document']
#set up the pipeline
# character n-gram counts feed the multinomial naive Bayes classifier
pipeline = Pipeline([
        ('vectorizer',  get_ngram_character_feature_extractor()),
        ('classifier',  get_bayes_classifier()) ])
pipeline = train_data(pipeline, data)
# note: test_y becomes a module-level name that stays defined for later cells
test_y = pipeline.predict(test_x)
print("the predicted class is:",test_y[0])

In [None]:
'''
Step 6: Testing
'''
#old school testing
def test_data(pipeline, data):
    """Evaluate ``pipeline`` on a single held-out split of ``data``.

    33% of the rows are held out (fixed ``random_state=42``), the pipeline is
    fitted on the remainder, and accuracy and micro-averaged F1 on the
    held-out rows are printed.

    Args:
        pipeline: object exposing ``fit(x, y)`` and ``predict(x)``.
        data: DataFrame with a "content" column (inputs) and a "class"
            column (targets).
    """
    x = data['content'].values
    y = data['class'].values
    # train_test_split returns (train_x, test_x, train_y, test_y) in that
    # order; unpacking the last item into test_x again would replace the test
    # documents with the test labels and leave test_y undefined.
    train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.33, random_state=42)
    pipeline.fit(train_x, train_y)
    predictions = pipeline.predict(test_x)
    acc_score = accuracy_score(test_y, predictions)
    score = f1_score(test_y, predictions, labels=category_labels, average='micro')
    # only the held-out split is classified, so report its size
    print('Total tweets classified:', len(test_y))
    print('Accuracy Score:', acc_score)
    print('F1 Score:', score)
    
#k-fold testing
def do_k_fold_testing(pipeline, data):
    """Cross-validate ``pipeline`` on ``data`` with ``n_folds`` folds.

    For every fold the pipeline is re-fitted on the training rows and scored
    on the test rows; the mean accuracy and the mean micro-averaged F1 over
    all folds are printed.

    Args:
        pipeline: object exposing ``fit(x, y)`` and ``predict(x)``.
        data: DataFrame with a "content" column (inputs) and a "class"
            column (targets).
    """
    documents = data['content'].values
    labels = data['class'].values
    accuracies, f1_values = [], []

    # KFold is called with its (n, n_folds) signature and iterated directly
    for train_idx, test_idx in KFold(n=len(data), n_folds=n_folds):
        pipeline.fit(documents[train_idx], labels[train_idx])

        fold_labels = labels[test_idx]
        fold_predictions = pipeline.predict(documents[test_idx])

        accuracies.append(accuracy_score(fold_labels, fold_predictions))
        f1_values.append(
            f1_score(fold_labels, fold_predictions, labels=category_labels, average='micro'))

    #print statistics
    print('Total tweets classified:', len(data))
    print('Accuracy Score:', sum(accuracies)/len(accuracies))
    print('F1 Score:', sum(f1_values)/len(f1_values))

In [None]:
'''
Put it all together!
'''
# for bayes classifier
# Reload the data and cross-validate character n-grams + multinomial naive Bayes.
data = load_data()
pipeline = Pipeline([
    ('vectorizer',  get_ngram_character_feature_extractor()),
    ('classifier',  get_bayes_classifier()) ])
# prints the mean accuracy and F1 over n_folds folds
do_k_fold_testing(pipeline, data)

In [None]:
#for logistic regression
# Reload the data and evaluate character n-grams + logistic regression on a
# single train/test split (see test_data).
data = load_data()
pipeline2 = Pipeline([
    ('vectorizer',  get_ngram_character_feature_extractor()),
    ('classifier',  get_logistic_regression_classifier()) ])
test_data(pipeline2, data)

In [None]:
# Baseline run: build one corpus per language, then score the
# most-frequent-word baseline on the full data set.
data_frame = load_data()
language_corpus_map = create_corpus_for_languages(data_frame)
baseline_1(data_frame, language_corpus_map)