In [None]:
import json, os, csv
import pandas as pd
import numpy as np
import nltk, itertools
from nltk.probability import FreqDist
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import KFold
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split

#imports for plotting confusion matrix
import matplotlib.pyplot as plt
import seaborn as sn

'''
all constants
'''

# Absolute path of the `all_data` directory that sits next to the current working directory.
data_dir_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'all_data'))
# JSON file (looked up inside data_dir_path) that build_data_frame reads.
input_fname = "processed_data.json"
# NOTE(review): not referenced by any code visible in this notebook -- confirm whether it is still needed.
category_fname = "uniformly_sampled.tsv"
# File name passed to plt.savefig by plot_confusion_matrix.
confusion_matrix_fname = 'confusion_matrix.png'
#category labels according to the website
'''
category_labels = ['am', 'ar', 'bg', 'bn', 'bo', 'bs', 'ca', 'ckb', 
                   'cs', 'cy', 'da', 'de', 'dv', 'el', 'en', 'es', 'et', 'eu', 'fa', 'fi', 'fr', 
                   'gu', 'he', 'hi', 'hi-Latn', 'hr', 'ht', 'hu', 'hy', 'id', 'is', 'it', 'ja', 
                   'ka', 'km', 'kn', 'ko', 'lo', 'lt', 'lv', 'ml', 'mr', 'ms', 'my', 'ne', 
                   'nl', 'no', 'pa', 'pl', 'ps', 'pt', 'ro', 'ru', 'sd', 'si', 'sk', 'sl', 
                   'sr', 'sv', 'ta', 'te', 'th', 'tl', 'tr', 'ug', 'uk', 'ur', 'vi', 'zh-CN', 'zh-TW']
'''
# Starts empty; build_data_frame appends every distinct label it meets, in order of first appearance.
category_labels = []

# Constants related to character/word n-grams. Best results so far: 3-4.
min_ngram_value = 3
max_ngram_value = 4
# NOTE(review): `k` is not referenced by the visible code (get_topk_ngram_char_feature_extractor
# has its own `k` parameter defaulting to None) -- confirm whether it should be passed in.
k = 1000                  # store only the top k most frequent n-grams

# Constants for testing: default number of folds used by do_k_fold_testing.
n_folds = 6

In [None]:
'''
Step 1: Load the processed data
Creates the DataFrame along with the Category labels
'''
def build_data_frame(input_fname, data_dir_path):
    """Read the processed JSON file and return it as a DataFrame.

    The file is expected to hold a sequence of objects, each with the keys
    "id", "content" and "label". The returned frame has the columns
    'content' and 'label' and is indexed by the "id" values.

    Side effect: every label not yet present in the module-level
    ``category_labels`` list is appended to it, in order of first appearance.
    """
    json_path = os.path.abspath(os.path.join(data_dir_path, input_fname))
    with open(json_path,'rb') as handle:
        records = json.load(handle)

    ids = []
    entries = []
    for record in records:
        label = record["label"]
        # keep the global label list in sync with what the data actually contains
        if label not in category_labels:
            category_labels.append(label)
        entries.append({'content': record["content"], 'label': label})
        ids.append(record["id"])

    return pd.DataFrame(entries, index=ids)

def load_data():
    """Load the configured input file (input_fname inside data_dir_path) as a DataFrame."""
    return build_data_frame(input_fname, data_dir_path)

In [None]:
'''
Step 2: Extract features
'''
def extract_features(feature_extractor, data):
    """Fit ``feature_extractor`` on ``data`` and return the result of its fit_transform call."""
    return feature_extractor.fit_transform(data)

#extract character n-grams from the data
def get_ngram_character_feature_extractor(min_ngram=min_ngram_value, max_ngram=max_ngram_value):
    """Return a CountVectorizer that counts character n-grams.

    n-gram lengths range from ``min_ngram`` to ``max_ngram`` (passed as
    ``ngram_range``). The chosen bounds are printed before the vectorizer
    is created.
    """
    print("min:",min_ngram,",max:",max_ngram)
    ngram_bounds = (min_ngram, max_ngram)
    return CountVectorizer(analyzer='char', ngram_range=ngram_bounds)

def get_ngram_word_feature_extractor(min_ngram=min_ngram_value, max_ngram=max_ngram_value, 
                                     max_features=None, vocabulary=None):
    """Return a CountVectorizer that counts word n-grams.

    ``min_ngram``/``max_ngram`` form the ``ngram_range``; ``max_features``
    and ``vocabulary`` are forwarded unchanged to CountVectorizer.
    """
    vectorizer_options = {
        'ngram_range': (min_ngram, max_ngram),
        'analyzer': 'word',
        'max_features': max_features,
        'vocabulary': vocabulary,
    }
    return CountVectorizer(**vectorizer_options)

#TODO: extract top-k character n-grams from data
def get_topk_ngram_char_feature_extractor(min_ngram=min_ngram_value, max_ngram=max_ngram_value, 
                                     k=None, vocabulary=None):
    """Return a character n-gram CountVectorizer limited to ``k`` features.

    ``k`` is passed as ``max_features`` and ``vocabulary`` is forwarded
    unchanged. The n-gram bounds are printed before the vectorizer is created.
    """
    print(min_ngram,",",max_ngram)
    ngram_bounds = (min_ngram, max_ngram)
    return CountVectorizer(analyzer='char', ngram_range=ngram_bounds,
                           max_features=k, vocabulary=vocabulary)

In [None]:
'''
Step 4: Classifier
Set up the classification algorithm 
'''
def get_bayes_classifier():
    """Return a new MultinomialNB classifier built with its default arguments."""
    classifier = MultinomialNB()
    return classifier

def get_logistic_regression_classifier():
    """Return a new LogisticRegression classifier built with its default arguments."""
    classifier = LogisticRegression()
    return classifier

In [None]:
'''
Step 5: Training
Run the classification algorithm  
'''
def train_data(pipeline, data):
    """Fit ``pipeline`` on the whole data set and return it.

    ``data`` must provide the columns 'content' (inputs) and 'label'
    (targets); their ``.values`` are handed to ``pipeline.fit``.
    """
    texts, labels = data['content'].values, data['label'].values
    pipeline.fit(texts, labels)
    return pipeline

In [None]:
'''
Step 6: Testing
'''
#old school testing
def test_data(pipeline, data):
    """Evaluate ``pipeline`` with a single hold-out split and print the scores.

    ``data`` must provide the columns 'content' and 'label'. 33% of the rows
    are held out (fixed ``random_state=42``), the pipeline is fitted on the
    remainder, and accuracy plus micro-averaged F1 (over ``category_labels``)
    are printed for the held-out rows. The first printed line reports
    ``len(data)``, i.e. the size of the whole data set.
    """
    x = data['content'].values
    y = data['label'].values
    # train_test_split returns (x_train, x_test, y_train, y_test) in that order;
    # the last target must be test_y, otherwise test_x is clobbered with labels
    # and test_y is never defined.
    train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.33, random_state=42)
    pipeline.fit(train_x, train_y)
    predictions = pipeline.predict(test_x)
    acc_score = accuracy_score(test_y, predictions)
    score = f1_score(test_y, predictions, labels=category_labels, average='micro')
    print('Total tweets classified:', len(data))
    print('Accuracy Score:', acc_score)
    print('F1 Score:', score)
    
#k-fold testing
def do_k_fold_testing(pipeline, data, n_folds=n_folds, plot_confusion=False):
    """Evaluate ``pipeline`` with k-fold cross-validation and print mean scores.

    For each fold the pipeline is re-fitted on the training rows and scored
    on the test rows. Accuracy and micro-averaged F1 (over
    ``category_labels``) are averaged across folds and printed. When
    ``plot_confusion`` is true, the per-fold confusion matrices are summed
    and handed to ``plot_confusion_matrix``.
    """
    folds = KFold(n=len(data), n_folds=n_folds)
    accuracies, f1_values = [], []
    if plot_confusion:
        n_labels = len(category_labels)
        confusion = np.zeros((n_labels, n_labels))

    for train_idx, test_idx in folds:
        train_rows = data.iloc[train_idx]
        test_rows = data.iloc[test_idx]
        expected = test_rows['label'].values

        pipeline.fit(train_rows['content'].values, train_rows['label'].values)
        predicted = pipeline.predict(test_rows['content'].values)

        accuracies.append(accuracy_score(expected, predicted))
        f1_values.append(f1_score(expected, predicted, labels=category_labels, average='micro'))

        if plot_confusion:
            # accumulate counts so the final matrix covers every fold
            confusion += confusion_matrix(expected, predicted,labels=category_labels)

    # summary statistics averaged over all folds
    print('Total tweets classified:', len(data))
    print('Accuracy Score:', sum(accuracies)/len(accuracies))
    print('F1 Score:', sum(f1_values)/len(f1_values))

    if plot_confusion:
        plot_confusion_matrix(confusion, classes=category_labels)

In [None]:
'''
Helper section
'''
#Plotting the confusion matrix
def plot_confusion_matrix(cm, classes):
    """Draw ``cm`` as an annotated heatmap and save it to confusion_matrix_fname.

    ``cm`` is wrapped in a DataFrame whose rows and columns are both labelled
    with ``classes``. The figure is square, with a side of half an inch per
    class (rounded down).
    """
    df_cm = pd.DataFrame(cm, index=classes, columns=classes)
    # int(len(classes)/2) is 0 for fewer than two classes and matplotlib
    # rejects a zero-sized figure, so never go below 1 inch.
    cm_size = max(1, int(len(classes)/2))
    plt.figure(figsize=(cm_size, cm_size))
    sn.heatmap(df_cm, annot=True)
    plt.savefig(confusion_matrix_fname)

In [None]:
'''
Put it all together!
'''
# for bayes classifier
def test_bayes_classifier():
    """Load the data and run k-fold testing on a char n-gram + naive Bayes pipeline."""
    tweets = load_data()
    steps = [
        ('vectorizer', get_ngram_character_feature_extractor()),
        ('classifier', get_bayes_classifier()),
    ]
    bayes_pipeline = Pipeline(steps)
    do_k_fold_testing(bayes_pipeline, tweets)

#for logistic regression
def test_lr_classifier():
    """Load the data and run the hold-out test on a char n-gram + logistic regression pipeline."""
    data = load_data()
    pipeline = Pipeline([
        ('vectorizer',  get_ngram_character_feature_extractor()),
        ('classifier',  get_logistic_regression_classifier()) ])
    # pass the pipeline built above; the previous name `pipeline2` was never defined
    test_data(pipeline, data)

In [None]:
# Run both experiments: k-fold testing of the naive Bayes pipeline,
# then the hold-out test of the logistic regression pipeline.
test_bayes_classifier()
test_lr_classifier()