In [1]:
import json, os, csv
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import KFold
from sklearn.metrics import f1_score, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split

# ---------------------------------------------------------------------------
# All constants used by the notebook
# ---------------------------------------------------------------------------

# Input locations: <current working directory>/../all_data holds both files.
data_dir_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'all_data'))
input_fname = "processed_data.json"
category_fname = "uniformly_sampled.tsv"

# Category labels known up front. create_category_labels() appends any label
# it meets in the category file that is not listed here.
category_labels = [
    'am', 'ar',
    'bg', 'bn', 'bo', 'bs',
    'ca', 'ckb', 'cs', 'cy',
    'da', 'de', 'dv',
    'el', 'en', 'es', 'et', 'eu',
    'fa', 'fi', 'fr',
    'gu',
    'he', 'hi', 'hi-Latn', 'hr', 'ht', 'hu', 'hy',
    'id', 'is', 'it',
    'ja',
    'ka', 'km', 'kn', 'ko',
    'lo', 'lt', 'lv',
    'ml', 'mr', 'ms', 'my',
    'ne', 'nl', 'no',
    'pa', 'pl', 'ps', 'pt',
    'ro', 'ru',
    'sd', 'si', 'sk', 'sl', 'sr', 'sv',
    'ta', 'te', 'th', 'tl', 'tr',
    'ug', 'uk', 'ur',
    'vi',
    'zh-CN', 'zh-TW',
]

# n-gram settings: the vectorizers extract n-grams of every length in
# [min_ngram_value, max_ngram_value].
min_ngram_value = 2
max_ngram_value = 6
# Store only the top k most frequent n-grams (reserved for future use; only
# the disabled sketch at the end of the notebook mentions it).
k = 5

# Evaluation settings: number of folds used by the k-fold test.
n_folds = 6

In [2]:
'''
Step 1: Load the processed data
Creates the DataFrame along with the Category labels
'''
#create category labels from the category_fname file.
#Returns the id-to-category mapping for the items stored in the input_fname file (processed_data.json)
def create_category_labels(category_fname, data_dir_path):
    """Build the id -> category mapping from the tab-separated category file.

    Every usable row is read as ``<category label> TAB <item id>``. Labels
    that are not yet in the notebook-level ``category_labels`` list are
    appended to it (the scoring functions pass that list as ``labels=``), and
    a short summary of the additions is printed.

    Parameters
    ----------
    category_fname : str
        Name of the TSV file inside ``data_dir_path``.
    data_dir_path : str
        Directory that contains the TSV file.

    Returns
    -------
    dict
        Maps the second column of each row (the id) to the first column (the
        category label). If an id occurs more than once, the last row wins.
    """
    import io
    import sys

    category_fname_path = os.path.abspath(os.path.join(data_dir_path, category_fname))

    # csv.reader needs byte strings on Python 2 but text on Python 3, where a
    # file opened with 'rb' fails with "iterator should return strings".
    if sys.version_info[0] >= 3:
        # assumes the TSV is UTF-8 encoded -- TODO confirm against the data set
        tsvfile = io.open(category_fname_path, 'r', encoding='utf-8', newline='')
    else:
        tsvfile = open(category_fname_path, 'rb')

    extra_cat_added = []
    #create the id-to-category mapping
    category_id_map = {}
    # Set mirror of category_labels so the per-row membership test is O(1).
    known_labels = set(category_labels)
    with tsvfile:
        for row in csv.reader(tsvfile, delimiter='\t'):
            # Blank or truncated lines have no (label, id) pair to record.
            if len(row) < 2:
                continue
            label, item_id = row[0], row[1]
            if label not in known_labels:
                known_labels.add(label)
                category_labels.append(label)
                extra_cat_added.append(label)
            category_id_map[item_id] = label

    #print category statistics
    print("These additional categories were found in the dataset:")
    print(extra_cat_added)
    print("Total number of categories now is:",len(category_labels))

    return category_id_map
    
    
def build_data_frame(input_fname, data_dir_path, category_id_map):
    """Load the JSON input file into a two-column DataFrame.

    The JSON document is iterated item by item; each item contributes one row
    holding its ``"content"`` value and the category that ``category_id_map``
    gives for its ``"id"`` value. An id missing from the map raises
    ``KeyError``.

    Parameters
    ----------
    input_fname : str
        Name of the JSON file inside ``data_dir_path``.
    data_dir_path : str
        Directory that contains the JSON file.
    category_id_map : dict
        Mapping from item id to category label.

    Returns
    -------
    pandas.DataFrame
        Columns ``content`` and ``class``, indexed 0..n-1 in file order.
    """
    json_path = os.path.abspath(os.path.join(data_dir_path, input_fname))
    with open(json_path, 'rb') as handle:
        records = json.load(handle)
        labelled_rows = [
            {'content': record["content"], 'class': category_id_map[record["id"]]}
            for record in records
        ]

    # Explicit positional index, one entry per row.
    return pd.DataFrame(labelled_rows, index=list(range(len(labelled_rows))))

def load_data():
    """Load the labelled data set using the notebook-level file constants.

    Reads the id -> category mapping from ``category_fname`` first, then
    builds the content/class DataFrame from ``input_fname``; both files are
    looked up in ``data_dir_path``.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``build_data_frame``.
    """
    id_to_category = create_category_labels(category_fname, data_dir_path)
    return build_data_frame(input_fname, data_dir_path, id_to_category)

In [3]:
'''
Step 2: Extract features
'''
def extract_features(feature_extractor, data):
    """Fit ``feature_extractor`` on ``data`` and return the transformed result.

    Parameters
    ----------
    feature_extractor : object
        Anything exposing ``fit_transform(data)``, e.g. one of the
        vectorizers built by the helpers in this notebook.
    data : iterable
        The documents handed to ``fit_transform`` unchanged.

    Returns
    -------
    Whatever ``feature_extractor.fit_transform(data)`` returns.
    """
    return feature_extractor.fit_transform(data)

#extract character n-grams from the data
def get_ngram_character_feature_extractor():
    """Return a new, unfitted CountVectorizer that counts character n-grams.

    The n-gram lengths span ``min_ngram_value`` to ``max_ngram_value``
    (notebook-level constants), both inclusive.
    """
    ngram_bounds = (min_ngram_value, max_ngram_value)
    return CountVectorizer(analyzer='char', ngram_range=ngram_bounds)

def get_ngram_word_feature_extractor():
    """Return a new, unfitted CountVectorizer that counts word n-grams.

    The n-gram lengths span ``min_ngram_value`` to ``max_ngram_value``
    (notebook-level constants), both inclusive.
    """
    ngram_bounds = (min_ngram_value, max_ngram_value)
    return CountVectorizer(analyzer='word', ngram_range=ngram_bounds)

#TODO: extract top-k character n-grams from data

#TODO: extract most frequent words from each language -- BASELINE
    

In [4]:
'''
Step 3: Classifier
Set up the classification algorithm 
'''
def get_bayes_classifier():
    """Return a new MultinomialNB classifier built with default arguments."""
    classifier = MultinomialNB()
    return classifier

def get_logistic_regression_classifier():
    """Return a new LogisticRegression classifier built with default arguments."""
    classifier = LogisticRegression()
    return classifier

In [5]:
'''
Step 4: Training
Run the classification algorithm  
'''
def train_data(pipeline, data):
    """Fit ``pipeline`` on every row of ``data`` and hand the pipeline back.

    Parameters
    ----------
    pipeline : object
        Anything exposing ``fit(x, y)``; it is fitted in place.
    data : pandas.DataFrame
        Must provide a ``content`` column (inputs) and a ``class`` column
        (targets).

    Returns
    -------
    The same ``pipeline`` object that was passed in.
    """
    # Inputs first, then targets, both as plain arrays.
    documents, labels = data['content'].values, data['class'].values
    pipeline.fit(documents, labels)
    return pipeline

# Smoke test: train the Naive Bayes pipeline on the full data set and classify
# one hand-written document.
print("using NBC classifier")
data = load_data()
test_x = ['hey test document']
#set up the pipeline
pipeline = Pipeline([
        ('vectorizer',  get_ngram_character_feature_extractor()),
        ('classifier',  get_bayes_classifier()) ])
pipeline = train_data(pipeline, data)
# test_x / test_y stay defined at notebook level after this cell runs.
test_y = pipeline.predict(test_x)
print("the predicted class is:",test_y[0])

# Disabled experiment: the same smoke test with logistic regression. It is
# kept as comments (not as a string literal) so the cell does not echo the
# code as its output value.
#
# print("using Logistic regression classifier")
# #set up the pipeline
# pipeline2 = Pipeline([
#         ('vectorizer',  get_ngram_character_feature_extractor()),
#         ('classifier',  get_logistic_regression_classifier()) ])
# pipeline2 = train_data(pipeline2, data)
# test_y = pipeline2.predict(test_x)
# print("the predicted class is:",test_y[0])

using NBC classifier
These additional categories were found in the dataset:
['ar_LATN', 'az', 'dv_LATN', 'gl', 'ha', 'ja_LATN', 'jv', 'ko_LATN', 'la', 'mk', 'ml_LATN', 'mn', 'mn_LATN', 'ps_LATN', 'sq', 'su', 'sw', 'ta_LATN', 'tn', 'und', 'ur_LATN', 'wo', 'xh', 'yo', 'zu']
('Total number of categories now is:', 95)
('the predicted class is:', 'en')


'\nprint("using Logistic regression classifier")\n#set up the pipeline\npipeline2 = Pipeline([\n        (\'vectorizer\',  get_ngram_character_feature_extractor()),\n        (\'classifier\',  get_logistic_regression_classifier()) ])\npipeline2 = train_data(pipeline2, data)\ntest_y = pipeline2.predict(test_x)\nprint("the predicted class is:",test_y[0])\n'

In [6]:
'''
Step 5: Testing
'''
#old school testing
def test_data(pipeline, data):
    x = data['content'].values
    y = data['class'].values
    train_x, test_x, train_y, test_x = train_test_split(x, y, test_size=0.33, random_state=42)
    pipeline.fit(train_x, train_y)
    predictions = pipeline.predict(test_x)
    acc_score = accuracy_score(test_y, predictions)
    score = f1_score(test_y, predictions, labels=category_labels, average='micro')
    print('Total tweets classified:', len(data))
    print('Accuracy Score:', acc_score)
    print('F1 Score:', score)
    
#k-fold testing
def do_k_fold_testing(pipeline, data):
    """Cross-validate ``pipeline`` on ``data`` and print the averaged scores.

    The rows are divided into ``n_folds`` folds (notebook-level constant).
    For each fold the pipeline is re-fitted on the training rows and scored
    on the test rows with accuracy and micro-averaged F1 over
    ``category_labels``. The per-fold scores are averaged and printed.

    Parameters
    ----------
    pipeline : object
        Anything exposing ``fit(x, y)`` and ``predict(x)``; re-fitted once
        per fold.
    data : pandas.DataFrame
        Must provide a ``content`` column (inputs) and a ``class`` column
        (targets).

    Returns
    -------
    None
    """
    fold_accuracies = []
    fold_f1s = []
    for train_idx, test_idx in KFold(n=len(data), n_folds=n_folds):
        train_part = data.iloc[train_idx]
        test_part = data.iloc[test_idx]
        fit_docs, fit_labels = train_part['content'].values, train_part['class'].values
        eval_docs, eval_labels = test_part['content'].values, test_part['class'].values

        pipeline.fit(fit_docs, fit_labels)
        predicted = pipeline.predict(eval_docs)

        fold_accuracies.append(accuracy_score(eval_labels, predicted))
        fold_f1s.append(
            f1_score(eval_labels, predicted, labels=category_labels, average='micro'))

    # Report the plain mean of the per-fold scores.
    mean_accuracy = sum(fold_accuracies)/len(fold_accuracies)
    mean_f1 = sum(fold_f1s)/len(fold_f1s)
    print('Total tweets classified:', len(data))
    print('Accuracy Score:', mean_accuracy)
    print('F1 Score:', mean_f1)

In [8]:
'''
Put it all together!
'''
# Naive Bayes run: reload the data set, then score a character n-gram +
# MultinomialNB pipeline with k-fold cross-validation.
data = load_data()
pipeline = Pipeline(steps=[
    ('vectorizer', get_ngram_character_feature_extractor()),
    ('classifier', get_bayes_classifier()),
])
do_k_fold_testing(pipeline, data)

These additional categories were found in the dataset:
[]
('Total number of categories now is:', 95)
('Total tweets classified:', 71444)
('Accuracy Score:', 0.62946597279214356)
('F1 Score:', 0.62946597279214356)


In [None]:
# Logistic regression run: reload the data set, then score a character n-gram +
# LogisticRegression pipeline on a single hold-out split.
data = load_data()
pipeline2 = Pipeline(steps=[
    ('vectorizer', get_ngram_character_feature_extractor()),
    ('classifier', get_logistic_regression_classifier()),
])
test_data(pipeline2, data)

In [None]:
#TODO
# NOTE(review): the wildcard import is kept so that no name disappears from
# the notebook namespace; prefer importing the specific nltk.collocations
# names once this cell actually uses them.
from nltk.collocations import *
from nltk.probability import FreqDist
import nltk

# Planned step: get the most frequent top k n-grams only!
#
# The sketch below is disabled. It is written as comments (not as a string
# literal) so the cell does not echo the code as its output value. It relies
# on `vectorizer` and `corpus`, which are not defined in the cells shown here.
#
# get frequency distribution of n-grams in the corpus
# analyzer = vectorizer.build_analyzer()
# listNgramQuery = analyzer(corpus)
# NgramQueryWeights = nltk.FreqDist(listNgramQuery)
# print(NgramQueryWeights.most_common(k))