In [28]:
import os, json
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from collections import Counter
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

'''
all constants
'''

# Absolute path of the `all_data` directory that sits next to the current
# working directory (i.e. <cwd>/../all_data, normalised).
data_dir_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'all_data'))

# File name of the pre-processed JSON dump, looked up inside `data_dir_path`.
input_fname = "processed_data.json"

# How many of the most frequent words are kept per language.
k = 10

# Distinct labels encountered while loading the data; filled in as a side
# effect of build_data_frame.
category_labels = []

In [2]:
'''
Step 1: Load the processed data
Creates the DataFrame along with the Category labels
'''
def build_data_frame(input_fname, data_dir_path):
    """Load the processed JSON file into a DataFrame.

    The file is expected to hold a JSON list of records, each providing the
    keys ``"id"``, ``"content"`` and ``"label"``.

    Side effect: every label not seen before is appended, in order of first
    appearance, to the module-level ``category_labels`` list.

    :param input_fname: name of the JSON file inside ``data_dir_path``
    :param data_dir_path: directory that contains the file
    :return: DataFrame with columns ``content`` and ``label``, indexed by the
             records' ``id`` values
    """
    json_path = os.path.abspath(os.path.join(data_dir_path, input_fname))
    with open(json_path, 'rb') as handle:
        records = json.load(handle)

    ids = []
    payload = []
    for record in records:
        label = record["label"]
        # keep track of the unique labels of the dataset
        if label not in category_labels:
            category_labels.append(label)
        payload.append({'content': record["content"], 'label': label})
        ids.append(record["id"])

    return pd.DataFrame(payload, index=ids)

def load_data():
    """Build the DataFrame from the configured input file.

    Convenience wrapper around build_data_frame that uses the module-level
    ``input_fname`` and ``data_dir_path`` settings.

    :return: whatever build_data_frame returns for the configured file
    """
    return build_data_frame(input_fname, data_dir_path)

In [7]:
'''
Step 2: Extract features for training
'''
def get_ngram_word_feature_extractor(min_ngram=1, max_ngram=1,
                                     max_features=None,
                                     vocabulary=None,
                                     binary=True):
    """Create a word-level n-gram CountVectorizer.

    :param min_ngram: lower bound of the n-gram range
    :param max_ngram: upper bound of the n-gram range
    :param max_features: forwarded to CountVectorizer as ``max_features``
    :param vocabulary: forwarded to CountVectorizer as ``vocabulary``
    :param binary: forwarded to CountVectorizer as ``binary``
    :return: the configured (not yet fitted) CountVectorizer
    """
    return CountVectorizer(
        analyzer='word',
        ngram_range=(min_ngram, max_ngram),
        max_features=max_features,
        vocabulary=vocabulary,
        binary=binary,
    )

def create_corpus_for_languages(data_frame):
    """Group the texts of a DataFrame by their label.

    :param data_frame: DataFrame with a ``label`` and a ``content`` column
    :return: dict of the form <label, [content_1, content_2, .. content_n]>,
             with the contents in row order
    """
    corpus_by_label = {}
    for _, record in data_frame.iterrows():
        # setdefault creates the list the first time a label is seen
        corpus_by_label.setdefault(record["label"], []).append(record["content"])
    return corpus_by_label

def find_topK_words_per_langauge(language_data, k=k):
    """Return the k highest-scoring words of one language's corpus.

    The corpus is vectorised with get_ngram_word_feature_extractor() using its
    defaults (word unigrams, ``binary=True``). Because of ``binary=True`` each
    text contributes at most 1 per word, so a word's score is the number of
    texts that contain it rather than its total number of occurrences.

    :param language_data: list of texts belonging to one language
    :param k: number of words to keep (defaults to the module-level ``k``)
    :return: list of at most k ``(word, score)`` tuples, highest score first;
             words with equal scores keep the vectorizer's feature order
    """
    vec = get_ngram_word_feature_extractor()
    X = vec.fit_transform(language_data)

    # scikit-learn renamed get_feature_names() to get_feature_names_out()
    # (old name deprecated in 1.0, removed in 1.2); support both so the
    # function works regardless of the installed version.
    if hasattr(vec, 'get_feature_names_out'):
        feature_names = vec.get_feature_names_out()
    else:
        feature_names = vec.get_feature_names()

    # Column sums of the document-term matrix, flattened to one score per word.
    scores = np.asarray(X.sum(axis=0)).ravel()
    counts = zip(feature_names, scores)
    most_frequent = sorted(counts, key=lambda tup: tup[1], reverse=True)[:k]
    return most_frequent

def train_data(data_frame):
    """Compute the most frequent words of every language in the training data.

    :param data_frame: training DataFrame, passed to create_corpus_for_languages
    :return: dict mapping each language to the result of
             find_topK_words_per_langauge for that language's texts
    """
    corpus_by_language = create_corpus_for_languages(data_frame)
    return {
        language: find_topK_words_per_langauge(texts)
        for language, texts in corpus_by_language.items()
    }

In [38]:
'''
Step 3: Test the data
'''
#returns the predicted language of the tweet
def test_single_tweet(tweet, lang_mostFreqWord_map):
    string_label = []
    for word in tweet.split():
        for symbol in lang_mostFreqWord_map:
            for i in lang_mostFreqWord_map[symbol]:
                if i[0] == word:
                    string_label.append(symbol)
    if len(string_label) == 0:
        string_label.append('en')
    c = Counter(string_label)
    v=list(c.values())
    k=list(c.keys())
    return k[v.index(max(v))]

def test_data(data, lang_mostFreqWord_map):
    predicted = []
    for index, row in data.iterrows():
        predicted_class = test_single_tweet(row['content'], lang_mostFreqWord_map)
        predicted.append(predicted_class)
    return predicted

In [39]:
def calculate_metrics(true_y, predicted_y):
    """Print and return the accuracy of the predictions.

    :param true_y: the true labels
    :param predicted_y: the predicted labels, in the same order as ``true_y``
    :return: the value computed by ``accuracy_score(true_y, predicted_y)``;
             it is returned in addition to being printed so that callers can
             reuse it instead of parsing the output
    """
    accuracy = accuracy_score(true_y, predicted_y)
    print(accuracy)
    return accuracy

In [41]:
def do_baseline(test_size=0.2, random_state=42):
    """Run the most-frequent-words baseline end to end and print its accuracy.

    Loads the data, splits it into a training and a test part, collects the
    most frequent words per language from the training part and evaluates the
    word-vote predictions on the test part.

    :param test_size: share of the data held out for testing, forwarded to
                      train_test_split
    :param random_state: seed forwarded to train_test_split; fixed by default
                         so the split, and therefore the reported accuracy, is
                         the same on every run (pass None for a random split)
    """
    data_frame = load_data()

    # create train & test data from the original data
    train, test = train_test_split(data_frame, test_size=test_size,
                                   random_state=random_state)
    lang_mostFreqWord_map = train_data(train)
    predicted = test_data(test, lang_mostFreqWord_map)
    calculate_metrics(test['label'].values, predicted)

do_baseline()

0.539420249265
