# 0. Importing Basic Libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
import csv
import jieba.analyse
from bs4 import BeautifulSoup
from collections import defaultdict

# 1. Preprocessing
## 1.1 Load Training Data

In [3]:
with open('offsite-test-material/offsite-tagging-training-set.csv', 'r', encoding='utf8') as f:
    file_reader = csv.reader(f, delimiter=',', quotechar='"')
    next(file_reader)
    training_rows = [(int(_[0]),_[1], _[2]) for _ in file_reader] #id, class, text

In [4]:
labels = list({_[1] for _ in training_rows}) #unique list of all applied labels
labeldicts_short = {_: (defaultdict(float), 0) for _ in labels} #dictionary for occurrence of short tokens in each classified doc
labeldicts_long = {_: (defaultdict(float), 0) for _ in labels} #dictionary for occurrence of long tokens in each classified doc
docdict_short = defaultdict(float), 0 # dictionary for occurrence of short tokens in the whole document
docdict_long = defaultdict(float), 0# dictionary for occurrence of long tokens in the whole document


## 1.2 Selecting most relevant tokens
I am building a TD-IDF-esque model, for which I will select the most 'relevant' tokens as features. Relevance here is defined as the highest ratio of frequency in the relevant 'term' over the frequency in the overall 'document'. A 'term' here is the union of all segments that belong to a single categroy. The document is the union of all segments

In [5]:
# Counting token frequency
training_clean = list()
for training_row in training_rows:
    soup = BeautifulSoup(training_row[2], 'html5lib') #remove HTML tokens
    text_only = soup.get_text() 
    short_tokens = jieba.cut(text_only, cut_all=True)
    for token in short_tokens:
        labeldicts_short[training_row[1]][0][token] += 1
        docdict_short[0][token] += 1
        
    long_tokens = jieba.cut(text_only, cut_all=False)
    for token in long_tokens:
        labeldicts_long[training_row[1]][0][token] += 1
        docdict_long[0][token] += 1
        
    training_clean.append((text_only, training_row[1]))
        


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.052 seconds.
Prefix dict has been built succesfully.


In [6]:
# Calculating term/document length
for label in labels:
    labeldicts_long[label] = labeldicts_long[label][0], sum(labeldicts_long[label][0].values())
    labeldicts_short[label] = labeldicts_short[label][0], sum(labeldicts_short[label][0].values())
    
docdict_long = docdict_long[0], sum(docdict_long[0].values())
docdict_short = docdict_short[0], sum(docdict_short[0].values())


In [7]:
# helper function that returns the highest TFIDF of a token. 
# highly relevant tokens will have maximum TDIDFs of 2-3,  
# they exclusively occur in fragments of one class, but the size of the term length differs
# irrelevant tokens will have uniform TFIDFs of 1 (they occur everywhere with the same frequencyy)
def relative_frequency(token, classdicts, docdict, docdict_total=None):
    occurrences = [(classdict[0][token], classdict[1]) for classdict in classdicts if token in classdict[0]]
    if occurrences:
        max_occurence, term_length = max(occurrences, key=lambda _: _[0]/_[1])
        total_occurrence, doc_length  = docdict[0][token], docdict[1]
        tf = (max_occurence/term_length)
        df = (total_occurrence/doc_length)
        return (tf/df, max_occurence, total_occurrence)
    else:
        return 0, 0, docdict.get(token, 0)

relative_frequency('重賽', labeldicts_long.values(), docdict_long)

(2.0981545991471573, 29.0, 29.0)

In [8]:
relevance_cutoff = 1.8
occurrent_cutoff = 50

short_classdicts = labeldicts_short.values()
long_classdicts = labeldicts_long.values()
maxfreq_short = {key: relative_frequency(key, short_classdicts, docdict_short) for key in docdict_short[0].keys()}
maxfreq_long = {key: relative_frequency(key, long_classdicts, docdict_long) for key in docdict_long[0].keys()}

In [9]:
relevant_tokens_short_set = {key for key, value in maxfreq_short.items() if (value[0] > relevance_cutoff and
                                                                             value[2] > occurrent_cutoff)}

relevant_tokens_long_set = {key for key, value in maxfreq_long.items() if (value[0] > relevance_cutoff and
                                                                           value[2] > occurrent_cutoff)}

relevant_tokens_short_indices = {token: index for index, token in enumerate(relevant_tokens_short_set)}
short_vector_dimension = len(relevant_tokens_short_indices)

relevant_tokens_long_indices = {token: index for index, token in enumerate(relevant_tokens_long_set)}
long_vector_dimension = len(relevant_tokens_long_indices)
print('Token one-hot vector space dimension: {} for short tokens, {} for long tokens'
      .format(short_vector_dimension, long_vector_dimension))

label_indices = {key: index for index, key in enumerate(labels)}

Token one-hot vector space dimension: 1467 for short tokens, 2091 for long tokens


In [10]:
def tokens_to_vector(tokens, vector_dict, vector_dimesion):
    vector = np.zeros(vector_dimesion)
    for token in tokens:
        if token in vector_dict:
            vector[vector_dict[token]] += 1
    return vector

def sentence_to_vector(sentence, vector_dict, vector_dimesion, cut_all=False):
    tokens = jieba.cut(sentence, cut_all=cut_all)
    return tokens_to_vector(tokens, vector_dict, vector_dimesion)

def label_to_onehot(label, label_indices, label_count):
    vector = np.zeros(label_count)
    if label in label_indices:
        vector[label_indices[label]] = 1
    return vector


# 2. Creating Training Data

In [13]:
training_df = pd.read_csv('offsite-test-material/offsite-tagging-training-set.csv', encoding='utf8')
training_df.index = training_df['id']
training_df.head()

Unnamed: 0_level_0,id,tags,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3443,3443,足球,利物浦重賽擊敗乙組仔　英足盃過關 英格蘭足總盃第三圈今晨重賽，貴為英超勁旅的利物浦上場被乙組...
76056,76056,足球,【中超】恒大「暴力戰」絕殺國安　楊智反重力插水惹爭議（有片） 中超首輪賽事重頭戲，廣州恒大主...
93405,93405,足球,【歐霸決賽】阿積士控球率起腳佔優　隊長卡拉臣輸波不服氣 阿積士以歐洲主要決賽最年輕、平均22...
26767,26767,足球,【歐國盃】韋莫斯澄清更衣室未內訌　盼以團結力量挫愛爾蘭 今晚3場直播\r\r\nE組｜比利時...
20843,20843,梁振英,王維基參選　點解？ 王維基在宣布有意出選的記者會上，打出ABC，Anyone But CY的...


In [16]:
def row_to_vec(row, vector_dict, vector_dimension):
    return sentence_to_vector(sentence=row['text'], vector_dict=vector_dict, vector_dimesion=vector_dimension)

In [19]:
training_num_df = training_df.apply(lambda _: row_to_vec(_, relevant_tokens_long_indices, long_vector_dimension), axis=1)
training_num_df.head()

ValueError: Shape of passed values is (3894, 2091), indices imply (3894, 3)

In [None]:
training_data = []
training_labels = []
for training_row in training_clean:
    training_vector = sentence_to_vector(training_row[0], relevant_tokens_long_indices, long_vector_dimension)
    training_data.append(training_vector)
    #training_labels.append(label_to_onehot(training_row[1], label_indices, len(label_indices)))
    training_labels.append(label_indices[training_row[1]])
    if sum(training_vector) == 0:
        print(training_row)

In [None]:
np.array(training_data[:2])

In [None]:
from sklearn.ensemble import GradientBoostingClassifier  #GBM algorithm
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search


In [None]:
gbc = GradientBoostingClassifier()
gbc.fit(training_data[10:], training_labels[10:])

In [None]:
gbc.predict(training_data[:10])

In [None]:
np.array(training_labels[:10])

In [None]:
training_data = np.array(training_data)
import random
def random_train_test_split(df, test_share):
    test_indices = random.sample(list(df.index), int(len(df)*test_share))
    train_indices = [_ for _ in df.index if _ not in test_indices]
    return df.loc[train_indices], df.loc[test_indices]
train_set, test_set = random_train_test_split(pd.DataFrame(training_data), .2)

In [None]:
gbc = GradientBoostingClassifier()
gbc.fit(training_data[10:], training_labels[10:])


In [None]:
list(np.concatenate(training_data, np.array(training_labels)))