# 0. Importing Basic Libraries

In [None]:
import numpy as np
import pandas as pd

In [None]:
import csv
import re
import jieba.analyse
import random
from bs4 import BeautifulSoup
from collections import defaultdict

# 1. Preprocessing
## 1.1 Loading Training Data (For Input Vectors)

In [3]:
training_df = pd.read_csv('offsite-test-material/offsite-tagging-training-set.csv', encoding='utf8')
training_df.index = training_df['id']
training_df.head()

In [4]:
def remove_html(text):
    soup = BeautifulSoup(text, 'html5lib') #remove HTML tokens
    text_only = soup.get_text() 
    text_normal_whitespace = re.sub("\s\s+", " ", text_only)
    return text_only

In [5]:
training_df['text_clean'] = training_df.apply(lambda _: remove_html(_['text']), axis=1)
training_df['clean_length'] = training_df.apply(lambda _: len(_['text_clean']), axis=1)
training_df.head()

Unnamed: 0_level_0,id,tags,text,text_clean,clean_length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3443,3443,足球,利物浦重賽擊敗乙組仔　英足盃過關 英格蘭足總盃第三圈今晨重賽，貴為英超勁旅的利物浦上場被乙組...,利物浦重賽擊敗乙組仔　英足盃過關 英格蘭足總盃第三圈今晨重賽，貴為英超勁旅的利物浦上場被乙組...,369
76056,76056,足球,【中超】恒大「暴力戰」絕殺國安　楊智反重力插水惹爭議（有片） 中超首輪賽事重頭戲，廣州恒大主...,【中超】恒大「暴力戰」絕殺國安　楊智反重力插水惹爭議（有片） 中超首輪賽事重頭戲，廣州恒大主...,637
93405,93405,足球,【歐霸決賽】阿積士控球率起腳佔優　隊長卡拉臣輸波不服氣 阿積士以歐洲主要決賽最年輕、平均22...,【歐霸決賽】阿積士控球率起腳佔優　隊長卡拉臣輸波不服氣 阿積士以歐洲主要決賽最年輕、平均22...,959
26767,26767,足球,【歐國盃】韋莫斯澄清更衣室未內訌　盼以團結力量挫愛爾蘭 今晚3場直播\r\r\nE組｜比利時...,【歐國盃】韋莫斯澄清更衣室未內訌　盼以團結力量挫愛爾蘭 今晚3場直播\n\nE組｜比利時Vs...,782
20843,20843,梁振英,王維基參選　點解？ 王維基在宣布有意出選的記者會上，打出ABC，Anyone But CY的...,王維基參選　點解？ 王維基在宣布有意出選的記者會上，打出ABC，Anyone But CY的...,1281


In [102]:
def process_group(row):
    return pd.Series(dict(char_cnt=row['clean_length'].sum(), record_cnt=row.clean_length.count()))
labels_df = pd.DataFrame(training_df.groupby(['tags']).apply(process_group))
labels_df['label_id'] = pd.Categorical(labels_df.index).codes
label_dict = {a: b.label_id for a, b in labels_df.iterrows()}
label_id_dict = {b.label_id: a for a, b in labels_df.iterrows()}
labels = list(label_dict.keys())
labels_df.head()


Unnamed: 0_level_0,char_cnt,record_cnt,label_id
tags,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
梁振英,887021,929,0
美國大選,993030,842,1
足球,1701491,2123,2


## 1.2 Loading Data (For Frequency Analysis)

In [7]:
text_dict = defaultdict(list)
with open('offsite-test-material/offsite-tagging-training-set.csv', 'r', encoding='utf8') as f:
    file_reader = csv.reader(f, delimiter=',', quotechar='"')
    next(file_reader)
    for row in file_reader:
        text_dict[row[1]].append(remove_html(row[2]))

fulltext_dict = {k: '\n'.join([_ for _ in v]) for k, v in text_dict.items()}

In [8]:
print('Found the following categories:\n{}'.format('\n'.join(['{}: {} fragments with {} characters'
                                                               .format(k, len(text_dict[k]), len(fulltext_dict[k]))
                                                               for k in text_dict.keys() ])))

Found the following categories:
足球: 2123 fragments with 1703613 characters
梁振英: 929 fragments with 887949 characters
美國大選: 842 fragments with 993871 characters


This means that we have twice the frequency of articles related to soccer than to either the outgoing CE or US elections. This is a bit tricky in terms of maximum TF-IDF

## 1.2 Selecting most relevant tokens
I am building a TD-IDF-esque model, for which I will select the most 'relevant' tokens as features. Relevance here is defined as the highest ratio of frequency in the relevant 'term' over the frequency in the overall 'document'. A 'term' here is the union of all segments that belong to a single categroy. The document is the union of all segments

In [9]:
#dictionary for occurrence of short tokens in each classified doc
labeldicts_short = {_: (defaultdict(float), 0) for _ in labels} 
#dictionary for occurrence of long tokens in each classified doc
labeldicts_long = {_: (defaultdict(float), 0) for _ in labels} 
# dictionary for occurrence of short tokens in the whole document
docdict_short = defaultdict(float), 0 
# dictionary for occurrence of long tokens in the whole document
docdict_long = defaultdict(float), 0


In [10]:
# Counting token frequency
training_clean = list()
for label, combined_text in fulltext_dict.items():
    short_tokens = jieba.cut(combined_text, cut_all=True)
    for token in short_tokens:
        labeldicts_short[label][0][token] += 1
        docdict_short[0][token] += 1
        
    long_tokens = jieba.cut(combined_text, cut_all=False)
    for token in long_tokens:
        labeldicts_long[label][0][token] += 1
        docdict_long[0][token] += 1
        
        


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.998 seconds.
Prefix dict has been built succesfully.


In [11]:
# Calculating term/document length
for label in labels:
    labeldicts_long[label] = labeldicts_long[label][0], sum(labeldicts_long[label][0].values())
    labeldicts_short[label] = labeldicts_short[label][0], sum(labeldicts_short[label][0].values())
    
docdict_long = docdict_long[0], sum(docdict_long[0].values())
docdict_short = docdict_short[0], sum(docdict_short[0].values())


In [12]:
# helper function that returns the highest TFIDF of a token. 
# highly relevant tokens will have maximum TDIDFs of 2-3,  
# they exclusively occur in fragments of one class, but the size of the term length differs
# irrelevant tokens will have uniform TFIDFs of 1 (they occur everywhere with the same frequencyy)
def relative_frequency(token, classdicts, docdict, docdict_total=None):
    occurrences = [(classdict[0][token], classdict[1]) for classdict in classdicts if token in classdict[0]]
    if occurrences:
        max_occurence, term_length = max(occurrences, key=lambda _: _[0]/_[1])
        total_occurrence, doc_length  = docdict[0][token], docdict[1]
        tf = (max_occurence/term_length)
        df = (total_occurrence/doc_length)
        return (tf/df, max_occurence, total_occurrence)
    else:
        print(token)
        return 0, 0, docdict[0].get(token, 0)

relative_frequency('重賽', labeldicts_long.values(), docdict_long)

(2.0976038838647515, 29.0, 29.0)

In [13]:
relevance_cutoff = 1.8
occurrent_cutoff = 50

short_classdicts = labeldicts_short.values()
long_classdicts = labeldicts_long.values()
maxfreq_short = {key: relative_frequency(key, short_classdicts, docdict_short) for key in docdict_short[0].keys()}
maxfreq_long = {key: relative_frequency(key, long_classdicts, docdict_long) for key in docdict_long[0].keys()}

In [14]:
relevant_tokens_short_list = sorted([key for key, value in maxfreq_short.items() 
                                     if (value[0] > relevance_cutoff and
                                         value[2] > occurrent_cutoff and
                                         key.isalpha())])

relevant_tokens_long_list = sorted([key for key, value in maxfreq_long.items() 
                                    if (value[0] > relevance_cutoff and
                                        value[2] > occurrent_cutoff and
                                        key.isalpha())])

## 1.3 Creating Training Data

In [23]:
def sentence_to_vector(sentence, tokenlist, cut_all=False):
    a = defaultdict(int)
    tokens = jieba.cut(sentence, cut_all=cut_all)
    for token in tokens:
        a[token] += 1
    out_dict = {_: a.get(_, 0) for _ in tokenlist}
    return pd.Series(out_dict)

occ_input_long = pd.DataFrame(training_df.text_clean.apply(
    lambda _: sentence_to_vector(_, relevant_tokens_long_list)))

occ_input_short = pd.DataFrame(training_df.text_clean.apply(
    lambda _: sentence_to_vector(_, relevant_tokens_short_list, cut_all=True)))

data_target = pd.DataFrame(training_df.merge(labels_df, how='inner', left_on='tags', right_index=True)['label_id'])    

In [85]:
test_share = 0.2
indices = list(occ_input_long.index)
test_indices = random.sample(indices, int(len(indices)*test_share))
train_indices = [_ for _ in indices if _ not in test_indices]
training_data_long = occ_input_long.loc[train_indices]
training_data_short = occ_input_short.loc[train_indices]
holdout_data_long = occ_input_long.loc[test_indices]
holdout_data_short = occ_input_short.loc[test_indices]
training_target = data_target.loc[train_indices]
holdout_target = data_target.loc[test_indices]


# 2. Training Models
## 2.1 Standard RandomForest Classifier

In [86]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [87]:
RFC = RandomForestClassifier()
RFC.fit(np.asarray(training_data_short), np.asarray(training_target).ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [88]:
accuracy_score(RFC.predict(np.asarray(holdout_data_short)), holdout_target)

0.98971722365038561

## 2.2 Gradient Boosted Classifier (Standard SkLearn)

In [89]:
from sklearn.ensemble import GradientBoostingClassifier  #GBM algorithm


In [90]:
gbc = GradientBoostingClassifier()
gbc.fit(np.asarray(training_data_short), np.asarray(training_target).ravel())

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [91]:
accuracy_score(gbc.predict(np.asarray(holdout_data_short)), holdout_target)

0.98971722365038561

In [46]:
test=pd.DataFrame([[0,0,0,1,1],[0,0,0,1,2]])

In [63]:
from sklearn import svm
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

In [103]:
svc = svm.SVC(kernel=cosine_similarity)
svc.fit(training_data_long, training_target)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto',
  kernel=<function cosine_similarity at 0x12f1779d8>, max_iter=-1,
  probability=False, random_state=None, shrinking=True, tol=0.001,
  verbose=False)

In [105]:
results.loc[25131].label_id

1

In [107]:
prediction = svc.predict(np.asarray(holdout_data_long))
acs = accuracy_score(prediction, (holdout_target))
results = holdout_target.copy()
results['prediction'] = prediction
misclassified_ids = list(results[results['prediction']!=results['label_id']].index)
for misclassified_id in misclassified_ids:
    print('The following text was classified as {}, whereas it should be {}. \n{}'
          .format(label_id_dict[results.loc[misclassified_id].prediction],
                  label_id_dict[results.loc[misclassified_id].label_id],
                  training_df.loc[misclassified_id].text_clean))


The following text was classified as 梁振英, whereas it should be 美國大選. 
【早晨精讀】施嘉莉襲港　港姐面試鬥「靚」　端午有活雞　 施嘉莉現身油麻地拍《攻殼》荷李活女星施嘉莉祖安遜（Scarlett Johansson） 昨日忽然現身香港拍攝新片《攻殼機動隊》（Ghost in the Shell），《香港01》更拍得獨家照片。其後施嘉莉在接近30高溫度穿厚重大褸拍攝 施嘉莉現身油麻地拍《攻殼》 領展檢討外判制 端午恢復活雞供應
The following text was classified as 足球, whereas it should be 梁振英. 
趙世光靈堂白玫瑰布置　何琍琍說丈夫遺願：香港成為世界航運中心 船王趙從衍之子趙世光於香港殯儀館設靈，現場有過百花牌，禮堂以白玫瑰布置，外面有十多名穿黑色西裝的工作人員，有時見到記者拍攝會上前出手阻止。



下午約三時半，趙世光遺孀何琍琍一臉憔悴現身殯儀館，何琍琍表示現在心情已經好好多，多謝大家關心， 船王趙從衍之子趙世光於香港殯儀館設靈，現場有過百花牌，禮堂以白玫瑰布置，外面有十多名穿黑色西裝的工作人員，有時見到記者拍攝會上前出手阻止。



下午約三時半，趙世光遺孀何琍琍一臉憔悴現身殯儀館，何琍琍表示現在心情已經好好多，多謝大家關心，又透露喪禮會以天主教儀式進行，問到趙先生可有遺願？她說：「希望香港成為世界航運中心。」果然不愧是船王之子。
The following text was classified as 足球, whereas it should be 美國大選. 
《紐時》華裔編輯街頭受辱　潑婦大罵︰滾回中國！ 羅麥可的公開信是寫給那個出言不遜的婦人，他在文中寫道︰「我們當時剛剛離開教堂，我和我的家庭以及一些朋友正在曼克頓的上東區。那時我們去吃午餐，嘗試看看街尾一間韓國餐廳有沒有空位。你正在趕路，那時正在下雨，我們的嬰兒車以及一群嘈吵的亞洲人擋住你 羅麥可的公開信是寫給那個出言不遜的婦人，他在文中寫道︰「我們當時剛剛離開教堂，我和我的家庭以及一些朋友正在曼克頓的上東區。那時我們去吃午餐，嘗試看看街尾一間韓國餐廳有沒有空位。你正在趕路，那時正在下雨，我們的嬰兒車以及一群嘈吵的亞洲人擋住你的去路。但坦白說，當你從不遠處向我們大喝︰『滾

In [99]:
list(results[results['prediction']!=results['label_id']].index)

[25131, 7577, 2079, 46523]

In [70]:
for input, prediction, label in zip(holdout_data_long, svc.predict(np.asarray(holdout_data_long)), holdout_target):
    if prediction != label:
        print(input, 'has been classified as ', prediction, 'and should be ', label) 

ABC has been classified as  2 and should be  label_id


In [81]:
[holdout_target['label_id']==holdout_target['prediction']]

[id
 6000     True
 26719    True
 19573    True
 53668    True
 6406     True
 12543    True
 62329    True
 66744    True
 9917     True
 20527    True
 68923    True
 77436    True
 16598    True
 19781    True
 91115    True
 88354    True
 68969    True
 44290    True
 37767    True
 2326     True
 5080     True
 79183    True
 5420     True
 53153    True
 11613    True
 51738    True
 63512    True
 43580    True
 18526    True
 59150    True
          ... 
 67307    True
 50617    True
 60829    True
 66309    True
 86261    True
 3008     True
 79780    True
 61992    True
 33554    True
 49605    True
 24839    True
 18328    True
 11198    True
 21817    True
 68011    True
 46754    True
 56989    True
 90441    True
 3324     True
 19436    True
 9794     True
 50244    True
 73708    True
 15497    True
 51957    True
 80969    True
 30346    True
 73633    True
 6590     True
 48924    True
 Length: 778, dtype: bool]