# 0. Importing Basic Libraries

In [None]:
import numpy as np
import pandas as pd

In [None]:
import csv
import re
import jieba.analyse
import random
from bs4 import BeautifulSoup
from collections import defaultdict

# 1. Preprocessing
## 1.1 Loading Training Data (For Input Vectors)

In [None]:
# Read the labelled training set and index it by article id (the `id` column
# is kept as a regular column as well) so rows can be fetched with `.loc[id]`.
training_df = pd.read_csv(
    'offsite-test-material/offsite-tagging-training-set.csv',
    encoding='utf8',
).set_index('id', drop=False)
training_df.head()

In [None]:
def remove_html(text):
    """Strip HTML markup from *text* and normalise its whitespace.

    The markup is parsed with BeautifulSoup's ``html5lib`` parser and only the
    text content is kept. Runs of two or more newlines are then collapsed to a
    single newline, and any remaining run of two or more whitespace characters
    is collapsed to a single space.

    Args:
        text: HTML (or plain text) string to clean.

    Returns:
        The cleaned text as a string.
    """
    text_only = BeautifulSoup(text, 'html5lib').get_text()
    # Raw string literals: "\s" in a normal literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    text_normal_newline = re.sub(r"\n\n+", "\n", text_only)
    text_normal_space = re.sub(r"\s\s+", " ", text_normal_newline)
    return text_normal_space

In [None]:
# Strip the markup from every article, then record the cleaned text's length.
training_df['text_clean'] = training_df['text'].apply(remove_html)
training_df['clean_length'] = training_df['text_clean'].apply(len)
training_df.head()

In [None]:
def process_group(row):
    """Summarise one group of rows.

    Returns a Series with ``char_cnt`` (sum of ``clean_length``) and
    ``record_cnt`` (number of non-null ``clean_length`` values).
    """
    lengths = row['clean_length']
    summary = {'char_cnt': lengths.sum(), 'record_cnt': lengths.count()}
    return pd.Series(summary)
# One row per tag with its character/record totals and a numeric label id.
labels_df = pd.DataFrame(training_df.groupby(['tags']).apply(process_group))
labels_df['label_id'] = pd.Categorical(labels_df.index).codes

# Lookup tables in both directions: tag -> label id and label id -> tag.
label_dict = {}
label_id_dict = {}
for tag, label_row in labels_df.iterrows():
    label_dict[tag] = label_row.label_id
    label_id_dict[label_row.label_id] = tag

labels = list(label_dict)
labels_df.head()


## 1.2 Loading Data (For Frequency Analysis)

In [None]:
# Group the cleaned article texts by tag, reading the raw CSV directly.
text_dict = defaultdict(list)
# newline='' is what the csv module asks for, so that line breaks embedded in
# quoted fields reach the reader untouched.
with open('offsite-test-material/offsite-tagging-training-set.csv', 'r',
          encoding='utf8', newline='') as f:
    file_reader = csv.reader(f, delimiter=',', quotechar='"')
    next(file_reader, None)  # skip the header row; tolerate an empty file
    for row in file_reader:
        # presumably the column order is id, tags, text -- confirm against the CSV
        text_dict[row[1]].append(remove_html(row[2]))

# One newline-joined text per tag.
fulltext_dict = {k: '\n'.join(v) for k, v in text_dict.items()}

In [None]:
# Per-tag summary: number of fragments and length of the joined text.
category_lines = [
    '{}: {} fragments with {} characters'.format(tag, len(fragments), len(fulltext_dict[tag]))
    for tag, fragments in text_dict.items()
]
print('Found the following categories:\n{}'.format('\n'.join(category_lines)))

This means that articles related to soccer are twice as frequent as articles related to either the outgoing CE or the US elections. This imbalance is a bit tricky in terms of maximum TF-IDF.

## 1.2 Selecting most relevant tokens
I am building a TF-IDF-esque model, for which I will select the most 'relevant' tokens as features. Relevance here is defined as the highest ratio of a token's frequency in the relevant 'term' to its frequency in the overall 'document'. A 'term' here is the union of all segments that belong to a single category. The 'document' is the union of all segments.

In [None]:
# Every counter is stored as a pair: (token -> occurrence count, total count).
# The total starts at 0 and is filled in once counting is done.

# Per-label occurrences of short tokens.
labeldicts_short = {label: (defaultdict(float), 0) for label in labels}
# Per-label occurrences of long tokens.
labeldicts_long = {label: (defaultdict(float), 0) for label in labels}
# Occurrences of short tokens across the whole document.
docdict_short = (defaultdict(float), 0)
# Occurrences of long tokens across the whole document.
docdict_long = (defaultdict(float), 0)


In [None]:
# Count every token once for its label and once for the whole document.
training_clean = []
for label, combined_text in fulltext_dict.items():
    # "Short" tokens: jieba segmentation with cut_all=True.
    label_short_counts = labeldicts_short[label][0]
    for token in jieba.cut(combined_text, cut_all=True):
        label_short_counts[token] += 1
        docdict_short[0][token] += 1

    # "Long" tokens: jieba segmentation with cut_all=False.
    label_long_counts = labeldicts_long[label][0]
    for token in jieba.cut(combined_text, cut_all=False):
        label_long_counts[token] += 1
        docdict_long[0][token] += 1
        
        


In [None]:
# Replace the placeholder totals with the real token counts per term/document.
for label in labels:
    long_counts = labeldicts_long[label][0]
    short_counts = labeldicts_short[label][0]
    labeldicts_long[label] = (long_counts, sum(long_counts.values()))
    labeldicts_short[label] = (short_counts, sum(short_counts.values()))

docdict_long = (docdict_long[0], sum(docdict_long[0].values()))
docdict_short = (docdict_short[0], sum(docdict_short[0].values()))


In [None]:
# Helper that returns the highest TF-IDF-like score of a token.
# Original author's note: highly relevant tokens reach maximum scores of 2-3;
# they occur exclusively in fragments of one class, and the exact value
# depends on that class's term length.
# Irrelevant tokens score about 1 (they occur everywhere with the same frequency).
def relative_frequency(token, classdicts, docdict, docdict_total=None):
    """Return the best class-vs-document frequency ratio for *token*.

    Args:
        token: The token to score.
        classdicts: Iterable of ``(counts, total)`` pairs, one per class, where
            ``counts`` maps token -> occurrences and ``total`` is the number of
            tokens counted for that class.
        docdict: ``(counts, total)`` pair for the whole document.
        docdict_total: Unused; kept so existing callers keep working.

    Returns:
        A tuple ``(ratio, max_occurrence, total_occurrence)``. ``ratio`` is
        ``(max_occurrence / class total) / (total_occurrence / document total)``
        for the class in which the token is relatively most frequent. ``ratio``
        is 0 when the token occurs in no class or the document frequency is 0.
    """
    # Classes whose total is still 0 are skipped: their relative frequency is
    # undefined and would otherwise raise ZeroDivisionError in the key below.
    occurrences = [(classdict[0][token], classdict[1])
                   for classdict in classdicts
                   if classdict[1] and token in classdict[0]]
    if not occurrences:
        print(token)  # diagnostic: token was not found in any class
        return 0, 0, docdict[0].get(token, 0)

    max_occurrence, term_length = max(occurrences, key=lambda pair: pair[0] / pair[1])
    # .get() instead of indexing: indexing a defaultdict would silently insert
    # the missing token into the document counts.
    total_occurrence, doc_length = docdict[0].get(token, 0), docdict[1]
    if not total_occurrence or not doc_length:
        return 0, max_occurrence, total_occurrence

    tf = max_occurrence / term_length
    df = total_occurrence / doc_length
    return (tf / df, max_occurrence, total_occurrence)

# Spot check of the helper on a single long token.
relative_frequency(token='重賽', classdicts=labeldicts_long.values(), docdict=docdict_long)

In [None]:
# Thresholds applied when picking feature tokens.
relevance_cutoff = 1.8
occurrent_cutoff = 50

# Score every token of the document against the per-label counters.
short_classdicts = labeldicts_short.values()
long_classdicts = labeldicts_long.values()
maxfreq_short = {
    token: relative_frequency(token, short_classdicts, docdict_short)
    for token in docdict_short[0]
}
maxfreq_long = {
    token: relative_frequency(token, long_classdicts, docdict_long)
    for token in docdict_long[0]
}

In [None]:
def _select_relevant_tokens(maxfreq):
    """Return, sorted, the alphabetic tokens whose score and total count exceed the cutoffs."""
    return sorted(
        token
        for token, (relevance, _max_occurrence, total_occurrence) in maxfreq.items()
        if relevance > relevance_cutoff
        and total_occurrence > occurrent_cutoff
        and token.isalpha()
    )


relevant_tokens_short_list = _select_relevant_tokens(maxfreq_short)

relevant_tokens_long_list = _select_relevant_tokens(maxfreq_long)

## 1.3 Creating Training Data

In [None]:
def sentence_to_vector(sentence, tokenlist, cut_all=False):
    """Count how often each token of *tokenlist* occurs in *sentence*.

    The sentence is segmented with ``jieba.cut`` (``cut_all`` is passed
    through). The result is a Series indexed by the entries of *tokenlist*;
    tokens that do not occur get a count of 0.
    """
    counts = {}
    for token in jieba.cut(sentence, cut_all=cut_all):
        counts[token] = counts.get(token, 0) + 1

    features = {}
    for feature in tokenlist:
        features[feature] = counts.get(feature, 0)
    return pd.Series(features)

# Token-count feature matrices: one row per article, one column per selected token.
occ_input_long = pd.DataFrame(
    training_df['text_clean'].apply(
        lambda text: sentence_to_vector(text, relevant_tokens_long_list)
    )
)

occ_input_short = pd.DataFrame(
    training_df['text_clean'].apply(
        lambda text: sentence_to_vector(text, relevant_tokens_short_list, cut_all=True)
    )
)

# Attach the numeric label id to every article through its tag.
tagged_df = training_df.merge(labels_df, how='inner', left_on='tags', right_index=True)
data_target = pd.DataFrame(tagged_df['label_id'])

In [None]:
# Randomly hold out `test_share` of the articles; the rest is used for training.
test_share = 0.2
indices = list(occ_input_long.index)
test_indices = random.sample(indices, int(len(indices) * test_share))
# Membership is tested against a set: checking against the list made this
# filter quadratic in the number of articles.
held_out = set(test_indices)
train_indices = [idx for idx in indices if idx not in held_out]
training_data_long = occ_input_long.loc[train_indices]
training_data_short = occ_input_short.loc[train_indices]
holdout_data_long = occ_input_long.loc[test_indices]
holdout_data_short = occ_input_short.loc[test_indices]
training_target = data_target.loc[train_indices]
holdout_target = data_target.loc[test_indices]


# 2. Training Models
## 2.0 Imports & Definitions

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [None]:
def explain_misclassification(id_, prediction, holdout):
    """Print the text of article *id_* with its predicted and provided tags.

    Args:
        id_: Index value of the article in ``training_df`` and in *holdout*.
        prediction: Predicted label ids, positionally aligned with *holdout*.
        holdout: Frame whose index locates *id_* within *prediction*.
    """
    relevant_text = training_df.loc[id_]['text_clean']
    provided_label = training_df.loc[id_]['tags']
    position = holdout.index.get_loc(id_)
    predicted_label = label_id_dict[prediction[position]]
    message = 'The following text was classified as {0}, but labelled as {1}:\n{2}'
    print(message.format(predicted_label, provided_label, relevant_text))

## 2.1 Standard RandomForest Classifier

In [None]:
# Random forest trained on the short-token counts, evaluated on the holdout set.
RFC_short = RandomForestClassifier()
RFC_short.fit(np.asarray(training_data_short), np.asarray(training_target).ravel())
rfc_prediction_short = RFC_short.predict(np.asarray(holdout_data_short))

is_wrong_short = rfc_prediction_short != holdout_target['label_id']
misclassified_ids_short = sorted(holdout_target[is_wrong_short].index)

report_short = ('A RandomForest Classifier reached an accuracy score of {0:.4f} for short tokens.\nThis means that a total of {1}'
                ' fragments (out of {2} fragments in the holdout sample) was misclassified.\nThe misclassified ids are:\n{3}')
print(report_short.format(accuracy_score(rfc_prediction_short, holdout_target),
                          len(misclassified_ids_short),
                          len(holdout_target),
                          ', '.join(map(str, misclassified_ids_short))))

In [None]:
# Random forest trained on the long-token counts, evaluated on the holdout set.
RFC_long = RandomForestClassifier()
RFC_long.fit(np.asarray(training_data_long), np.asarray(training_target).ravel())
rfc_prediction_long = RFC_long.predict(np.asarray(holdout_data_long))
# Sorted, like the short-token id list, so the two reports are easy to compare.
misclassified_ids_long = sorted(holdout_target[rfc_prediction_long != holdout_target['label_id']].index)
# The report names the long tokens (it previously said "short tokens").
print('A RandomForest Classifier reached an accuracy score of {0:.4f} for long tokens.\nThis means that a total of {1}'
      ' fragments (out of {2} fragments in the holdout sample) was misclassified.\nThe misclassified ids are:\n{3}'
      .format(accuracy_score(rfc_prediction_long, holdout_target),
              len(misclassified_ids_long),
              len(holdout_target),
              ', '.join(str(_) for _ in misclassified_ids_long)))

In [None]:
# The holdout split is random, so the hard-coded id is only sometimes part of
# it; fall back to the first misclassified id instead of raising KeyError.
misclassified_id = 2079
if misclassified_id not in holdout_target.index and misclassified_ids_short:
    misclassified_id = misclassified_ids_short[0]

if misclassified_id in holdout_target.index:
    explain_misclassification(misclassified_id, rfc_prediction_short, holdout_target)
else:
    print('Id {} is not part of the holdout sample.'.format(misclassified_id))

## 2.2 Gradient Boosted Classifier (Standard SkLearn)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier  #GBM algorithm


In [None]:
# Gradient boosting with default settings, trained on the short-token counts.
gbc = GradientBoostingClassifier()
gbc_features = np.asarray(training_data_short)
gbc_labels = np.asarray(training_target).ravel()
gbc.fit(gbc_features, gbc_labels)

In [None]:
# Holdout accuracy of the gradient-boosted model.
gbc_prediction_short = gbc.predict(np.asarray(holdout_data_short))
accuracy_score(gbc_prediction_short, holdout_target)

In [None]:
# Small two-row scratch frame.
test = pd.DataFrame(
    data=[
        [0, 0, 0, 1, 1],
        [0, 0, 0, 1, 2],
    ]
)

In [None]:
from sklearn import svm
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

In [None]:
# Support vector classifier using cosine similarity as a callable kernel.
svc = svm.SVC(kernel=cosine_similarity)
# Fit on plain arrays with a 1-D target, as the other models in this notebook
# do; prediction below also passes np.asarray(...), so the inputs stay consistent.
svc.fit(np.asarray(training_data_long), np.asarray(training_target).ravel())

In [None]:
# NOTE(review): `results` is only created in the following cell, so this cell
# fails with NameError when the notebook is run top to bottom. The id 25131 is
# hard-coded and the holdout split is random, so it may also be missing from
# `results` -- confirm before relying on this lookup.
results.loc[25131].label_id

In [None]:
# Evaluate the SVC on the holdout set and print every misclassified article.
prediction = svc.predict(np.asarray(holdout_data_long))
acs = accuracy_score(prediction, holdout_target)

# Holdout labels and predictions side by side.
results = holdout_target.copy()
results['prediction'] = prediction

wrong_rows = results[results['prediction'] != results['label_id']]
misclassified_ids = list(wrong_rows.index)
for misclassified_id in misclassified_ids:
    wrong_row = results.loc[misclassified_id]
    predicted_tag = label_id_dict[wrong_row.prediction]
    expected_tag = label_id_dict[wrong_row.label_id]
    article_text = training_df.loc[misclassified_id].text_clean
    print('The following text was classified as {}, whereas it should be {}. \n{}'
          .format(predicted_tag, expected_tag, article_text))


In [None]:
# Ids of the holdout articles whose prediction differs from the label.
is_misclassified = results['prediction'] != results['label_id']
list(results.loc[is_misclassified].index)

In [None]:
# Report every holdout article the SVC got wrong.
# Iterating a DataFrame yields its column labels, so zipping the frames
# compared column names instead of rows; iterate the index and the label
# column explicitly. The loop names also avoid shadowing the builtin `input`
# and rebinding the `prediction` array.
svc_predictions = svc.predict(np.asarray(holdout_data_long))
for article_id, predicted, expected in zip(holdout_data_long.index,
                                           svc_predictions,
                                           holdout_target['label_id']):
    if predicted != expected:
        print(article_id, 'has been classified as ', predicted, 'and should be ', expected)

In [None]:
# The 'prediction' column was added to `results` (a copy of holdout_target),
# not to holdout_target itself, so the comparison has to read from `results`.
[results['label_id'] == results['prediction']]