In [None]:
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
# Read the raw CSV export into a DataFrame. Later cells rely on the
# `post_id`, `title` and `tag_name` columns being present.
dataset = pd.read_csv(
    filepath_or_buffer='../data/bq-results-20200205-092131-ioej0ewh4vcc.csv'
)

# Preview the first rows as the cell output.
dataset.head()

In [None]:
def filter_na_tags(df):
    """Return only the rows of ``df`` whose ``tag_name`` is not missing.

    The original index labels of the surviving rows are preserved.
    """
    has_tag = df['tag_name'].notna()
    return df.loc[has_tag]

def filter_tags_with_less_than_x_samples(x):
    """Build a DataFrame filter that drops rarely used tags.

    Args:
        x: Minimum number of distinct ``post_id`` values a tag must appear
            with for its rows to be kept.

    Returns:
        A function taking a DataFrame (with ``tag_name`` and ``post_id``
        columns) and returning only the rows whose tag occurs in at least
        ``x`` distinct posts. Suitable for use with ``DataFrame.pipe``.
    """
    def keep_frequent_tags(df):
        # Number of distinct posts each tag is attached to.
        posts_per_tag = df.groupby('tag_name')['post_id'].nunique()
        is_frequent = posts_per_tag >= x
        frequent_tags = posts_per_tag[is_frequent].index
        keep_row = df['tag_name'].isin(frequent_tags)
        return df[keep_row]
    return keep_frequent_tags

def group_tags_by_post(df):
    """Collapse a one-row-per-(post, tag) frame into one row per post.

    Args:
        df: DataFrame with ``post_id``, ``title`` and ``tag_name`` columns.

    Returns:
        A DataFrame indexed by ``post_id`` with two columns: ``title`` (the
        title from the first row of each post) and ``tag_names`` (that
        post's tags joined with ``'|'`` in row order).
    """
    def collapse_post(post_rows):
        first_title = post_rows['title'].iloc[0]
        joined_tags = '|'.join(post_rows['tag_name'].values)
        return pd.Series({'title': first_title, 'tag_names': joined_tags})

    posts = df.groupby('post_id')
    return posts.apply(collapse_post)

# Minimum number of distinct posts a tag must appear in to be kept as a label.
MIN_POSTS_PER_TAG = 100

# Build one row per post: drop rows without a tag, drop rare tags, then
# collapse each post's remaining tags into a single '|'-joined string.
processed_dataset = (
    dataset.copy()
    .pipe(filter_na_tags)
    .pipe(filter_tags_with_less_than_x_samples(MIN_POSTS_PER_TAG))
    .pipe(group_tags_by_post)
)

# Distinct labels that survived filtering. Sorted so the label order is the
# same on every run: the iteration order of a set of strings can change
# between interpreter sessions, which would reorder everything derived from it.
tag_set = sorted({
    tag
    for tag_names in processed_dataset['tag_names'].values
    for tag in tag_names.split('|')
})

print(f"Dataset with {processed_dataset.shape[0]} samples and {len(tag_set)} labels")
processed_dataset.head()

In [None]:
# train, test, validation split
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.multiclass import OneVsRestClassifier

# Seed for every stochastic step in this cell so a re-run reproduces the
# same split and the same fitted model.
RANDOM_STATE = 42

# Multi-hot encode each post's tags: one column per entry of `tag_set`,
# 1 if the post carries that tag.
# - The '|'-joined string is tokenized directly, so tags containing spaces
#   stay intact.
# - lowercase=False: the default lowercases documents but not the supplied
#   vocabulary, which would silently drop any tag with an uppercase character.
# - token_pattern=None: the pattern is unused when a tokenizer is given;
#   passing None avoids the "token_pattern will not be used" warning.
tag_vectorizer = CountVectorizer(
    vocabulary=tag_set,
    binary=True,
    lowercase=False,
    token_pattern=None,
    tokenizer=lambda tag_names: tag_names.split('|'),
)
tag_matrix = tag_vectorizer.fit_transform(processed_dataset['tag_names'].values)

# 80% train / 20% held out; the held-out part is then halved into test and
# validation sets.
x_train, x_non_train, y_train, y_non_train = train_test_split(
    processed_dataset['title'], tag_matrix,
    shuffle=False,  # for debugging purposes
    train_size=0.8)

x_test, x_val, y_test, y_val = train_test_split(
    x_non_train, y_non_train,
    train_size=0.5,
    random_state=RANDOM_STATE)  # this split shuffles, so it must be seeded

# Hash the title text into features and fit one linear SGD classifier per tag.
pipeline = Pipeline([
    ('feature_extraction', HashingVectorizer()),
    ('multilabel_model', OneVsRestClassifier(SGDClassifier(random_state=RANDOM_STATE)))
])

model = pipeline.fit(x_train, y_train)

y_test_hat = model.predict(x_test)

In [None]:
# classification_report takes (y_true, y_pred) in that order. Passing the
# predictions first would swap precision and recall and report support
# counts from the predictions instead of the ground truth.
print(classification_report(y_test, y_test_hat, target_names=tag_set))