In [None]:
import re

import pandas as pd

import nltk

from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline #make_pipeline, make_union
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import SnowballStemmer
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
# 

from collections import Counter

In [None]:
# Read the two JSON-lines inputs: the instances file and the truth file.
instances_path = "../data/instances.jsonl"
truth_path = "../data/truth.jsonl"
df_instances = pd.read_json(instances_path, lines=True)
df_truth = pd.read_json(truth_path, lines=True)

# Preview the first five rows of each frame.
for frame in (df_instances, df_truth):
    display(frame.head())

In [None]:
# Class balance check: group the truth rows by their 'truthClass' value and
# count, per class, the non-null entries of every remaining column.
df_truth.groupby('truthClass').count()

In [None]:
# One or more consecutive digits; extract_features substitutes each run with "[n]".
is_number_regex = r"[0-9]+"
# Used with re.match, so it accepts any token that starts with an ASCII letter.
is_word_regex = r"[A-Za-z].*"
# Used with re.match, so it accepts any token that starts with an upper-case ASCII letter.
is_capital_word_regex = r"[A-Z].*"
# Only referenced by the commented-out substitution in cleanString.
# NOTE(review): looks like a mis-decoded curly quote — confirm before re-enabling.
is_encoding_quot = r"â€˜"
# English Snowball stemmer shared by all feature extraction calls.
stemmer = SnowballStemmer("english")

def cleanString(strin):
    """Return ``strin`` unchanged.

    Hook for title clean-up. The only clean-up drafted so far is switched off:
    ``re.sub(is_encoding_quot, "'", strin)``.
    """
    cleaned = strin
    return cleaned

def extract_features(df):
    """Turn every row of ``df`` into a feature dictionary for ``DictVectorizer``.

    Only the ``targetTitle`` column is read; the article and image extractors
    are defined below but their calls are switched off.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows provide a string ``targetTitle``.

    Returns
    -------
    list of dict
        One feature dict per row, in row order. A list (rather than a lazy
        ``map``) is returned so the result can be iterated more than once.
    """
    def extract(indexed_row):
        # DataFrame.iterrows yields (index, row) pairs; only the row is needed.
        _, row = indexed_row
        result = dict()
        extract_from_title(cleanString(row['targetTitle']), result)
        #extract_from_article(row['targetParagraphs'], result)
        #extract_from_image(row['postMedia'], result)
        return result

    def extract_from_title(title, result):
        """Add the title-based features to ``result`` and return it."""
        stripped = title.strip()
        tokens_cased = nltk.word_tokenize(stripped)
        tokens_lower = nltk.word_tokenize(stripped.lower())

        # Normalise tokens: stem, then collapse every run of digits to "[n]".
        stems = [stemmer.stem(token) for token in tokens_lower]
        normalized = [re.sub(is_number_regex, "[n]", stem) for stem in stems]

        # Binary presence flag per distinct normalised token
        # (dict.fromkeys de-duplicates while keeping first-seen order).
        result.update({'word_in_title[{}]'.format(word): 1
                       for word in dict.fromkeys(normalized)})

        n_words = sum(1 for token in tokens_cased if re.match(is_word_regex, token))
        n_capital_words = sum(1 for token in tokens_cased
                              if re.match(is_capital_word_regex, token))
        # A title without any token starting with a letter (e.g. "2016" or an
        # empty title) has n_words == 0; report a ratio of 0 instead of
        # raising ZeroDivisionError.
        result['capital_vs_non_words_ratio'] = n_capital_words / n_words if n_words else 0.0

        result['title_question_marks'] = 1 if '?' in title else 0

        # POS tagging was dropped here: its counts were computed but never
        # written to ``result`` (the POS-tag features are disabled).

        bigram_counts = Counter(nltk.bigrams(normalized))
        result.update({'title_bigram[{}]'.format(bigram): count
                       for bigram, count in bigram_counts.items()})
        return result

    def extract_from_article(paragraphs, result):
        """Add paragraph and length statistics to ``result`` (call currently disabled)."""
        result['number_of_paragraphs'] = len(paragraphs)
        # NOTE(review): paragraphs are concatenated without a separator, so the
        # last word of one paragraph and the first word of the next are counted
        # as a single word — confirm whether that is intended before enabling.
        entireArticle = ''.join(paragraphs)
        article_words = len(entireArticle.split(' '))
        result['article_length'] = len(entireArticle)
        result['article_words'] = article_words
        result['article_average_word_length'] = len(entireArticle) / article_words
        return result

    def extract_from_image(postMedia, result):
        """Flag whether ``postMedia`` is non-empty (call currently disabled)."""
        result['has_image'] = 1 if len(postMedia) > 0 else 0
        return result

    return [extract(indexed_row) for indexed_row in df.iterrows()]

# def extract_features_titles(df):
#     def extract_from_title(title):
#         result = dict()
#         tiny = title.strip().lower()
#         title_words = nltk.word_tokenize(tiny)
#         title_words_stem = [stemmer.stem(word) for word in title_words]
#         title_words_number_repl = [re.sub(is_number_regex, "[n]", word) for word in title_words_stem]
#         twrr_bigram_count = Counter(nltk.bigrams(title_words_number_repl))
#         result['title_word_count'] = sum(1 for word in title_words if re.match(is_word_regex, word))
#         result['title_token_count'] = len(title_words)
#         pos_tag_count = Counter(tag for (word, tag) in nltk.pos_tag(title_words))
#         result.update({'pos_tag[{}]'.format(tag): count for tag, count in pos_tag_count.items()})
# #         result.update({'title_bigram[{}]'.format(bigram): count for bigram, count in twrr_bigram_count.items()})
#         return result
#     return map(extract_from_title, df['targetTitle'])

In [None]:
#list(extract_features_titles(df_instances.iloc[:5]['targetTitle']))

In [None]:
# Seed for the estimators that accept one, so that refitting on the same data
# gives the same model.
CLASSIFIER_SEED = 42


def make_clickbait_pipeline(classifier):
    """Wrap ``classifier`` in the shared feature pipeline.

    Every model uses the same two preprocessing steps: ``extract_features``
    turns the rows into feature dicts and ``DictVectorizer`` encodes them as a
    matrix. Only the final estimator differs.

    Parameters
    ----------
    classifier : estimator
        Unfitted scikit-learn compatible classifier used as the last step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with steps ``feature_extraction``, ``encoder``, ``classifier``.
    """
    return Pipeline([
        ('feature_extraction', FunctionTransformer(extract_features, validate=False)),
        ('encoder', DictVectorizer()),
        ('classifier', classifier)
    ])


clickbaitClassifierNBA = make_clickbait_pipeline(MultinomialNB())

clickbaitClassifierTree = make_clickbait_pipeline(
    DecisionTreeClassifier(
        min_samples_leaf=10,
        criterion="entropy",
        random_state=CLASSIFIER_SEED))

clickbaitClassifierXGB = make_clickbait_pipeline(XGBClassifier())

clickbaitClassifierSVC = make_clickbait_pipeline(
    LinearSVC(max_iter=4000, random_state=CLASSIFIER_SEED))

# Baseline that always predicts the most frequent training class.
dummyClassifier = make_clickbait_pipeline(DummyClassifier(strategy="most_frequent"))

In [None]:
# Join every instance with its truth row, then hold out a test set.
merged = pd.merge(df_instances, df_truth, on='id')
# random_state makes the split identical on every run; stratify keeps the
# truthClass proportions the same in the train and test parts.
# NOTE(review): the feature frames still contain the truth columns; the
# current extract_features only reads 'targetTitle', so nothing leaks today.
X_train, X_test, y_train, y_test = train_test_split(
    merged,
    merged['truthClass'],
    random_state=42,
    stratify=merged['truthClass'])
display(X_train.head())
display(X_test.head())

In [None]:
# Fit every active pipeline on the training split, in the original order.
# clickbaitClassifierXGB is left out on purpose (its fit is disabled).
active_pipelines = (
    clickbaitClassifierNBA,
    clickbaitClassifierTree,
    clickbaitClassifierSVC,
    dummyClassifier,
)
for pipeline in active_pipelines:
    pipeline.fit(X_train, y_train)

In [None]:
# accuracy_score was listed twice in the original import; each name is imported once here.
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
# Predict the held-out rows with every fitted pipeline
# (the XGBoost pipeline is not fitted, so it is skipped).
pred_tree, pred_nb, pred_svc, pred_dummy = [
    pipeline.predict(X_test)
    for pipeline in (
        clickbaitClassifierTree,
        clickbaitClassifierNBA,
        clickbaitClassifierSVC,
        dummyClassifier,
    )
]


In [None]:
# Predictions and display names, kept in matching order.
#preds = [pred_tree, pred_nb, pred_svc, pred_xgb, pred_dummy]
preds = [pred_tree, pred_nb, pred_svc, pred_dummy]
#classifiers = ["DecisionTree", "NaiveBayes", "SVC", "XGBoost", "Dummy"]
classifiers = ["DecisionTree", "NaiveBayes", "SVC", "Dummy"]

# Precision and recall are reported for the 'clickbait' class. The previous
# version encoded 'clickbait' as 0 and 'no-clickbait' as 1 and relied on the
# scorers' default pos_label=1, which scored the 'no-clickbait' class instead.
POSITIVE_LABEL = 'clickbait'
# Row/column order of the confusion matrices (same layout as before).
LABEL_ORDER = ['clickbait', 'no-clickbait']

precisions = [precision_score(y_test, pred, pos_label=POSITIVE_LABEL) for pred in preds]
recalls = [recall_score(y_test, pred, pos_label=POSITIVE_LABEL) for pred in preds]
accuracies = [accuracy_score(y_test, pred) for pred in preds]
cfm = [confusion_matrix(y_test, pred, labels=LABEL_ORDER) for pred in preds]

pd.DataFrame({"Classifier": classifiers,
               "Accuracy": accuracies,
               "Precision": precisions,
               "Recall": recalls,
               "CFM": cfm})

In [None]:
# Pull the fitted classifier and vectorizer out of the decision-tree pipeline.
# To inspect the XGBoost model instead, read the same two steps from
# clickbaitClassifierXGB.named_steps.
tree_steps = clickbaitClassifierTree.named_steps
tr = tree_steps['classifier']
dv = tree_steps['encoder']

# Column 0 holds the feature name, column 1 its importance; largest first.
importance_pairs = list(zip(dv.feature_names_, tr.feature_importances_))
dfFeatureImportance = pd.DataFrame(importance_pairs)
dfFeatureImportance.sort_values(by=1, ascending=False)

In [None]:
# Show one training title before and after cleanString for a visual comparison.
sample_title = X_train.iloc[4]['targetTitle']
display(sample_title)
display(cleanString(sample_title))

In [None]:
from sklearn.tree import export_graphviz
from graphviz import Digraph

# Matches a feature reference of the form X[<index>] (the default feature
# labels in export_graphviz output); group 1 captures the numeric index.
feature_regex = r"X\[([0-9]+)\]"

# Feature names of the vectorizer `dv`; replace_feature_names indexes into this list.
features_lst = dv.feature_names_

# Per-character translation table applied to feature names before they are
# inserted into the DOT text: a double quote becomes backslash + double quote.
# The commented entries would map the listed characters to the empty string.
ttab = str.maketrans({
        #'\'': "",
        #'[' : "",
        #']' : "",
        #'(' : "",
        #')' : "",
        #' ' : "",
        #',' : "",
        #'_' : "",
        #'$' : "",
        '"' : "\\\""
    })

def replace_feature_names(line):
    """Swap every ``X[<index>]`` reference in ``line`` for the quoted feature name.

    The index is looked up in ``features_lst``, the name is passed through the
    ``ttab`` translation table and wrapped in single quotes. Text that does
    not match ``feature_regex`` is returned unchanged.
    """
    def _quoted_name(match):
        feature_index = int(match.group(1))
        escaped_name = features_lst[feature_index].translate(ttab)
        return "\'{}\'".format(escaped_name)

    return re.sub(feature_regex, _quoted_name, line)

# Export the fitted tree as DOT text, drop its first and last line, and
# substitute the real feature names into the remaining body lines.
dot_lines = export_graphviz(tr).split('\n')
dotf = list(map(replace_feature_names, dot_lines[1:-1]))

Digraph(body=dotf)