In [None]:
import re

import pandas as pd

import nltk

from sklearn.preprocessing import FunctionTransformer, Normalizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline #make_pipeline, make_union
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB
from imblearn.under_sampling import RandomUnderSampler
from nltk.stem import SnowballStemmer
from nltk.tokenize.casual import casual_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer #nltk.download('vader_lexicon')

from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
# 

from collections import Counter

In [None]:
# Directory holding instances.jsonl / truth.jsonl.
# Set to "../data" to load the alternative copy of the dataset instead.
DATA_DIR = "../fulldata"

# Both files are JSON-lines: one record per line.
df_instances = pd.read_json(DATA_DIR + "/instances.jsonl", lines=True, encoding='utf-8')
df_truth = pd.read_json(DATA_DIR + "/truth.jsonl", lines=True, encoding='utf-8')

In [None]:
# Rows whose title contains a typographic apostrophe.
# regex=False: the pattern is a literal character, not a regular expression.
# na=False: a missing title counts as "no match" instead of producing a NaN
# entry that cannot be used as a boolean mask.
df_instances[df_instances["targetTitle"].str.contains("’", regex=False, na=False)]

In [None]:
# Position in this list is the numeric label: 0 = no-clickbait, 1 = clickbait.
possibleTruthClasses = ['no-clickbait', 'clickbait']

# Fixed seed so the under-sampled subset is identical on every run.
sampler = RandomUnderSampler(random_state=42)
merged = pd.merge(df_instances, df_truth, on='id')
# .index raises ValueError on an unknown class name, so bad labels fail loudly.
merged["truthClassN"] = merged["truthClass"].map(possibleTruthClasses.index)

# Resample to get equal distribution.
sampled_X, sampled_y = sampler.fit_resample(merged, merged["truthClassN"])
# Re-wrap with the original column names in case the sampler returned a bare array.
sampled_X = pd.DataFrame(sampled_X, columns=merged.columns)

# Show distribution of classes (must happen before the label columns are dropped).
display(sampled_X.groupby('truthClass').count()[["id"]])

# Remove every truth-derived column to avoid leaking labels into the features.
label_columns = [
    "truthClass",
    "truthClassN",
    "truthJudgments",
    "truthMean",
    "truthMedian",
    "truthMode",
]
sampled_X = sampled_X.drop(label_columns, axis=1)

display(sampled_X.head())
display(sampled_y)

In [None]:
# Split into train and test sets.
# random_state pins the shuffle so reported metrics are reproducible;
# stratify keeps the class balance identical in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    sampled_X,
    sampled_y,
    random_state=42,
    stratify=sampled_y,
)
display(X_train.head())
display(X_test.head())

In [None]:
# Shared NLP helpers used by extract_features.
stemmer = SnowballStemmer("english")
# Requires the VADER lexicon (see the nltk.download hint next to the import).
sentimentAnalyzer = SentimentIntensityAnalyzer()

# Substituted with "[n]" via re.sub, so every run of digits collapses to one placeholder.
is_number_regex = r"[0-9]+"
# The two token patterns below are applied with re.match, i.e. anchored at the
# start of the token: first character is a letter / an upper-case letter.
is_word_regex = r"[A-Za-z].*"
is_capital_word_regex = r"[A-Z].*"
# Mis-decoded opening quote; only referenced by the disabled clean-up code.
is_encoding_quot = r"â€˜"

def cleanString(strin):
    """Return ``strin`` unchanged.

    Placeholder hook for title clean-up: the mojibake-quote substitution
    (``re.sub(is_encoding_quot, "'", strin)``) is currently switched off.
    """
    cleaned = strin
    return cleaned

# def cleanString(strin):
#     return re.sub(is_encoding_quot, "'", strin)

def extract_features(df):
    """Turn every row of ``df`` into a dict of title-based features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing ``iterrows()`` and a ``targetTitle`` column of strings.

    Returns
    -------
    map
        Lazy, single-pass iterator yielding one ``{feature_name: value}`` dict
        per row, in row order.
    """
    def extract(indexed_row):
        # iterrows() yields (index, row) pairs; only the row is needed.
        _, row = indexed_row
        result = dict()
        extract_from_title(cleanString(row['targetTitle']), result)
        #extract_from_article(row['targetParagraphs'], result)
        #extract_from_image(row['postMedia'], result)
        return result

    def extract_from_title(title, result):
        """Add the title features to ``result`` in place and return it."""
        stripped = title.strip()
        title_words = nltk.word_tokenize(stripped.lower())
        # Case-preserving tokens: needed for POS tagging and capitalisation counts.
        title_words_p = nltk.word_tokenize(stripped)
        # POS tagging is the costly step, so tag once and reuse the result below.
        tagged_words = nltk.pos_tag(title_words_p)

        # (stem with digit runs replaced by "[n]", POS tag) for each cased token.
        title_words_stem_pos_repl = [
            (re.sub(is_number_regex, "[n]", stemmer.stem(word.lower())), tag)
            for (word, tag) in tagged_words
        ]
        # Stems of the lower-cased tokens, digit runs replaced by "[n]".
        title_words_number_repl = [
            re.sub(is_number_regex, "[n]", stemmer.stem(word))
            for word in title_words
        ]
        twrr_bigram_count = Counter(nltk.bigrams(title_words_number_repl))

        # Presence indicator only: repeated (stem, tag) pairs still map to 1.
        result.update({'pos_word_in_title[{}]'.format(stem_tag): 1
                       for stem_tag in title_words_stem_pos_repl})

        n_words = sum(1 for word in title_words_p if re.match(is_word_regex, word))
        n_capital_words = sum(1 for word in title_words_p if re.match(is_capital_word_regex, word))
        # Guard against titles without a single alphabetic token.
        result['capital_vs_non_words_ratio'] = 0 if n_words == 0 else n_capital_words / n_words

        pos_tag_count = Counter(tag for (_, tag) in tagged_words)
        result['title_length'] = len(title)
        result['simple_title_words'] = len(title_words)
        # split(' ') always returns at least one element, so the division below is safe.
        result['title_words'] = len(title.split(' '))
        result['title_question_marks'] = 1 if '?' in title else 0
        result.update({'pos_tag[{}]'.format(tag): count for tag, count in pos_tag_count.items()})
        result['title_average_word_length'] = len(title) / result['title_words']
        result.update({'title_bigram[{}]'.format(bigram): count
                       for bigram, count in twrr_bigram_count.items()})

        sentiment = sentimentAnalyzer.polarity_scores(title)
        result['title_sent_neg'] = sentiment["neg"]
        result['title_sent_pos'] = sentiment["pos"]
        result['title_sent_neu'] = sentiment["neu"]
        return result

    def extract_from_article(paragraphs, result):
        """Add article-size features to ``result`` (currently not called)."""
        result['number_of_paragraphs'] = len(paragraphs)
        # NOTE(review): paragraphs are joined without a separator, so the last
        # word of one paragraph fuses with the first word of the next and the
        # word count comes out low - confirm before enabling this extractor.
        entireArticle = ''.join(paragraphs)
        article_word_total = len(entireArticle.split(' '))
        result['article_length'] = len(entireArticle)
        result['article_words'] = article_word_total
        result['article_average_word_length'] = len(entireArticle) / article_word_total

        return result

    def extract_from_image(postMedia, result):
        """Add a 0/1 flag for non-empty ``postMedia`` (currently not called)."""
        result['has_image'] = 1 if len(postMedia) > 0 else 0

        return result

    return map(extract, df.iterrows())

# def extract_features_titles(df):
#     def extract_from_title(title):
#         result = dict()
#         tiny = title.strip().lower()
#         title_words = nltk.word_tokenize(tiny)
#         title_words_stem = [stemmer.stem(word) for word in title_words]
#         title_words_number_repl = [re.sub(is_number_regex, "[n]", word) for word in title_words_stem]
#         twrr_bigram_count = Counter(nltk.bigrams(title_words_number_repl))
#         result['title_word_count'] = sum(1 for word in title_words if re.match(is_word_regex, word))
#         result['title_token_count'] = len(title_words)
#         pos_tag_count = Counter(tag for (word, tag) in nltk.pos_tag(title_words))
#         result.update({'pos_tag[{}]'.format(tag): count for tag, count in pos_tag_count.items()})
# #         result.update({'title_bigram[{}]'.format(bigram): count for bigram, count in twrr_bigram_count.items()})
#         return result
#     return map(extract_from_title, df['targetTitle'])

In [None]:
#list(extract_features_titles(df_instances.iloc[:5]['targetTitle']))

In [None]:
def make_clickbait_pipeline(classifier, extra_steps=()):
    """Build the pipeline shared by all models.

    Steps: title feature dicts -> DictVectorizer -> ``extra_steps`` -> classifier.

    Parameters
    ----------
    classifier : estimator
        Final estimator, registered under the step name ``'classifier'``.
    extra_steps : sequence of (name, transformer) pairs, optional
        Steps inserted between the encoder and the classifier.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    steps = [
        ('feature_extraction', FunctionTransformer(extract_features, validate=False)),
        ('encoder', DictVectorizer()),
    ]
    steps.extend(extra_steps)
    steps.append(('classifier', classifier))
    return Pipeline(steps)


clickbaitClassifierNBA = make_clickbait_pipeline(MultinomialNB())

# Seeded: the tree permutes candidate features randomly when choosing splits.
clickbaitClassifierTree = make_clickbait_pipeline(DecisionTreeClassifier(random_state=42))

clickbaitClassifierXGB = make_clickbait_pipeline(XGBClassifier())

# The SVC pipeline normalises each sample before classification.
# Seeded so repeated fits give the same model.
clickbaitClassifierSVC = make_clickbait_pipeline(
    LinearSVC(max_iter=4000, random_state=42),
    extra_steps=[('normalizer', Normalizer())],
)

# Baseline: always predicts the most frequent training class.
dummyClassifier = make_clickbait_pipeline(DummyClassifier(strategy="most_frequent"))

In [None]:
# Fit every pipeline on the training split, in a fixed order.
# The XGBoost pipeline is deliberately left unfitted (its fit call is disabled).
for pipeline in (
    clickbaitClassifierNBA,
    clickbaitClassifierTree,
    clickbaitClassifierSVC,
    dummyClassifier,
):
    pipeline.fit(X_train, y_train)

In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
# Predict the test split with each fitted pipeline (XGBoost is skipped: it is never fitted).
pred_tree, pred_nb, pred_svc, pred_dummy = [
    fitted.predict(X_test)
    for fitted in (
        clickbaitClassifierTree,
        clickbaitClassifierNBA,
        clickbaitClassifierSVC,
        dummyClassifier,
    )
]


In [None]:
# Display names and predictions, kept in matching order (XGBoost is left out).
classifiers = ["DecisionTree", "NaiveBayes", "SVC", "Dummy"]
preds = [pred_tree, pred_nb, pred_svc, pred_dummy]

# Labels are already numeric, so truth and predictions are used as they are.
truthmap = y_test
predsmap = preds

# One list per metric, each holding one score per classifier.
precisions, recalls, accuracies, cfm = [
    [metric(truthmap, pred) for pred in predsmap]
    for metric in (precision_score, recall_score, accuracy_score, confusion_matrix)
]

pd.DataFrame({
    "Classifier": classifiers,
    "Accuracy": accuracies,
    "Precision": precisions,
    "Recall": recalls,
    "CFM": cfm,
})

In [None]:
#pd.concat([no_stemming_results, stemming_results])

In [None]:
# Inspect the decision-tree pipeline: fitted encoder and classifier.
# (Point both at clickbaitClassifierXGB to inspect the XGBoost pipeline instead.)
dv = clickbaitClassifierTree.named_steps['encoder']
tr = clickbaitClassifierTree.named_steps['classifier']

# Column 0 = feature name, column 1 = importance.
importance_pairs = list(zip(dv.feature_names_, tr.feature_importances_))
dfFeatureImportance = pd.DataFrame(importance_pairs)
dfOrdered = dfFeatureImportance.sort_values(1, ascending=False)

# Show at most the 200 strongest features whose importance exceeds 0.00001.
is_relevant = dfOrdered[1] > 0.00001
display(dfOrdered[is_relevant].head(200))

In [None]:
#display(X_train.iloc[4]['targetTitle'])
#display(cleanString(X_train.iloc[4]['targetTitle']))

In [None]:
from sklearn.tree import export_graphviz
from graphviz import Digraph

# Generic feature placeholder in the exported dot text. Either letter case is
# accepted so the rewrite still fires if the export spells it "x[n]".
feature_regex = r"[Xx]\[([0-9]+)\]"
# Per-class values of a node; an optional fractional part is accepted so the
# label is still rewritten when the export prints non-integer values.
value_regex = r"value \= \[([0-9]+(?:\.[0-9]+)?), ([0-9]+(?:\.[0-9]+)?)\]"

# Feature names in encoder order; the index matches the n in the placeholder.
features_lst = dv.feature_names_

# Escape double quotes so feature names cannot terminate a dot label string.
ttab = str.maketrans({
        '"' : "\\\""
    })

def replace_with_names(line):
    """Rewrite one line of dot output into a human-readable form.

    Feature placeholders matched by ``feature_regex`` become the quoted
    feature name from ``features_lst`` (translated through ``ttab``), and
    value lists matched by ``value_regex`` become a "Clickbait? No/Yes" label.
    Lines without a match are returned unchanged.
    """
    def feature_label(found):
        feature_name = features_lst[int(found.group(1))]
        return "\'{}\'".format(feature_name.translate(ttab))

    def value_label(found):
        no_count, yes_count = found.group(1, 2)
        return "Clickbait? No:{}, Yes:{}".format(no_count, yes_count)

    # Features first, then values - the same order the substitutions always ran in.
    with_features = re.sub(feature_regex, feature_label, line)
    return re.sub(value_regex, value_label, with_features)

# Export the tree as dot text and drop the first and last lines (presumably
# the graph wrapper), because Digraph supplies its own; relabel the rest.
dot_lines = export_graphviz(tr).split('\n')
dotf = list(map(replace_with_names, dot_lines[1:-1]))

Digraph(body=dotf)

In [None]:
# Show the test rows the Naive Bayes pipeline got wrong, with their truth data.
predicted = pred_nb
Xtst = X_test.copy()
Xtst["predicted"] = predicted

# Element-wise comparison; works whether y_test is a Series or an array.
is_wrong = [pred_i != corr_i for (pred_i, corr_i) in zip(predicted, y_test)]

# Merge only id + prediction from the test frame. The label columns were removed
# from the features, so every other column comes from `merged` alone and keeps
# its plain name (no _x/_y suffixes, hence no missing "truthClassN_y").
tocmp = pd.merge(Xtst.loc[is_wrong, ["id", "predicted"]], merged, on="id")

# Widen text columns for this display only; the option is restored even on error.
with pd.option_context("display.max_colwidth", 200):
    display(tocmp[["id", "predicted", "truthClassN", "targetTitle", "postText",
                   "truthMean", "truthMedian", "truthJudgments"]])