In [15]:
import re

import pandas as pd

import nltk

from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline #make_pipeline, make_union
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import SnowballStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer #nltk.download('vader_lexicon')

from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
# 

from collections import Counter

In [16]:
df_instances = pd.read_json("../data/instances.jsonl", lines=True);
df_truth = pd.read_json("../data/truth.jsonl", lines=True); 

# Display first 5 rows of both
display(df_instances.head())
display(df_truth.head())

Unnamed: 0,id,postMedia,postText,postTimestamp,targetCaptions,targetDescription,targetKeywords,targetParagraphs,targetTitle
0,608310377143799808,[],[Apple's iOS 9 'App thinning' feature will giv...,Tue Jun 09 16:31:10 +0000 2015,['App thinning' will be supported on Apple's i...,'App thinning' will be supported on Apple's iO...,"Apple,gives,gigabytes,iOS,9,app,thinning,featu...",[Paying for a 64GB phone only to discover that...,Apple gives back gigabytes: iOS 9 'app thinnin...
1,609297109095972864,[media/609297109095972864.jpg],[RT @kenbrown12: Emerging market investors are...,Fri Jun 12 09:52:05 +0000 2015,"[Stocks Fall as Investors Watch Central Banks,...",Global investors have yanked $9.3 billion from...,"emerging market,emerging markets,em flows,em i...","[Emerging markets are out of favor., Global in...",Emerging Markets Suffer Largest Outflow in Sev...
2,609504474621612032,[],[U.S. Soccer should start answering tough ques...,Fri Jun 12 23:36:05 +0000 2015,[US to vote for Ali in FIFA election and not B...,A U.S. Senator's scathing letter questioned U....,,"[WINNIPEG, Manitoba â€“ The bubble U.S. Soccer...",U.S. Soccer should start answering tough quest...
3,609748367049105408,[],[How theme parks like Disney World left the mi...,Sat Jun 13 15:45:13 +0000 2015,"[Some 1,000 persons turned out in Albuquerque,...","America's top family vacation spots, like the ...","disney, disney world, disney ticket prices, di...",[When Walt Disney World opened in an Orlando s...,How theme parks like Disney World left the mid...
4,608688782821453824,[media/608688782821453825.jpg],[Could light bulbs hurt your health? One compa...,Wed Jun 10 17:34:49 +0000 2015,[Electric lights have made the world safer and...,One company will put a health notice on all th...,"health, Should there be warning labels on your...",[(CNN)The light bulb always makes the world's ...,Warning labels on your light bulbs


Unnamed: 0,id,truthClass,truthJudgments,truthMean,truthMedian,truthMode
0,608310377143799808,no-clickbait,"[0.0, 0.6666667, 0.0, 0.33333334000000003, 0.0]",0.2,0.0,0.0
1,609297109095972864,no-clickbait,"[0.6666667, 0.0, 0.0, 0.0, 0.0]",0.133333,0.0,0.0
2,609504474621612032,clickbait,"[0.33333334000000003, 0.6666667, 1.0, 0.0, 0.6...",0.533333,0.666667,0.666667
3,609748367049105408,no-clickbait,"[1.0, 0.0, 0.33333334000000003, 0.333333340000...",0.466667,0.333333,0.333333
4,608688782821453824,clickbait,"[1.0, 0.33333334000000003, 0.6666667, 0.333333...",0.666667,0.666667,1.0


In [17]:
df_truth.groupby('truthClass').count()

Unnamed: 0_level_0,id,truthJudgments,truthMean,truthMedian,truthMode
truthClass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
clickbait,762,762,762,762,762
no-clickbait,1697,1697,1697,1697,1697


In [45]:
is_number_regex = r"[0-9]+"
is_word_regex = r"[A-Za-z].*"
stemmer = SnowballStemmer("english")
sentimentAnalyzer = SentimentIntensityAnalyzer()

def extract_features(df):
    def extract(df):
        result = dict()
        extract_from_title(df[1]['targetTitle'], result)
        extract_from_article(df[1]['targetParagraphs'], result)
        extract_from_image(df[1]['postMedia'], result)
        return result
        
    def extract_from_title(title, result):
        tiny = title.strip().lower()
        title_words = nltk.word_tokenize(tiny)
        title_words_stem = [stemmer.stem(word) for word in title_words]
        title_words_number_repl = [re.sub(is_number_regex, "[n]", word) for word in title_words_stem]
        twrr_bigram_count = Counter(nltk.bigrams(title_words_number_repl))
        result['title_length'] = len(title)
        result['simple_title_words'] = len(title_words)
        result['title_words'] = len(title.split(' '))
        result['title_question_marks'] = 0 if title.find('?') == -1 else 1
        pos_tag_count = Counter(tag for (word, tag) in nltk.pos_tag(title_words))
        result.update({'pos_tag[{}]'.format(tag): count for tag, count in pos_tag_count.items()})
        result['title_average_word_length'] = len(title) / result['title_words']
        result.update({'title_bigram[{}]'.format(bigram): count for bigram, count in twrr_bigram_count.items()})
        sentiment = sentimentAnalyzer.polarity_scores(title)
        result['title_sent_neg'] = sentiment["neg"]
        result['title_sent_pos'] = sentiment["pos"]
        result['title_sent_neu'] = sentiment["neu"]
        return result
    def extract_from_article(paragraphs, result):
#         tiny = title.strip().lower()
#         title_words = nltk.word_tokenize(tiny)
        result['number_of_paragraphs'] = len(paragraphs)
        entireArticle = ''.join(paragraphs)
        result['article_length'] = len(entireArticle)
        result['article_words'] = len(entireArticle.split(' '))
        result['article_average_word_length'] = len(entireArticle) / len(entireArticle.split(' '))
    
        return result
    def extract_from_image(postMedia, result):
#         tiny = title.strip().lower()
#         title_words = nltk.word_tokenize(tiny)
        result['has_image'] = 1 if len(postMedia) > 0 else 0
    
        return result
    return map(extract, df.iterrows())

# def extract_features_titles(df):
#     def extract_from_title(title):
#         result = dict()
#         tiny = title.strip().lower()
#         title_words = nltk.word_tokenize(tiny)
#         title_words_stem = [stemmer.stem(word) for word in title_words]
#         title_words_number_repl = [re.sub(is_number_regex, "[n]", word) for word in title_words_stem]
#         twrr_bigram_count = Counter(nltk.bigrams(title_words_number_repl))
#         result['title_word_count'] = sum(1 for word in title_words if re.match(is_word_regex, word))
#         result['title_token_count'] = len(title_words)
#         pos_tag_count = Counter(tag for (word, tag) in nltk.pos_tag(title_words))
#         result.update({'pos_tag[{}]'.format(tag): count for tag, count in pos_tag_count.items()})
# #         result.update({'title_bigram[{}]'.format(bigram): count for bigram, count in twrr_bigram_count.items()})
#         return result
#     return map(extract_from_title, df['targetTitle'])

In [46]:
#list(extract_features_titles(df_instances.iloc[:5]['targetTitle']))

In [47]:
clickbaitClassifierNBA = Pipeline([
    ('feature_extraction', FunctionTransformer(extract_features, validate=False)),
    ('encoder', DictVectorizer()),
    ('classifier', MultinomialNB())
])

clickbaitClassifierTree = Pipeline([
    ('feature_extraction', FunctionTransformer(extract_features, validate=False)),
    ('encoder', DictVectorizer()),
    ('classifier', DecisionTreeClassifier())
])

clickbaitClassifierXGB = Pipeline([
    ('feature_extraction', FunctionTransformer(extract_features, validate=False)),
    ('encoder', DictVectorizer()),
    ('classifier', XGBClassifier())
])

clickbaitClassifierSVC = Pipeline([
    ('feature_extraction', FunctionTransformer(extract_features, validate=False)),
    ('encoder', DictVectorizer()),
    ('classifier', LinearSVC(max_iter=4000))
])



dummyClassifier = Pipeline([
    ('feature_extraction', FunctionTransformer(extract_features, validate=False)),
    ('encoder', DictVectorizer()),
    ('classifier', DummyClassifier())
])

In [48]:
# Do a split
merged = pd.merge(df_instances, df_truth, on='id')
X_train, X_test, y_train, y_test = train_test_split(merged, merged['truthClass'])
display(X_train.head())
display(X_test.head())

Unnamed: 0,id,postMedia,postText,postTimestamp,targetCaptions,targetDescription,targetKeywords,targetParagraphs,targetTitle,truthClass,truthJudgments,truthMean,truthMedian,truthMode
736,610138433202356224,[media/610138433202356227.jpg],[RT @nytopinion: Rules for seafood: Eat Americ...,Sun Jun 14 17:35:12 +0000 2015,"[The New York Times, Basic, All Access, Home D...",Eat American. Different kinds. Mostly farmed f...,"Sustainable Living,Fish and Other Marine Life,...","[NEARLY a decade ago, the writer Michael Polla...",Three Simple Rules for Eating Seafood,no-clickbait,"[0.6666667, 0.33333334000000003, 0.33333334000...",0.333333,0.333333,0.333333
309,607996802080501760,[media/607996802080501761.jpg],[Two @Royals prospects had a bet on the Arkans...,Mon Jun 08 19:45:08 +0000 2015,[@Brett_Eibner/Twitter],Former Arkansas and Missouri State baseball pl...,MLB,[For pro baseball teams filled with former col...,Minor leaguer Buddy Baumann gets unfortunate h...,clickbait,"[1.0, 0.6666667, 1.0, 0.6666667, 0.6666667]",0.8,0.666667,0.666667
972,608525869460635648,[media/608525869460635648.jpg],[RT @Femail: Meet the 'super dad' who bravely ...,Wed Jun 10 06:47:27 +0000 2015,"[Matthew Coughlan and Eugy Lim, from Childwall...","Matthew Coughlan, 31, from Childwall, Liverpoo...","Now,super,dad,Father,forced,deliver,child,surp...",[A man delivered his own baby girl before cook...,Now THIS is a super dad! Father forced to deli...,no-clickbait,"[0.6666667, 0.33333334000000003, 0.33333334000...",0.266667,0.333333,0.0
2286,607944548258422784,[media/607944548258422784.jpg],[NASA's Low-Density Supersonic Decelerator set...,Mon Jun 08 16:17:30 +0000 2015,[PHOTO: This artists concept shows the test ve...,A flying saucer could one day help a manned mi...,"mars, nasa, flying saucer, Low-Density Superso...",[A flying saucer could one day help a manned m...,Sections Shows Live Yahoo!-ABC News Network | ...,no-clickbait,"[0.0, 0.0, 0.0, 0.0, 0.0]",0.0,0.0,0.0
1309,609181375581663232,[media/609181375581663232.jpg],[VIDEO: LeBron James cut his head open after r...,Fri Jun 12 02:12:12 +0000 2015,[],Cleveland Cavaliers superstar LeBron James w...,"NBA, Breaking News",[Cleveland Cavaliers superstar LeBron James wa...,Andrew Bogut's Foul on LeBron James Results in...,no-clickbait,"[0.0, 0.33333334000000003, 0.0, 0.333333340000...",0.133333,0.0,0.0


Unnamed: 0,id,postMedia,postText,postTimestamp,targetCaptions,targetDescription,targetKeywords,targetParagraphs,targetTitle,truthClass,truthJudgments,truthMean,truthMedian,truthMode
937,608759561541935104,[media/608759561541935104.png],[Virgin Group working to bring year-long mater...,Wed Jun 10 22:16:04 +0000 2015,[PHOTO: British entrepreneur Richard Branson i...,Virgin Group's new generous family leave polic...,"Virgin, group, Atlantic, Richard Branson, fami...",[Virgin Group's new generous family leave poli...,Sections Shows Live Yahoo!-ABC News Network | ...,no-clickbait,"[0.0, 0.0, 0.0, 0.0, 0.0]",0.0,0.0,0.0
43,610149872059195392,[media/610149872059195392.jpg],[RT @IndyFootball: PLAYER RATINGS: Jack Wilshe...,Sun Jun 14 18:20:40 +0000 2015,[Jack-Wilshere4.jpg],England emerged unscathed from Ljubljana with ...,", International, Football, Sport",[Man-for-man marking from the Group E Euro 201...,England vs Slovenia player ratings: Jack Wilsh...,clickbait,"[0.0, 0.6666667, 0.0, 0.6666667, 0.6666667]",0.4,0.666667,0.666667
631,609337121481064448,[],[How fashion is courting the Muslim pound],Fri Jun 12 12:31:05 +0000 2015,"[Muslim shoppers on Carnaby street in London, ...",Ramadan and especially Eid increasingly signal...,"Fashion,Ramadan,Life and style,Retail industry...",[Ramadan and especially Eid increasingly signa...,How fashion is courting the Muslim pound,clickbait,"[0.33333334000000003, 0.6666667, 0.33333334000...",0.666667,0.666667,1.0
1557,608121871771430912,[media/608121871771430912.png],[The 9 best GIFs from todayâ€™s Apple event],Tue Jun 09 04:02:07 +0000 2015,"[The 9 Best GIFs From Today's Apple Event, The...",,,"[Scenes from WWDC., Â© 2017 BuzzFeed, Inc]",The 9 Best GIFs From Todayâ€™s AppleÂ Event,no-clickbait,"[0.33333334000000003, 0.6666667, 0.6666667, 0....",0.4,0.333333,0.333333
2120,609442578623299584,[media/609442578623299586.jpg],[.@jack wonâ€™t say whether heâ€™ll be Twitter...,Fri Jun 12 19:30:08 +0000 2015,[],Jack Dorsey refused on Friday to say whether h...,"jack dorsey beard,jack dorsey ceo,jack dorsey ...",[Jack Dorsey refused on Friday to say whether ...,Jack Dorsey Won't Say Whether He'll Be Twitter...,no-clickbait,"[0.0, 0.0, 0.6666667, 0.0, 0.0]",0.133333,0.0,0.0


In [53]:
clickbaitClassifierNBA.fit(X_train, y_train);
clickbaitClassifierTree.fit(X_train, y_train);
clickbaitClassifierSVC.fit(X_train, y_train);
clickbaitClassifierXGB.fit(X_train, y_train);
dummyClassifier.fit(X_train, y_train);



In [54]:
from sklearn.metrics import precision_score, recall_score

In [55]:
pred_tree = clickbaitClassifierTree.predict(X_test)
pred_nb = clickbaitClassifierNBA.predict(X_test)
pred_svc = clickbaitClassifierSVC.predict(X_test)
pred_xgb = clickbaitClassifierXGB.predict(X_test)
pred_dummy = dummyClassifier.predict(X_test)


In [56]:
preds = [pred_tree, pred_nb, pred_svc, pred_xgb, pred_dummy]

truthmap = [['clickbait', 'no-clickbait'].index(item) for item in y_test]

predsmap = [[['clickbait', 'no-clickbait'].index(item) for item in pred ] for pred in preds]


precisions = [precision_score(truthmap, pred) for pred in predsmap]
recalls = [recall_score(truthmap, pred) for pred in predsmap]
pd.DataFrame({"Classifier": ["DecisionTree", "NaiveBayes", "SVC", "XGBoost", "Dummy"], 
               "Precision": precisions,
               "Recall": recalls})

Unnamed: 0,Classifier,Precision,Recall
0,DecisionTree,0.75286,0.7723
1,NaiveBayes,0.742537,0.467136
2,SVC,0.720508,0.931925
3,XGBoost,0.754617,0.671362
4,Dummy,0.687773,0.739437


In [57]:
#tr = clickbaitClassifierTree.named_steps['classifier']
#dv = clickbaitClassifierTree.named_steps['encoder']
tr = clickbaitClassifierXGB.named_steps['classifier']
dv = clickbaitClassifierXGB.named_steps['encoder']

dfFeatureImportance = pd.DataFrame(list(zip(dv.feature_names_, tr.feature_importances_)))
dfFeatureImportance.sort_values(1, ascending=False)

Unnamed: 0,0,1
16983,title_question_marks,0.049677
15,pos_tag[DT],0.049305
16982,title_length,0.035850
47,pos_tag[WRB],0.035325
26,pos_tag[NN],0.028032
3,has_image,0.027523
0,article_average_word_length,0.027450
16987,title_words,0.026316
49,simple_title_words,0.025144
2,article_words,0.024187
