In [1]:
%matplotlib inline
import time
import datetime
import json
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import re
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style('darkgrid')
sns.set_context('poster')



In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC

In [44]:
# split data into 0.8 train and 0.2 test
df = pd.read_csv('augmented.csv')
msk = np.random.rand(len(df)) < 0.8
train = df[msk]
test = df[~msk]

In [45]:
# create train and test data from headlines and labels 
train_data = list(train.article_title)
train_labels = list(train.clickbait)

test_data = list(test.article_title)
test_labels = list(test.clickbait)

# vectorize the data using tf-idf
vectorizer = TfidfVectorizer()
vectorised_train_data = vectorizer.fit_transform(train_data)
vectorised_test_data = vectorizer.transform(test_data)

# train classifier and predict test labels
classifier = LinearSVC()
classifier.fit(vectorised_train_data, train_labels)
test['prediction'] = classifier.predict(vectorised_test_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [10]:
print "accuracy:", float(sum([1 if test['clickbait'][i] == test['prediction'][i] else 0 for i in range(len(test))]))/len(test)

accuracy: 0.8673565381


In [6]:
import itertools
filtered = list(itertools.compress(test_data, test['prediction']))

# generate feature weights of a document
def feature_values(doc, representer):
    doc_representation = representer.transform([doc])
    features = representer.get_feature_names()
    return [(features[index], doc_representation[0, index]) for index in doc_representation.nonzero()[1]]

# use only words from data predicted to be clickbait
word_probs = [feature_values(doc, vectorizer) for doc in filtered]

In [7]:
word_prob_df = pd.DataFrame([item for sublist in word_probs for item in sublist])
word_prob_df.columns = ['word','prob']
word_prob_df = word_prob_df.groupby('word').agg('mean').sort('prob', ascending=False).reset_index()
word_prob_df.head(25)

  app.launch_new_instance()


Unnamed: 0,word,prob
0,plea,0.949579
1,sloths,0.712176
2,cookies,0.706477
3,69,0.704256
4,freedom,0.699532
5,forsaken,0.692393
6,horse,0.669181
7,useless,0.665236
8,blonde,0.643405
9,documented,0.639085


In [46]:
del test['Unnamed: 0']

In [53]:
# false positives
test[(test.clickbait == 0) & (test.prediction == 1)].reset_index(drop=True).head(25)

Unnamed: 0,article_title,article_url,clickbait,source,prediction
0,"With Fortune Built, Packard Heirs Look To Buil...",,0,NY Times,1
1,A RIVER RECLAIMED: Reversing Pollution's Toll ...,,0,NY Times,1
2,THE FATE OF FLIGHT 800: THE FUTURE; Once Crash...,,0,NY Times,1
3,Blair's Muted Helpmate: Look at Her Now!,,0,NY Times,1
4,Marv Albert Pleads Guilty And Is Dismissed by NBC,,0,NY Times,1
5,PICTURES OF SABRINA: A special report.; A Slid...,,0,NY Times,1
6,Ladies Who Launch,,0,NY Times,1
7,Writer Who Cried Plagiarism Used Passages She ...,,0,NY Times,1
8,CALIFORNIA'S BAN TO CLEAR SMOKE INSIDE MOST BARS,,0,NY Times,1
9,A $12 Billion Carrot for Prudential Policyholders,,0,NY Times,1


In [52]:
# false negatives
test[(test.clickbait == 1) & (test.prediction == 0)].reset_index(drop=True).head(25)

Unnamed: 0,article_title,article_url,clickbait,source,prediction
0,"FCC Will Not Fine Broadcasters For Saying ""Red...",/lindseyadler/fcc-will-not-fine-broadcasters-f...,1,Buzzfeed,0
1,Miamiâs Angry Cuban Exiles Are Not Giving Up,/davidnoriega/miamis-angry-cuban-exiles-are-no...,1,Buzzfeed,0
2,The Shay Mitchell Guide To Killing It On Insta...,/whitneyjefferson/the-shay-mitchell-guide-to-i...,1,Buzzfeed,0
3,Astro Answers 37 Random Questions,/whitneyjefferson/astro-answers-random-questions,1,Buzzfeed,0
4,7 Most Common Email Scams To Watch Out For,/article/7-most-common-email-scams-watch-out-1511,1,Clickhole,0
5,6 Best Personal License Plates For ‘Cheers’ Fans,/post/6-best-personal-license-plates-cheers-fa...,1,Clickhole,0
6,One Story Told Week By Week: Could ‘Suits’ Be ...,/article/one-story-told-week-week-could-suits-...,1,Clickhole,0
7,The 8 Most Devastating Celebrity Finishing Moves,/article/8-most-devastating-celebrity-finishin...,1,Clickhole,0
8,"Goodnight, Sweet Prince: Beloved Baseball Mons...",/article/goodnight-sweet-prince-beloved-baseba...,1,Clickhole,0
9,Litany Of Lies: A Third-Grader’s Reading Log E...,/article/litany-lies-third-graders-reading-log...,1,Clickhole,0
