10. Predict news data
11. Output a CSV with 3 columns of news data: news_source, pred_cons (predicted probability of the Conservative class), date

In [1]:
import pandas as pd
import numpy as np
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
import nltk
from nltk import word_tokenize          
from nltk.stem.porter import PorterStemmer
import re
import string

# Single Porter stemmer instance shared by the tokenization helpers in this cell.
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    """Return a list holding the stem of every token, in input order.

    tokens  -- iterable of token strings
    stemmer -- any object exposing a ``stem(token)`` method
    """
    return [stemmer.stem(token) for token in tokens]

#def tokenize(text):
#    tokens = nltk.word_tokenize(text)
#    stems = stem_tokens(tokens, stemmer)
#    return stems

def tokenize(text):
    """Strip punctuation from ``text``, word-tokenize it and stem each token.

    Every character found in ``string.punctuation`` is deleted (not replaced
    by a space) before NLTK tokenization; the resulting tokens are stemmed
    with the module-level ``stemmer``.  Returns the list of stems.
    """
    kept_chars = [ch for ch in text if ch not in string.punctuation]
    no_punct = "".join(kept_chars)
    return stem_tokens(nltk.word_tokenize(no_punct), stemmer)

In [3]:
# Locations of the joblib-serialized vectorizer and classifier, relative to this notebook.
vect_name = '../models/vec_count_23gram_081.joblib'
clf_name = '../models/clf_elasticnet_081.joblib'

In [4]:
# Deserialize the previously fitted vectorizer and classifier.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load trusted files.
# NOTE(review): presumably the stored vectorizer refers to `tokenize` by name, which
# would explain why that function is defined above but never called here -- confirm.
vect = joblib.load(vect_name)
clf = joblib.load(clf_name)

In [5]:
# Number of entries in the loaded vectorizer's vocabulary.
len(vect.vocabulary_)

137093

In [6]:
# Load only the three columns needed for prediction.
# Note: the resulting column order (pubDate, text, src_name in the output below)
# does not follow the order listed in `usecols`.
df = pd.read_csv('../media_text/uk_media_clean_deduped_201x_gt1k.csv', usecols=['pubDate', 'src_name', 'text'])
df

Unnamed: 0,pubDate,text,src_name
0,2010-04-13 00:00:00,fundrais prepar rise challeng power walk chall...,southern daily echo
1,2011-06-21 00:00:00,powerteam secur deal windfarm staff report tue...,belfast telegraph
2,2013-03-19 00:00:00,former england star peter shilton ban road adm...,essex chronicle
3,2014-05-12 00:00:00,debo adewumi secretari nigerian commun waltham...,chingford guardian
4,2012-08-08 00:00:00,leav court dr luci dawson got sick pay one hos...,the daily mail
5,2013-06-12 00:00:00,inform maria brett took centr stage carn brea ...,cornish guardian
6,2012-02-27 03:18:00,publish monday februari prepar year parti rec ...,ilkeston advertiser
7,2012-06-14 00:00:00,councillor critic seabird centr bryan copland ...,east lothian courier
8,2010-04-28 16:16:00,main parti clash tackl britain record billion ...,banbury guardian
9,2010-06-04 08:27:00,style margin10px mickey skelton back behind st...,cambridge evening news


In [7]:
# Convert every article into a feature row with the loaded vectorizer;
# astype(str) coerces any non-string cell to text before vectorizing.
X = vect.transform(df.text.astype(str))

In [8]:
# Sanity check: rows = articles, columns = vocabulary entries.
X.shape

(2742735, 137093)

In [9]:
# Re-weight the vectorizer output with TF-IDF.
# NOTE(review): the transformer is fitted here on the news data being scored, so its
# IDF weights come from this corpus. If the classifier was trained on features from a
# transformer fitted on the training set, that fitted transformer should be loaded and
# applied with transform() instead -- confirm how clf was trained.
# Also note that X is overwritten, so re-running this cell alone applies TF-IDF twice.
tfidf = TfidfTransformer()
X = tfidf.fit_transform(X)

In [10]:
# The TF-IDF step should leave the matrix shape unchanged.
X.shape

(2742735, 137093)

In [11]:
# Predicted class label for every article.
y_pred = clf.predict(X)

In [12]:
# Sanity check: one prediction per article.
y_pred.shape

(2742735,)

In [13]:
# Per-class probabilities, one row per article and one column per class.
X_proba = clf.predict_proba(X)

In [14]:
# Eyeball the probability matrix (two columns, one per class).
X_proba

array([[ 0.57818988,  0.42181012],
       [ 0.31266354,  0.68733646],
       [ 0.55197893,  0.44802107],
       ..., 
       [ 0.33712666,  0.66287334],
       [ 0.62666108,  0.37333892],
       [ 0.37943545,  0.62056455]])

In [15]:
# Pick the Conservative probability by class label instead of a hard-coded
# column index: predict_proba orders its columns like clf.classes_, so looking
# the label up keeps this correct if the class order ever differs.  A missing
# label raises ValueError rather than silently selecting the wrong column.
cons_col = list(clf.classes_).index('Conservative')
proba_cons = X_proba[:, cons_col]

In [16]:
# Attach the predicted label and the Conservative-class probability to each article.
# Columns are appended in this order; the later rename cell relies on that order.
df['predicted'] = y_pred
df['pred_cons'] = proba_cons
df

Unnamed: 0,pubDate,text,src_name,predicted,pred_cons
0,2010-04-13 00:00:00,fundrais prepar rise challeng power walk chall...,southern daily echo,Conservative,0.578190
1,2011-06-21 00:00:00,powerteam secur deal windfarm staff report tue...,belfast telegraph,Labour,0.312664
2,2013-03-19 00:00:00,former england star peter shilton ban road adm...,essex chronicle,Conservative,0.551979
3,2014-05-12 00:00:00,debo adewumi secretari nigerian commun waltham...,chingford guardian,Labour,0.362759
4,2012-08-08 00:00:00,leav court dr luci dawson got sick pay one hos...,the daily mail,Labour,0.321160
5,2013-06-12 00:00:00,inform maria brett took centr stage carn brea ...,cornish guardian,Conservative,0.540253
6,2012-02-27 03:18:00,publish monday februari prepar year parti rec ...,ilkeston advertiser,Conservative,0.572750
7,2012-06-14 00:00:00,councillor critic seabird centr bryan copland ...,east lothian courier,Labour,0.342616
8,2010-04-28 16:16:00,main parti clash tackl britain record billion ...,banbury guardian,Conservative,0.624551
9,2010-06-04 08:27:00,style margin10px mickey skelton back behind st...,cambridge evening news,Labour,0.492244


In [17]:
# Inspect the current column names and their order.
df.columns

Index([u'pubDate', u'text', u'src_name', u'predicted', u'pred_cons'], dtype='object')

In [22]:
# Rename by label instead of assigning a positional list to df.columns, so a
# change in column order (e.g. a differently laid-out CSV) cannot silently swap
# labels.  Re-running the cell is a no-op because the old names are gone.
df = df.rename(columns={'pubDate': 'date', 'src_name': 'news_source'})
# CSV export of the three output columns; left disabled, as in the original cell.
#df.to_csv('../data/uk_media_prediction_1.csv', index=False, columns=['date', 'news_source', 'pred_cons'])

In [19]:
# Mean predicted Conservative probability per news source; reset_index() turns
# the group key back into an ordinary 'news_source' column.
sdf = df.groupby('news_source')['pred_cons'].mean().reset_index()

In [20]:
# Explicitly label the two summary columns, then display the per-source table.
# (Positional assignment: first column is the source name, second the mean probability.)
sdf.columns = ['news_source', 'pred_cons']
sdf

Unnamed: 0,news_source,pred_cons
0,24dash,0.363675
1,a world to win blogs,0.481059
2,agra-net.com,0.494556
3,alert net,0.490065
4,alliance for workers' liberty,0.434548
5,ananova,0.486413
6,ananova - orange news,0.487675
7,andover advertiser,0.491671
8,asharq al-awsat,0.508221
9,ayrshire post,0.443086


In [21]:
# Sources whose mean predicted Conservative probability is at least 0.5.
sdf.loc[sdf['pred_cons'] >= 0.5]

Unnamed: 0,news_source,pred_cons
8,asharq al-awsat,0.508221
32,bexhill today,0.5005
37,bognor regis observer,0.521154
38,bognor today,0.513517
46,brighton evening argus,0.510365
48,bucks free press,0.518941
49,bucks herald,0.508477
65,chichester observer,0.515214
71,country life,0.519117
83,diss mercury,0.523053
