In [1]:
import pandas as pd

In [2]:
dat = pd.read_csv("final_data.csv")

In [3]:
dat.head()

Unnamed: 0,sample_id,year,broadcast_abstract,abstract_length,broadcasts_in_year,broadcast_time,id_coder_1,id_coder_2,id_coder_3,geography_coder_1,...,welfare,election,election_date,start_broadcast_time,end_broadcast_time,date,program_title,special,evening_news,channel
0,662312,1968,Pope Paul fears some young and rebellious Cath...,1160,4560,85490,45126470,45686132,45689555,international,...,0,0,presidential,12H 21M 50S,12H 6M 40S,"Monday, Sep 23, 1968",ABC Evening News,False,True,ABC
1,134,1968,Live NBC coverage Republican National Conventi...,276,4560,3720,45126470,45544038,45526263,national,...,0,0,presidential,,,"Monday, Aug 05, 1968",NBC Special,True,False,NBC
2,661155,1968,Attorney Percy Foreman to defense James Earl R...,942,4560,170,45348963,44975113,45598929,national,...,0,0,presidential,12H 10M 20S,12H 13M 10S,"Monday, Nov 11, 1968",ABC Evening News,False,True,ABC
3,661388,1968,United States rips into North Vietnam with reg...,887,4560,240,45348963,45180724,44944563,national,...,0,0,presidential,12H 0M 20S,12H 4M 20S,"Wednesday, Nov 13, 1968",CBS Evening News,False,True,CBS
4,662270,1968,Richard Nixon begins 10 state campaign tour. I...,651,4560,180,45126470,45686132,45278443,national,...,0,0,presidential,12H 5M 10S,12H 8M 10S,"Monday, Sep 23, 1968",NBC Evening News,False,True,NBC


In [4]:
list(dat)

['sample_id',
 'year',
 'broadcast_abstract',
 'abstract_length',
 'broadcasts_in_year',
 'broadcast_time',
 'id_coder_1',
 'id_coder_2',
 'id_coder_3',
 'geography_coder_1',
 'geography_coder_2',
 'geography_coder_3',
 'news_coder_1',
 'news_coder_2',
 'news_coder_3',
 'news',
 'news_majority',
 'geography',
 'geography_majority',
 'president',
 'p_nixon',
 'p_agnew',
 'p_ford',
 'p_rockef',
 'p_carter',
 'p_mondale',
 'p_reagan',
 'p_bush',
 'p_quayle',
 'p_clinton',
 'p_gore',
 'p_cheney',
 'p_obama',
 'p_biden',
 'p_trump',
 'war',
 'economy',
 'welfare',
 'election',
 'election_date',
 'start_broadcast_time',
 'end_broadcast_time',
 'date',
 'program_title',
 'special',
 'evening_news',
 'channel']

In [5]:
dat['news'].head()

0    hard
1    hard
2    hard
3    hard
4    hard
Name: news, dtype: object

In [6]:
# Let's setup the basic NLP Supervised Learning Pipeline

import time
import re
import string

import numpy as np
import nltk
# nltk.download()
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split

In [7]:
# Tokenize

stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    text = re.sub('\d+', '[NUM]', text)
    text = "".join([ch for ch in text if ch not in string.punctuation])
    tokens = nltk.word_tokenize(text)
    tokens_nostop = [x for x in tokens if x not in stopwords.words('english')]
    stems = stem_tokens(tokens_nostop, stemmer)
    return stems

vect = CountVectorizer(tokenizer=tokenize, 
                       ngram_range=(1, 3), 
                       min_df=0.005, 
                       max_df=0.5, 
                       max_features = 5000) 

In [8]:
# Train test split
X = dat[['sample_id', 'broadcast_abstract']]
y = (dat.news_majority == 'soft').astype(int)

X_train,  X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21)
X_test_data = X_test.copy()
X_train = X_train.broadcast_abstract
X_test = X_test.broadcast_abstract

In [9]:
X_train = vect.fit_transform(X_train)
transformer = TfidfTransformer()
X_train = transformer.fit_transform(X_train)

In [10]:
len(vect.vocabulary_)

2278

In [11]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

est = LinearSVC(penalty='l1', dual=False, tol=1e-3)

# Calibrated with isotonic calibration
clf = CalibratedClassifierCV(est, cv=2, method='isotonic')

In [12]:
t0 = time.time()
clf.fit(X_train, y_train)
t1 = time.time()
y_pred = clf.predict(X_train)
t2 = time.time()
time_clf_train = t1-t0
time_clf_predict = t2-t1
print("Results for classifier")
print("Training time: %fs; Prediction time: %fs" % (time_clf_train, time_clf_predict))
print(classification_report(y_train, y_pred))

Results for classifier
Training time: 0.127166s; Prediction time: 0.002044s
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      3374
           1       0.88      0.14      0.23       266

    accuracy                           0.94      3640
   macro avg       0.91      0.57      0.60      3640
weighted avg       0.93      0.94      0.91      3640



In [13]:
%%time
X_test = vect.transform(X_test)
transformer = TfidfTransformer()
X_test = transformer.fit_transform(X_test)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1442
           1       0.73      0.09      0.17       118

    accuracy                           0.93      1560
   macro avg       0.83      0.55      0.56      1560
weighted avg       0.92      0.93      0.90      1560

Wall time: 47.8 s


In [14]:
def show_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-20s\t\t%.4f\t%-20s" % (coef_1, fn_1, coef_2, fn_2))

def get_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top_a = coefs_with_fns[:n]
    top_b = coefs_with_fns[:-(n + 1):-1]
    return top_a, top_b

In [15]:
show_most_informative_features(vect, clf.calibrated_classifiers_[0].base_estimator)

	-1.1224	say                 		3.3041	birthday            
	-1.0562	presidenti          		2.3262	show                
	-1.0560	case                		2.2269	num million         
	-0.9824	mexico              		2.1872	footbal             
	-0.9637	famili              		2.1301	babi                
	-0.9414	kill                		1.7912	kate                
	-0.9387	governor            		1.6472	hill                
	-0.9363	season              		1.6001	winter              
	-0.9101	due                 		1.5805	movi                
	-0.9019	hit                 		1.5441	mason               
	-0.8969	senat               		1.4908	appeal              
	-0.8251	star                		1.4399	basebal             
	-0.7980	student             		1.4194	music               
	-0.7956	decis               		1.3439	pictur              
	-0.7661	arrest              		1.2632	univers             
	-0.7154	murder              		1.2570	year                
	-0.6799	map                 		1.2427	long              

In [16]:
get_most_informative_features(vect, clf.calibrated_classifiers_[0].base_estimator)

([(-1.1223973136535317, 'say'),
  (-1.0561923901745613, 'presidenti'),
  (-1.0560461179211376, 'case'),
  (-0.9824032379821913, 'mexico'),
  (-0.9636773733023121, 'famili'),
  (-0.9413534288650499, 'kill'),
  (-0.9386734281483382, 'governor'),
  (-0.9362506851975211, 'season'),
  (-0.910056485755912, 'due'),
  (-0.9019368002132592, 'hit'),
  (-0.8969119126223121, 'senat'),
  (-0.8250805907917281, 'star'),
  (-0.7979545652348917, 'student'),
  (-0.7956173917369509, 'decis'),
  (-0.7661056479723779, 'arrest'),
  (-0.7154194530972033, 'murder'),
  (-0.6799305776878801, 'map'),
  (-0.6786670903808729, 'victim'),
  (-0.6438643557633121, 'forc'),
  (-0.6131232758313597, 'repres')],
 [(3.304119761793084, 'birthday'),
  (2.3262091555152145, 'show'),
  (2.226940817365067, 'num million'),
  (2.18718915758205, 'footbal'),
  (2.1300721365257793, 'babi'),
  (1.79116835542068, 'kate'),
  (1.6471973426821427, 'hill'),
  (1.600057829522724, 'winter'),
  (1.5804886246299301, 'movi'),
  (1.5441093201003