In [83]:
import pandas as pd
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from scipy.sparse import coo_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer

%matplotlib inline  
import matplotlib.pyplot as plt


In [84]:
trainBodies = pd.read_csv('fncData/train_bodies.csv')
trainStances = pd.read_csv('fncData/train_stances.csv')
trainStancesRandom = pd.read_csv('fncData/train_stances.random.csv')
trainStances.head()

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated
4,Spider burrowed through tourist's stomach and ...,1923,disagree


In [85]:
unrelatedCount = trainStances[trainStances["Stance"] == "unrelated"].count()[0]
agreeCount = trainStances[trainStances["Stance"] == "agree"].count()[0]
disagreeCount = trainStances[trainStances["Stance"] == "disagree"].count()[0]
discussCount = trainStances[trainStances["Stance"] == "discuss"].count()[0]
totalCount = trainStances.count()[0]

print("Total is ", totalCount)
print("Unrelated: ",unrelatedCount, float(unrelatedCount) / totalCount)
print("Disagree: ",disagreeCount, float(disagreeCount) / totalCount)
print("Agree: ",agreeCount, float(agreeCount) / totalCount)
print("Discuss: ",discussCount,  float(discussCount) / totalCount)

('Total is ', 49972)
('Unrelated: ', 36545, 0.7313095333386697)
('Disagree: ', 840, 0.016809413271432)
('Agree: ', 3678, 0.07360121668134155)
('Discuss: ', 8909, 0.1782798367085568)


In [86]:
trainStancesRandom.head()

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,discuss
1,Hundreds of Palestinians flee floods in Gaza a...,158,discuss
2,"Christian Bale passes on role of Steve Jobs, a...",137,disagree
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,disagree
4,Spider burrowed through tourist's stomach and ...,1923,agree


In [87]:
limit = len(trainStances)
#limit = 1000
strip = True
lowercase = True

In [88]:
print "Train Stances (20/%s):" % (len(trainStances))
# trainStances.head(20)

trainStances["unrelated"] = trainStances["Stance"].apply(lambda x: 1 if x == "unrelated" else 0)
trainStances["agree"] = trainStances["Stance"].apply(lambda x: 1 if x == "agree" else 0)
trainStances["disagree"] = trainStances["Stance"].apply(lambda x: 1 if x == "disagree" else 0)
trainStances["discuss"] = trainStances["Stance"].apply(lambda x: 1 if x == "discuss" else 0)
trainStances["Body"] = [""] * len(trainStances)
for index, row in trainStances[0:limit].iterrows():
    trainStances.loc[index, "Body"] = trainBodies[trainBodies['Body ID'] == row["Body ID"]]['articleBody'].item()


def preprocess(text, lowercase, strip):
    text = text.lower() if lowercase else text
    text = " ".join(re.findall("[a-zA-Z]+", text)) if strip else text
    return text

if strip or lowercase:
    trainStances["Body"]= trainStances["Body"].apply(lambda x: preprocess(x,lowercase,strip))
    trainStances["Headline"]= trainStances["Headline"].apply(lambda x: preprocess(x,lowercase,strip))
    

trainStances = trainStances.drop("Body ID", 1)
trainStances = trainStances.drop("Stance", 1)
    
trainStances.head()

Train Stances (20/49972):


Unnamed: 0,Headline,unrelated,agree,disagree,discuss,Body
0,police find mass graves with at least bodies n...,1,0,0,0,danny boyle is directing the untitled film set...
1,hundreds of palestinians flee floods in gaza a...,0,1,0,0,hundreds of palestinians were evacuated from t...
2,christian bale passes on role of steve jobs ac...,1,0,0,0,year old moscow resident was hospitalized with...
3,hbo and apple in talks for month apple tv stre...,1,0,0,0,reuters a canadian soldier was shot at the can...
4,spider burrowed through tourist s stomach and ...,0,0,1,0,fear not arachnophobes the story of bunbury s ...


In [None]:
# print "Train Stances Random (20/%s)" % (len(trainStancesRandom))
# trainStancesRandom.head(20)

In [90]:
# build feature vocabulary
ngram = 1
vectorizer = CountVectorizer(ngram_range=(1, ngram))
trainStances["Number Of Common Words"] = [-1 for x in range(len(trainStances))]
trainStances["Number Of Words In Body"] = [-1 for x in range(len(trainStances))]
trainStances["Number Of Words In Headline"] = [-1 for x in range(len(trainStances))]

for index, row in trainStances[0:limit].iterrows():
    headline = row['Headline']
    body = row['Body']
    
#     print index
#     print "HEADLINE: \n%s\n" % headline
#     print "BODY: \n%s\n" % body

    headline_words = headline.split()
    body_words = body.split()
    
    num_words_in_common = len(set(headline_words).intersection(body_words))
    
    trainStances.loc[index, "Number Of Common Words"] = num_words_in_common  
    trainStances.loc[index, "Number Of Words In Body"] = len(body_words) 
    trainStances.loc[index, "Number Of Words In Headline"] = len(headline_words) 

trainStances.head()

# I'd like to graph the num_words_in_common integer to the stance classification, to see what kind of correlation we're working with here.

Unnamed: 0,Headline,unrelated,agree,disagree,discuss,Body,Number Of Common Words,Number Of Words In Body,Number Of Words In Headline
0,police find mass graves with at least bodies n...,1,0,0,0,danny boyle is directing the untitled film set...,2,198,17
1,hundreds of palestinians flee floods in gaza a...,0,1,0,0,hundreds of palestinians were evacuated from t...,10,429,11
2,christian bale passes on role of steve jobs ac...,1,0,0,0,year old moscow resident was hospitalized with...,4,187,17
3,hbo and apple in talks for month apple tv stre...,1,0,0,0,reuters a canadian soldier was shot at the can...,2,79,14
4,spider burrowed through tourist s stomach and ...,0,0,1,0,fear not arachnophobes the story of bunbury s ...,9,627,11


In [91]:
trainStances[0:limit].corr()

Unnamed: 0,unrelated,agree,disagree,discuss,Number Of Common Words,Number Of Words In Body,Number Of Words In Headline
unrelated,1.0,-0.465016,-0.215716,-0.768446,-0.67498,-0.032519,0.021148
agree,-0.465016,1.0,-0.036855,-0.13129,0.330638,-0.02545,0.007301
disagree,-0.215716,-0.036855,1.0,-0.060904,0.138234,0.001413,-0.000401
discuss,-0.768446,-0.13129,-0.060904,1.0,0.509727,0.054549,-0.029338
Number Of Common Words,-0.67498,0.330638,0.138234,0.509727,1.0,0.208033,0.463184
Number Of Words In Body,-0.032519,-0.02545,0.001413,0.054549,0.208033,1.0,0.004592
Number Of Words In Headline,0.021148,0.007301,-0.000401,-0.029338,0.463184,0.004592,1.0


In [None]:
categoryNumberOfCommonWordsMeans = { "unrelated":0, "discuss":0, "agree":0, "disagree":0}
for category in categoryNumberOfCommonWordsMeans: #for each category calculate the mean number of words in common between header and body 
    categoryNumberOfCommonWordsMeans[category] = trainStances[0:limit][trainStances[category] == 1].mean()["Number Of Common Words"]

print(categoryNumberOfCommonWordsMeans)

values = [] #in same order as traversing keys
keys = [] #also needed to preserve order
for key in categoryNumberOfCommonWordsMeans.keys():
  keys.append(key)
  values.append(categoryNumberOfCommonWordsMeans[key])
    
l = plt.bar(range(len(values)), values,align='center')
plt.xticks(range(len(keys)), keys, rotation=25)
plt.grid(True)

In [93]:
bodies = [x for x in trainStances[0:limit]["Body"]]
headlines = [x for x in trainStances[0:limit]["Headline"]]

# could make count vectors for each type
bodyBOWVectors = cv.transform(bodies)
headlineBOWVectors = cv.transform(headlines)

print("Size:", headlineBOWVectors.shape, "same" if bodyBOWVectors.shape==headlineBOWVectors.shape else "DIFFERENT!!!")

combinedVectors = bodyBOWVectors.multiply(headlineBOWVectors)

('Size:', (49972, 303435), 'same')


In [94]:
#cv=CountVectorizer()
# or tfidf vectorizer 
cv=TfidfVectorizer(ngram_range=(1,3))
cv.fit(bodies + headlines)

def featurize(xs):
    bodyVector = cv.transform([a for a in xs["Body"]])
    hlVector = cv.transform([a for a in xs["Headline"]])
    
    # maybe its better to have one vector where the presence of the word is scaled
    # up if that word is present in the headline, otherwise scaled down 
    # something of the form, but scaled with constants...
    combinedVectors = bodyVector.multiply(hlVector)
    
    numberOfWords = [[a] for a in xs["Number Of Common Words"]]
    
    return hstack([bodyVector,hlVector,combinedVectors, numberOfWords])

In [None]:
x_train = featurize(trainStances[0:limit])
y_train = list([y for y in trainStances[0:limit]["unrelated"]])

clf = LogisticRegression()
#run cross validation to get a good accuracy measure
scores = cross_val_score(clf, x_train, y_train, cv=5)

#now fit the model 
clf.fit(x_train, y_train)
# compute a confusion matrix for test accuracy
conf = confusion_matrix(y_train, clf.predict(x_train))

print "Crossvalidation", scores, ("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("Coefficient for number of words is ", clf.coef_[0][-1])

print("Training error predicted postive rate:", sum(clf.predict(x_train)) / 1000.)
print("Training error false positive", conf[0][1] / float(limit))
print("Training error false negative", conf[1][0]/ float(limit))
print("Training error true positive", conf[1][1]/ float(limit))
print("Training error true negative", conf[0][0]/ float(limit))


Crossvalidation [ 0.96238119  0.96478239  0.96617971  0.96567941  0.96888133] Accuracy: 0.97 (+/- 0.00)
('Coefficient for number of words is ', -1.2295698207445114)
('Training error predicted postive rate:', 36.969000000000001)
('Training error false positive', 0.81899999999999995)
('Training error false negative', 0.39500000000000002)
('Training error true positive', 36.149999999999999)
('Training error true negative', 12.608000000000001)


In [None]:
#predictions = lr.predict(x_test)

#o = DictWriter(open("predictions.csv", 'w'), ["Id", "spoiler"])
#o.writeheader()
#for ii, pp in zip([x['Id'] for x in test], predictions):
#    d = {'Id': ii, 'spoiler': labels[pp]}
#    o.writerow(d)