In [1]:
import sys
import os
import cPickle
import numpy as np
import scipy
from scipy.sparse import csr_matrix
import sklearn
import nltk
from features.vectorizer import PolitenessFeatureVectorizer

In [3]:
MODEL_FILENAME = 'politeness-svm.p'
clf = cPickle.load(open(MODEL_FILENAME))
vectorizer = PolitenessFeatureVectorizer()


In [4]:
def score(request):
    """
    :param request - The request document to score
    :type request - dict with 'sentences' and 'parses' field
        sample (taken from test_documents.py)--
        {
            'sentences': [
                "Have you found the answer for your question?", 
                "If yes would you please share it?"
            ],
            'parses': [
                ["csubj(found-3, Have-1)", "dobj(Have-1, you-2)", "root(ROOT-0, found-3)", "det(answer-5, the-4)", "dobj(found-3, answer-5)", "poss(question-8, your-7)", "prep_for(found-3, question-8)"], 
                ["prep_if(would-3, yes-2)", "root(ROOT-0, would-3)", "nsubj(would-3, you-4)", "ccomp(would-3, please-5)", "nsubj(it-7, share-6)", "xcomp(please-5, it-7)"]
            ]
        } 

    returns class probabilities as a dict
        {
            'polite': float, 
            'impolite': float
        }
    """
    # vectorizer returns {feature-name: value} dict
    features = vectorizer.features(request)
    fv = [features[f] for f in sorted(features.iterkeys())]
    # Single-row sparse matrix
    X = csr_matrix(np.asarray([fv]))
    print clf
    print X
    probs = clf.predict_proba(X)
    # Massage return format
    probs = {"polite": probs[0][1], "impolite": probs[0][0]}
    return probs


In [5]:
if __name__ == "__main__":

    """
    Sample classification of requests
    """

    from test_documents import TEST_DOCUMENTS

    for doc in TEST_DOCUMENTS:

        probs = score(doc)

        print "===================="
        print "Text: ", doc['text']
        print "\tP(polite) = %.3f" % probs['polite']
        print "\tP(impolite) = %.3f" % probs['impolite']
        print "\n"



SVC(C=0.02, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='linear', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
  (0, 115)	1
  (0, 266)	1
  (0, 343)	1
  (0, 452)	1
  (0, 501)	1
  (0, 640)	1
  (0, 675)	1
  (0, 695)	1
  (0, 731)	1
  (0, 745)	1
  (0, 748)	1
  (0, 812)	1
  (0, 956)	1
  (0, 959)	1
  (0, 1020)	1
  (0, 1135)	1
  (0, 1154)	1
  (0, 1259)	1
  (0, 1344)	1
  (0, 1350)	1
  (0, 1352)	1
  (0, 1353)	1
  (0, 1360)	1
  (0, 1375)	1
  (0, 1377)	1
Text:  Have you found the answer for your question? If yes would you please share it?
	P(polite) = 0.719
	P(impolite) = 0.281


SVC(C=0.02, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='linear', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
  (0, 27)	1
  (0, 95)	1
  (0, 337)	1
  (0, 591)	1
  (0, 696)	1
  (0, 710)	1
  (0, 726)	1
  (0, 731)	1
  (0, 747)	1
  (0, 761)	1
  (0, 811)	1
  (0, 1018)



In [6]:
packages2versions = [("scikit-learn", sklearn, "0.15.1"), ("numpy", np, "1.9.0"), ("nltk", nltk, "3.0.0"), ("scipy", scipy, "0.12.0")]

for name, package, expected_v in packages2versions:
    if package.__version__ < expected_v:
        print("Warning: package '%s', expected version >= %s, detected %s. Code functionality not guaranteed.\n" % (name, expected_v, package.__version__))



In [7]:
unigrams=cPickle.load(open("features/featunigrams.p"))

aks="I am not a great person, hahaha"

In [8]:
nltk.word_tokenize(aks)

['I', 'am', 'not', 'a', 'great', 'person', ',', 'hahaha']

In [82]:
pos_filename="features/liu-positive-words.txt"

In [83]:
positive_words = set(map(lambda x: x.strip(), open(pos_filename).read().splitlines()))

In [54]:
has_positive = lambda l: len(positive_words.intersection(l)) > 0
has_positive.__name__ = "HASPOSITIVE"

In [55]:
import csv
filename='stack-exchange.annotated.csv'
f = open(filename)
csv_f = csv.reader(f)
from nltk import word_tokenize
vals=[]
i=0
csv_f.next()
# for rows in csv_f:
#     #print word_tokenize(rows[2])
#     try:
#         if len(positive_words.intersection(word_tokenize(rows[2]))) > 0:
#             vals.append(float(rows[-1]))
#     except:
#         print rows[2]
filename='wikipedia.annotated.csv'
f = open(filename)
csv_f = csv.reader(f)
for rows in csv_f:
    #print word_tokenize(rows[2])
    try:
        l=[x.lower() for x in word_tokenize(rows[2])]
        if len(positive_words.intersection(l)) > 0:
            vals.append(float(rows[-1]))
    except:
        print rows[2]
#     i+=1
#     if i> 100:
#         break
    
                  
print sum(vals)/len(vals)

0.115403712172


In [60]:
def generate_table():
    # file_name=
    # matrix=[]
    # vals=[]
    # documents=pickle.load(open(file_name))
    # for vals in documents:
    #     matrix.append(get_politeness_strategy_features(vals))
    #     scores.append(vals[scores])
    matrix=[[1, 0, 0], [1,1,0], [0,0,1], [1,1,1]]
    scores=[2.,1.,4., -1.]
    matrix=np.array(matrix)
    scores=np.array(scores)
    mult=np.multiply(matrix.T, scores.T).T
    mult=np.sum(mult, axis=0)
    total=np.sum(matrix, axis=0)
    results=np.divide(mult, total)
    results=results.tolist()
    POLITENESS_FEATURES=["1", "2", "3"]
    for i, f in enumerate(POLITENESS_FEATURES):
        print f,  results[i]

In [61]:
generate_table()

1 0.666666666667
2 0.0
3 1.5


In [77]:
import csv
filename="wikipedia.annotated.csv"

f = open(filename)
csv_f = csv.reader(f)
text_corpus=[]
csv_f.next()
for row in csv_f:
    text_corpus.append((row[2],float(row[-1])))
text_corpus.sort(key=lambda l: l[1], reverse=True)
top=[]
for i in range(len(text_corpus)/4):
    top.append(text_corpus[i])

In [84]:
i=0
vals=[]
for rows in top:
    #print word_tokenize(rows[2])
    #print rows[1]
    try:
        l=[x.lower() for x in word_tokenize(rows[0])]
        if len(positive_words.intersection(l)) > 0:
            vals.append(float(rows[-1]))
    except:
        print rows[0]
#     i+=1
#     if i> 100:
#         break
a=len(vals)
vals=[]
for rows in text_corpus:
    #print word_tokenize(rows[2])
    #print rows[1]
    try:
        l=[x.lower() for x in word_tokenize(rows[0])]
        if len(positive_words.intersection(l)) > 0:
            vals.append(float(rows[-1]))
    except:
        print rows[0] 
b=len(vals)
# print len(vals)/float(len(top))        
# print sum(vals)/len(vals)
print a/float(b)

0.311945748565
