-
Notifications
You must be signed in to change notification settings - Fork 0
/
cross_cult.py
82 lines (69 loc) · 2.34 KB
/
cross_cult.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import pickle
import math
import numpy as np
from scipy.sparse import csr_matrix
from features.vectorizer import PolitenessFeatureVectorizer
# Path to the pickled scikit-learn politeness SVM classifier.
MODEL_FILENAME = 'politeness-svm.p'

# NOTE(review): unpickling executes arbitrary code — only load trusted
# model files. Use a context manager so the file handle is closed
# promptly; the original bare open() leaked the handle.
with open(MODEL_FILENAME, 'rb') as _model_file:
    clf = pickle.load(_model_file)

# Shared vectorizer instance used by documents2feature_vectors().
vectorizer = PolitenessFeatureVectorizer()
def documents2feature_vectors(documents):
    """Convert parsed documents into a sparse feature matrix.

    Parameters
    ----------
    documents : iterable
        Each element must be accepted by ``vectorizer.features()`` and is
        assumed to yield the same feature keys, so rows align column-wise.

    Returns
    -------
    scipy.sparse.csr_matrix
        One row per document, columns ordered by sorted feature name.
    """
    # BUG FIX: the original used a Python 2-only print statement; the
    # call form below behaves identically on Python 2 and runs on Python 3.
    print("Generating feature vectors")
    X = []
    for d in documents:
        fs = vectorizer.features(d)
        # Sort keys so every row orders its features identically.
        X.append([fs[f] for f in sorted(fs.keys())])
    return csr_matrix(np.asarray(X))
def getScores(sub):
    """Predict a politeness score for each request in ``sub``.

    ``sub`` is a list of requests, each already carrying indexed
    dependencies and tokenized sentences (the format the module-level
    vectorizer expects). Returns the classifier's predictions, one
    score per request.
    """
    return clf.predict(documents2feature_vectors(sub))
# NOTE: the commented-out script below is superseded by getMeanScores(); kept for reference.
# DATA_FILE = 'data/reddit_parsed.p'
# redditData = pickle.load(open(DATA_FILE,'rb'))
# predictions = {'US':{},'UK':{}}
# for country, subs in redditData.iteritems():
# result = {}
# for sub, data in subs.iteritems():
# print "Getting scores for subreddit -", sub, "with", len(data), "requests"
# result[sub] = getScores(data)
# predictions[country] = result
# for country, subs in predictions.iteritems():
# for sub, scores in subs.iteritems():
# print sub, np.mean(scores)
def getMeanScores(pickle_in):
    """Score every subreddit in a parsed-requests pickle and report a
    one-sample t statistic of each subreddit's politeness against the
    pooled mean across all subreddits.

    Parameters
    ----------
    pickle_in : str
        Path to a pickle mapping subreddit name -> tuple; only
        element ``[1]`` (the list of parsed requests) is used.

    Side effects: prints per-subreddit n, sample std dev, sample mean
    and t value, and finally the mean politeness of each subreddit.
    """
    # NOTE(review): unpickling executes arbitrary code — trusted files
    # only. Context manager fixes the original's leaked file handle.
    with open(pickle_in, 'rb') as f:
        redditData = pickle.load(f)

    # .items() (instead of Python 2-only .iteritems()) and print() calls
    # keep this runnable on both Python 2 and Python 3.
    predictions = {}
    for sub, reqs in redditData.items():
        print("Getting scores for subreddit - %s with %d requests" % (sub, len(reqs[1])))
        predictions[sub] = getScores(reqs[1])

    # Population statistics: pooled mean politeness over all requests.
    n = sum(len(scores) for scores in predictions.values())
    u0 = sum(np.sum(scores) for scores in predictions.values()) / float(n)
    print("Total n = %d U0 = %s" % (n, u0))

    # One-sample t statistic per subreddit against the pooled mean u0.
    for sub, scores in predictions.items():
        print("\n====== %s =======\n" % sub)
        n_sub = len(scores)
        print("\nn = %d" % n_sub)
        # Bessel-corrected (n-1) spread. NOTE(review): deviations are
        # taken from the pooled mean u0, not the subreddit mean — kept
        # as the original did it; confirm this is intentional.
        s = np.sqrt(np.sum(np.square(scores - u0)) / float(n_sub - 1))
        print("s = %s" % s)
        x_dash = np.mean(scores)
        print("x_dash = %s" % x_dash)
        # BUG FIX: the standard error is s/sqrt(n), not s/n. The original
        # divided by n_sub, inflating every t value by a factor of
        # sqrt(n_sub).
        t = (x_dash - u0) / (s / math.sqrt(n_sub))
        print("t = %s" % t)

    # Average politeness of each subreddit.
    for sub, scores in predictions.items():
        print("%s - %s" % (sub, np.mean(scores)))
# Root directory holding the parsed reddit request pickles.
DATA_PATH = 'data/parsed/'
# Datasets analysed; one call is left active per run — runs at import time.
# getMeanScores(DATA_PATH+'reddit_us_ind_parsed.p')
# getMeanScores(DATA_PATH+'reddit_atlanta_london_parsed.p')
# getMeanScores(DATA_PATH+'reddit_newyork_sf_parsed.p')
getMeanScores(DATA_PATH+'reddit_religion_parsed.p')
# getMeanScores(DATA_PATH+'reddit_lib_cons_parsed.p')