In [1]:
from scipy import stats
import numpy as np

In [2]:
import re
import spacy
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer 
  
# Module-level NLP helpers. Of these, only `nlp` (via `nlp.tokenizer`) and
# MAX_CHARS are referenced by the cells visible in this part of the notebook;
# `lemmatizer` and `stemmer` are presumably used elsewhere (TODO confirm).
lemmatizer = WordNetLemmatizer() 
stemmer = SnowballStemmer("english")
# NOTE(review): the 'en' shortcut name is only resolvable on older spaCy
# releases; newer ones expect a full package name -- confirm the pinned version.
nlp = spacy.load('en')
# Maximum number of characters of a comment that tokenizer() will process.
MAX_CHARS = 20000
def tokenizer(comment):
    """Lower-case, lightly normalise and tokenise one comment.

    Args:
        comment: Text to tokenise. Non-string values are coerced with
            ``str()`` before any string method is called.

    Returns:
        list[str]: The ``.text`` of every token produced by
        ``nlp.tokenizer``, excluding tokens whose text is exactly one space.
    """
    # Coerce first: the previous version called .lower() on the raw argument,
    # so a non-str value raised AttributeError before str() was ever reached.
    comment = str(comment).lower()
    # NOTE(review): the backtick after the closing "]" sits OUTSIDE the
    # character class, so this pattern only matches one of the listed
    # characters when it is immediately followed by "`". That looks like a typo
    # for putting "`" inside the class. It is intentionally left unchanged
    # here: the data-loading cell filters tokens such as '-lrb-' / '-rrb-',
    # which only exist because "-" is not actually stripped by this line.
    comment = re.sub(r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’;#]`", " ", comment)
    # Collapse runs of spaces, then runs of commas.
    comment = re.sub(r"[ ]+", " ", comment)
    comment = re.sub(r"\,+", ",", comment)
    # Cap the input length; slicing is a no-op for shorter strings.
    comment = comment[:MAX_CHARS]
    return [token.text for token in nlp.tokenizer(comment) if token.text != " "]

In [18]:
import csv
import collections
import string
from stop_words import get_stop_words

stop_words = get_stop_words('en')
# Set copy for O(1) membership tests inside the token loop. `stop_words`
# itself is left exactly as returned in case other cells rely on it.
stop_word_set = set(stop_words)
# Literal tokens that are always skipped.
SKIP_TOKENS = {'edu_break', '-lrb-', '-rrb-'}

# word -> list of polarities, one entry per token occurrence (a word repeated
# inside one review therefore contributes that review's polarity several times).
word_dict = collections.defaultdict(list)
# One polarity (+1 / -1) per CSV data row.
polarities = []

# newline='' is what the csv module requires so that quoted fields containing
# line breaks are parsed correctly.
# NOTE(review): no explicit encoding is given, so the platform default is
# used -- confirm it matches the file.
with open('data/csv/train/boots.csv', newline='') as csvDataFile:
    csvReader = csv.reader(csvDataFile)
    next(csvReader)  # skip the first row (presumably a header -- TODO confirm)
    for row in csvReader:
        sentiment = row[1]
        # Any label other than the literal 'negative' is treated as positive.
        polarity = -1 if sentiment == 'negative' else 1
        polarities.append(polarity)
        texts = row[2]
        segs = tokenizer(texts)
        for seg in segs:
            # `seg in string.punctuation` is a substring test against the
            # punctuation string (unchanged from the original behaviour).
            if (
                seg in SKIP_TOKENS
                or seg in string.punctuation
                or seg.isdigit()
                or seg in stop_word_set
                or seg.startswith("'")
            ):
                continue
            word_dict[seg.lower()].append(polarity)
            

In [19]:
# Mean of the per-row polarities (+1 / -1). The t-test cells use this value as
# the population mean that each word's polarities are tested against.
mean_polarity = stats.tmean(polarities)

In [20]:
# Number of recorded occurrences for every distinct word, in word_dict order.
counts = np.array([len(occurrences) for occurrences in word_dict.values()])
# 600 evenly spaced edges over [0, 600000] give 599 bins of width ~1001.67.
hist, bins = np.histogram(
    counts,
    bins=np.linspace(0, 600000, 600),
)

In [21]:
# Mean number of recorded occurrences per distinct word.
stats.tmean(counts)

87.0166277030976

In [22]:
# Standard deviation of the per-word occurrence counts (no trimming limits given).
stats.tstd(counts)

994.1513870226642

In [23]:
# 99th percentile of the per-word occurrence counts.
stats.scoreatpercentile(counts, 99)

1623.859999999986

In [24]:
# Number of distinct words whose occurrence count falls in each histogram bin.
print(hist)

[33741   185    88    57    36    22    16    13     3    10     8     7
     5     3     3     2     1     5     1     0     2     0     1     1
     1     1     1     1     0     0     0     0     0     0     0     0
     1     0     1     0     1     1     0     0     0     0     0     0
     0     1     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     1     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   

In [25]:
# Bin edges returned by np.histogram for the occurrence-count histogram.
print(bins)

[     0.           1001.66944908   2003.33889816   3005.00834725
   4006.67779633   5008.34724541   6010.01669449   7011.68614357
   8013.35559265   9015.02504174  10016.69449082  11018.3639399
  12020.03338898  13021.70283806  14023.37228715  15025.04173623
  16026.71118531  17028.38063439  18030.05008347  19031.71953255
  20033.38898164  21035.05843072  22036.7278798   23038.39732888
  24040.06677796  25041.73622705  26043.40567613  27045.07512521
  28046.74457429  29048.41402337  30050.08347245  31051.75292154
  32053.42237062  33055.0918197   34056.76126878  35058.43071786
  36060.10016694  37061.76961603  38063.43906511  39065.10851419
  40066.77796327  41068.44741235  42070.11686144  43071.78631052
  44073.4557596   45075.12520868  46076.79465776  47078.46410684
  48080.13355593  49081.80300501  50083.47245409  51085.14190317
  52086.81135225  53088.48080134  54090.15025042  55091.8196995
  56093.48914858  57095.15859766  58096.82804674  59098.49749583
  60100.16694491  61101.836

In [26]:
# Seeds NumPy's legacy global RNG. None of the cells visible in this section
# draw random numbers, so this is presumably for later cells (TODO confirm).
np.random.seed(1)

In [27]:
# One-sample t-test of the polarities recorded for 'annoying' against the
# overall mean polarity. word_dict is a defaultdict, so looking up a word that
# never occurred silently inserts an empty list for it.
stats.ttest_1samp(word_dict['annoying'],mean_polarity)

Ttest_1sampResult(statistic=-14.627982435972637, pvalue=1.8651023141997468e-39)

In [28]:
# One-sample t-test of the polarities recorded for 'awesome' against the
# overall mean polarity. word_dict is a defaultdict, so looking up a word that
# never occurred silently inserts an empty list for it.
stats.ttest_1samp(word_dict['awesome'],mean_polarity)

Ttest_1sampResult(statistic=19.665499585596194, pvalue=4.596013320594047e-79)

In [29]:
# Select seed candidates: t-test every sufficiently frequent word against the
# overall mean polarity and keep (word, t-statistic, p-value) triples.
pool = []
for word, samples in word_dict.items():
    # Only words with at least 500 recorded occurrences are tested.
    if len(samples) >= 500:
        ttest = stats.ttest_1samp(samples, mean_polarity)
        pool.append((word, ttest.statistic, ttest.pvalue))

In [30]:
# Transpose the (word, t-statistic, p-value) triples into three parallel
# tuples. Raises ValueError if `pool` is empty (nothing to unpack).
words, tstats, pvals = zip(*pool)

In [31]:
# Inspect the p-values, one per pooled word, in pool order.
pvals

(1.3561926481768183e-109,
 0.02304359845972466,
 1.1873389755133715e-136,
 1.3835625072891844e-05,
 1.0116653944459284e-54,
 3.023740460136704e-82,
 7.418124021179201e-13,
 3.9694359611368966e-05,
 0.0,
 0.0,
 4.6080913539177784e-29,
 3.629254373381242e-46,
 0.08790720618365781,
 1.837913484032794e-55,
 2.4897801445935978e-15,
 1.7836341854833696e-118,
 1.0998301118345582e-154,
 4.107722421330805e-08,
 5.2061486706239166e-23,
 0.00015166704393666114,
 0.24944170192382412,
 0.22743184029655703,
 3.790202637980862e-33,
 3.687454021043798e-15,
 0.0,
 1.0772155202225417e-61,
 2.331378132248588e-108,
 0.42557608154094473,
 1.380743761391206e-17,
 1.7570683897277958e-104,
 3.6880800230079206e-31,
 1.6328867623243562e-38,
 9.549566513678539e-49,
 1.5816144371677048e-33,
 2.9993005694094385e-94,
 1.14453052200258e-59,
 7.669294850104932e-33,
 4.395330687948881e-153,
 4.711180908163966e-05,
 0.0,
 1.7564012594058958e-11,
 5.111393518776908e-151,
 4.6066236963448344e-17,
 3.865482257446332e-13,


In [34]:
# Keep only words whose p-value is below 1e-100, and tally how many of them
# lean positive (t > 0) versus everything else.
seeds = []
pos_count = 0
neg_count = 0
for word, tstat, pval in pool:
    # Written as `not (<)` so a NaN p-value is rejected, as before.
    if not (pval < 1e-100):
        continue
    seeds.append((word, tstat))
    pos_count += 1 if tstat > 0 else 0
    neg_count += 0 if tstat > 0 else 1

In [37]:
# Strongest evidence first: order seeds by the magnitude of their t-statistic.
sorted_seeds = sorted(
    seeds,
    key=lambda word_and_t: abs(word_and_t[1]),
    reverse=True,
)

In [38]:
# Take the same number of positive and negative seeds: the size of the
# smaller group, filled in order of decreasing |t|.
num_ = min(pos_count, neg_count)
res_pos, res_neg = [], []
for seed, pol in sorted_seeds:
    if pol > 0:
        if len(res_pos) < num_:
            res_pos.append(seed)
    elif pol < 0:
        if len(res_neg) < num_:
            res_neg.append(seed)
# Final tallies of how many seeds were kept on each side.
pos_ = len(res_pos)
neg_ = len(res_neg)
    

In [39]:
# Selected positive seed words, largest |t-statistic| first.
res_pos

['love',
 'comfortable',
 'perfect',
 'highly',
 'great',
 'perfectly',
 'compliments',
 'warm',
 'day',
 'loves',
 'recommend',
 'glad',
 'amazing',
 'best',
 'absolutely',
 'pair',
 'every',
 'happy',
 'stylish']

In [40]:
# Selected negative seed words, largest |t-statistic| first.
res_neg

['return',
 'returned',
 'stars',
 'returning',
 'disappointed',
 'back',
 'however',
 'unfortunately',
 'boot',
 'disappointing',
 'glue',
 'refund',
 'foot',
 'cheap',
 'small',
 'reason',
 'send',
 'sent',
 'uncomfortable']