In [1]:
import pandas as pd
import re

from sklearn.linear_model import LogisticRegression

In [2]:
# Rule tags looked up in the `href` attribute of the annotated history markup.
# Each tag doubles as the name of a DataFrame indicator column, so the
# spellings must stay exactly as they appear in the markup.
rules = [
    'uninflect',
    'lexical',
    'drop_aux',
    'dey/it',
    'negative_concord',
    'ass',
    'null_genetive',
    'null_relcl',
    'negative_inversion',
    'been_done',
    'got',
    'None',
]

In [3]:
# Read the CSV into `df`; the path is relative to the kernel's working directory.
df = pd.read_csv('runs/20/everything/bst_cornell_movie.csv')

In [4]:
def get_rules(txt_html, rule):
    """Report whether `txt_html` contains an anchor tag for `rule`.

    Spans are expected to be tagged as ``<a href='RULE' ...>``. The rule name
    is matched literally: it is escaped before being placed in the pattern, so
    regex metacharacters in a rule name (``.``, ``+``, ``(`` ...) cannot cause
    false positives or pattern errors.

    Args:
        txt_html: Annotated text to scan.
        rule: Rule name to look for inside the ``href`` attribute.

    Returns:
        1 if at least one ``<a href='<rule>'`` opening is present, otherwise 0.
    """
    pattern = f"<a href='{re.escape(rule)}'"
    return 1 if re.search(pattern, txt_html) is not None else 0

In [11]:
# Display the annotated history markup of the first row as a format sample.
df.iloc[0]['history_html']

"She<a href='negative_concord' title='1'><mark>'s</mark></a> <a href='negative_concord' title='2'><mark>not</mark></a>a...<|endoftext|>Lesbian?  <a href='None' title='3'><mark>No</mark></a>. I found a picture of Jared Leto in one of her drawers, so I'm pretty sure she's not harboring same-sex tendencies.<|endoftext|>So that's the kind of guy she <a href='uninflect' title='4'><mark>likes</mark></a>? Pretty ones?<|endoftext|>"

In [9]:
# Add one column per rule holding get_rules' 0/1 result for each row's
# annotated history. `args` forwards the rule name as the second argument.
for rule in rules:
    df[rule] = df['history_html'].apply(get_rules, args=(rule,))

In [10]:
def count_aave_features(s):
    """Count the annotated spans in `s`.

    A span is a non-overlapping, lazy match that starts at ``<a href=`` and
    ends at the next ``<mark>`` without crossing a newline. The href value is
    not inspected, so every tag is counted whatever rule it names.

    Args:
        s: Annotated text to scan.

    Returns:
        The number of matches found (0 when there are none).
    """
    span_opening = re.compile(r'<a href=.*?<mark>')
    return sum(1 for _ in span_opening.finditer(s))

In [11]:
def create_is_changed_label(lbl1, lbl2):
    """Encode how `lbl2` relates to `lbl1` as a small integer code.

    Args:
        lbl1: Reference label.
        lbl2: Label to compare against the reference.

    Returns:
        1 if the two labels are equal,
        0 if they differ and `lbl2` is ``'NEGATIVE'``,
        2 if they differ and `lbl2` is ``'POSITIVE'``,
        None for any other differing `lbl2`.
    """
    if lbl1 == lbl2:
        code = 1
    elif lbl2 == 'NEGATIVE':
        code = 0
    elif lbl2 == 'POSITIVE':
        code = 2
    else:
        # No code is defined for other labels; callers receive None.
        code = None
    return code

In [12]:
def create_is_more_neg_label(lbl1, lbl2):
    """Compare two numeric label codes and return a three-way result.

    Args:
        lbl1: First numeric code.
        lbl2: Second numeric code.

    Returns:
        1 ("more negative") when ``lbl1 - lbl2`` is at least 1,
        0 ("no change") when the two codes are equal,
        2 ("more positive") in every other case.
    """
    # The difference is evaluated first, then equality; anything left is 2.
    return 1 if lbl1 - lbl2 >= 1 else (0 if lbl1 == lbl2 else 2)

In [13]:
# Per-row count of annotated spans in the history markup (see count_aave_features).
df['num_aave_fts'] = df['history_html'].apply(count_aave_features)

In [14]:
# Change code for each groundtruth/generated label pair, computed row by row.
df['is_changed_sae'] = df.apply(
    lambda row: create_is_changed_label(
        row['roberta_large_groundtruth'],
        row['roberta_large_sae_gen'],
    ),
    axis=1,
)
df['is_changed_aave'] = df.apply(
    lambda row: create_is_changed_label(
        row['roberta_large_groundtruth_aave'],
        row['roberta_large_aave_gen'],
    ),
    axis=1,
)
# Must run after the two columns above: it compares their codes.
df['is_more_neg'] = df.apply(
    lambda row: create_is_more_neg_label(
        row['is_changed_sae'],
        row['is_changed_aave'],
    ),
    axis=1,
)

In [27]:
# Display the history markup of the first row whose `null_relcl` indicator is 1.
df[df['null_relcl'] == 1].iloc[0]['history_html']

'May I speak freely?<|endoftext|>You show no inclination to speak otherwise!<|endoftext|>I know what I see.  I see someone <a href=\'null_relcl\' title=\'1\'><mark>who</mark></a>doesn\'t accept <a href=\'None\' title=\'2\'><mark>the</mark></a><a href=\'negative_concord\' title=\'3\'><mark>world</mark></a> as it is.  Who<a href=\'negative_concord\' title=\'4\'><mark>\'s</mark></a> <a href=\'negative_concord\' title=\'5\'><mark>not</mark></a>afraid.  I see a women <a href=\'null_relcl\' title=\'6\'><mark>who</mark></a><a href=\'uninflect\' title=\'7\'><mark>thinks</mark></a>... "What if?"...<|endoftext|>'

In [15]:
# Display the `history_aave` text of the row at position 5033.
df.iloc[5033]['history_aave']

'Who are you?</s><s>We done hung out last night, remember?</s><s>I also remember you drove your car here. Who you really? Make no mistake, I gon shoot you and not feel bad about it. Who are you working for?</s>'

In [16]:
# Display the target column (1 = more negative, 0 = no change, 2 = more positive).
df['is_more_neg']

0       0
1       0
2       0
3       0
4       0
       ..
5029    0
5030    2
5031    2
5032    0
5033    1
Name: is_more_neg, Length: 5034, dtype: int64

In [17]:
# Features: the per-rule indicator columns followed by the annotation count.
X = df[rules + ['num_aave_fts']]
# Target: the three-way `is_more_neg` code.
y = df['is_more_neg']

# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument; confirm against the installed version before upgrading.
model = LogisticRegression(solver='lbfgs', multi_class='multinomial')

# The fitted result is kept under `clf`; later cells use both names.
clf = model.fit(X, y)

In [54]:
# Score on the same X and y the model was fit on, so this is not a held-out estimate.
model.score(X, y)

6.404449741756059e-01

In [56]:
# Print the feature names in the same column order used to build X, then the
# fitted coefficients.
# NOTE(review): presumably one coefficient row per class, ordered as clf.classes_ — confirm.
print(rules + ['num_aave_fts'])
print(clf.coef_)

['uninflect', 'lexical', 'drop_aux', 'dey/it', 'negative_concord', 'ass', 'null_genetive', 'null_relcl', 'negative_inversion', 'been_done', 'got', 'None', 'num_aave_fts']
[[-7.82207010e-02  8.68375177e-02 -4.98234782e-02 -7.10888135e-02
  -1.00199689e-01  0.00000000e+00 -3.41815208e-02  2.87920868e-02
   1.79519759e-01 -1.32668992e-01 -9.74710279e-02  1.40577530e-02
   1.70785931e-02]
 [ 3.31740458e-02  3.78066288e-02 -8.90447652e-02  1.19428028e-01
  -6.73774186e-02  0.00000000e+00  2.39226312e-02  5.90175019e-02
  -8.04226312e-01  1.00284378e-01 -2.89830119e-02  1.72530991e-01
  -1.71294409e-02]
 [ 4.50466552e-02 -1.24644147e-01  1.38868243e-01 -4.83392149e-02
   1.67577108e-01  0.00000000e+00  1.02588896e-02 -8.78095887e-02
   6.24706553e-01  3.23846143e-02  1.26454040e-01 -1.86588744e-01
   5.08478416e-05]]


In [None]:
# Display the fitted intercept terms.
clf.intercept_

array([ 0.97175135, -0.41508819, -0.55666316])