In [1]:
import numpy as np
import pandas as pd
import os
import tokenizers
import string
import torch
import transformers
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm
import re

from sklearn.model_selection import train_test_split

In [3]:
MAX_LEN = 192
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 8
EPOCHS = 5
ROBERTA_PATH = "../roberta-base"
TOKENIZER = tokenizers.ByteLevelBPETokenizer(
    vocab_file=f"{ROBERTA_PATH}/vocab.json", 
    merges_file=f"{ROBERTA_PATH}/merges.txt", 
    lowercase=True,
    add_prefix_space=True
)

In [4]:
class TweetModel(transformers.RobertaModel):
    def __init__(self, conf):
        super(TweetModel, self).__init__(conf)
        self.roberta = transformers.RobertaModel.from_pretrained(ROBERTA_PATH, config=conf)
        self.drop_out = nn.Dropout(0.1)
        self.l0 = nn.Linear(768 * 2, 2)
        torch.nn.init.normal_(self.l0.weight, std=0.02)
    
    def forward(self, ids, mask, token_type_ids):
        _, _, out = self.roberta(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )

        out = torch.cat((out[-1], out[-2]), dim=-1)
        out = self.drop_out(out)
        logits = self.l0(out)

        start_logits, end_logits = logits.split(1, dim=-1)

        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

In [195]:
def process_data(tweet, selected_text, sentiment, tokenizer, max_len):
    tweet = " " + " ".join(str(tweet).split())
    selected_text = " " + " ".join(str(selected_text).split())

    len_st = len(selected_text) - 1
    idx0 = None
    idx1 = None

    for ind in (i for i, e in enumerate(tweet) if e == selected_text[1]):
        if " " + tweet[ind: ind+len_st] == selected_text:
            idx0 = ind
            idx1 = ind + len_st - 1
            break

    char_targets = [0] * len(tweet)
    if idx0 != None and idx1 != None:
        for ct in range(idx0, idx1 + 1):
            char_targets[ct] = 1
    
    tok_tweet = tokenizer.encode(tweet)
    input_ids_orig = tok_tweet.ids
    tweet_offsets = tok_tweet.offsets
    
    target_idx = []
    for j, (offset1, offset2) in enumerate(tweet_offsets):
        if sum(char_targets[offset1: offset2]) > 0:
            target_idx.append(j)
    
    targets_start = target_idx[0]
    targets_end = target_idx[-1]

    sentiment_id = {
        'positive': 1313,
        'negative': 2430,
        'neutral': 7974
    }
    
    input_ids = [0] + [sentiment_id[sentiment]] + [2] + [2] + input_ids_orig + [2]
    token_type_ids = [0, 0, 0, 0] + [0] * (len(input_ids_orig) + 1)
    mask = [1] * len(token_type_ids)
    tweet_offsets = [(0, 0)] * 4 + tweet_offsets + [(0, 0)]
    targets_start += 4
    targets_end += 4

    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([1] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        tweet_offsets = tweet_offsets + ([(0, 0)] * padding_length)
    
    return {
        'ids': input_ids,
        'mask': mask,
        'token_type_ids': token_type_ids,
        'targets_start': targets_start,
        'targets_end': targets_end,
        'orig_tweet': tweet,
        'orig_selected': selected_text,
        'sentiment': sentiment,
        'offsets': tweet_offsets
    }


class TweetDataset:
    def __init__(self, tweet, sentiment, selected_text):
        self.tweet = tweet
        self.sentiment = sentiment
        self.selected_text = selected_text
        self.tokenizer = TOKENIZER
        self.max_len = MAX_LEN
    
    def __len__(self):
        return len(self.tweet)

    def __getitem__(self, item):
        data = process_data(
            self.tweet[item], 
            self.selected_text[item], 
            self.sentiment[item],
            self.tokenizer,
            self.max_len
        )

        return {
            'ids': torch.tensor(data["ids"], dtype=torch.long),
            'mask': torch.tensor(data["mask"], dtype=torch.long),
            'token_type_ids': torch.tensor(data["token_type_ids"], dtype=torch.long),
            'targets_start': torch.tensor(data["targets_start"], dtype=torch.long),
            'targets_end': torch.tensor(data["targets_end"], dtype=torch.long),
            'orig_tweet': data["orig_tweet"],
            'orig_selected': data["orig_selected"],
            'sentiment': data["sentiment"],
            'offsets': torch.tensor(data["offsets"], dtype=torch.long)
        }

In [13]:
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def calculate_jaccard_score(
    original_tweet, 
    target_string, 
    sentiment_val, 
    idx_start, 
    idx_end, 
    offsets,
    verbose=False):
    
    if idx_end < idx_start:
            idx_end = idx_start
    
    filtered_output  = ""
    for ix in range(idx_start, idx_end + 1):
        filtered_output += original_tweet[offsets[ix][0]: offsets[ix][1]]
        if (ix+1) < len(offsets) and offsets[ix][1] < offsets[ix+1][0]:
            filtered_output += " "
# ====
    filtered_output = filtered_output.replace(" .", ".")
    filtered_output = filtered_output.replace(" ?", "?")
    filtered_output = filtered_output.replace(" !", "!")
    filtered_output = filtered_output.replace(" ,", ",")
    filtered_output = filtered_output.replace(" ' ", "'")
    filtered_output = filtered_output.replace(" n't", "n't")
    filtered_output = filtered_output.replace(" 'm", "'m")
    filtered_output = filtered_output.replace(" do not", " don't")
    filtered_output = filtered_output.replace(" 's", "'s")
    filtered_output = filtered_output.replace(" 've", "'ve")
    filtered_output = filtered_output.replace(" 're", "'re")
# ====
    if sentiment_val == "neutral" or len(original_tweet.split()) < 2:
        filtered_output = original_tweet

    
#     if idx_end < idx_start:
#         idx_end = idx_start
    
#     filtered_output  = ""
#     for ix in range(idx_start, idx_end + 1):
#         filtered_output += original_tweet[offsets[ix][0]: offsets[ix][1]]
#         if (ix+1) < len(offsets) and offsets[ix][1] < offsets[ix+1][0]:
#             filtered_output += " "

#     if sentiment_val == "neutral" or len(original_tweet.split()) < 2:
#         filtered_output = original_tweet

#     if sentiment_val != "neutral" and verbose == True:
#         if filtered_output.strip().lower() != target_string.strip().lower():
#             print("********************************")
#             print(f"Output= {filtered_output.strip()}")
#             print(f"Target= {target_string.strip()}")
#             print(f"Tweet= {original_tweet.strip()}")
#             print("********************************")

#     jac = 0
    jac = jaccard(target_string.strip(), filtered_output.strip())
    return jac, filtered_output

In [7]:
df_test = pd.read_csv("input/train.csv")
df_test.loc[:, "selected_text"] = df_test.text.values

In [85]:
df_test

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD I will miss you here in San Diego!!!,negative
2,088c60f138,my boss is bullying me...,my boss is bullying me...,negative
3,9642c003ef,what interview! leave me alone,what interview! leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****, why couldn`t they put them on t...",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,wish we could come see u on Denver husband l...,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,I`ve wondered about rake to. The client has ...,negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you. Enjoy the break - y...,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [8]:
device = torch.device("cuda")
model_config = transformers.RobertaConfig.from_pretrained(ROBERTA_PATH)
model_config.output_hidden_states = True

In [9]:
# import gc
# gc.collect()
# torch.cuda.empty_cache()

In [10]:
model_path = 'src/bins_1104/'

model1 = TweetModel(conf=model_config)
model1.to(device)
model1.load_state_dict(torch.load(model_path+"model_0.bin"))
model1.eval()

model2 = TweetModel(conf=model_config)
model2.to(device)
model2.load_state_dict(torch.load(model_path+"model_1.bin"))
model2.eval()

model3 = TweetModel(conf=model_config)
model3.to(device)
model3.load_state_dict(torch.load(model_path+"model_2.bin"))
model3.eval()

model4 = TweetModel(conf=model_config)
model4.to(device)
model4.load_state_dict(torch.load(model_path+"model_3.bin"))
model4.eval()

model5 = TweetModel(conf=model_config)
model5.to(device)
model5.load_state_dict(torch.load(model_path+"model_4.bin"))
model5.eval()

TweetModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplac

In [11]:
final_output = []
jaccard_scores = []

In [14]:
test_dataset = TweetDataset(
        tweet=df_test.text.values,
        sentiment=df_test.sentiment.values,
        selected_text=df_test.selected_text.values
    )

data_loader = torch.utils.data.DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=VALID_BATCH_SIZE,
    num_workers=1
)


with torch.no_grad():
    tk0 = tqdm(data_loader, total=len(data_loader))
    for bi, d in enumerate(tk0):
        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        sentiment = d["sentiment"]
        orig_selected = d["orig_selected"]
        orig_tweet = d["orig_tweet"]
        targets_start = d["targets_start"]
        targets_end = d["targets_end"]
        offsets = d["offsets"].numpy()

        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets_start = targets_start.to(device, dtype=torch.long)
        targets_end = targets_end.to(device, dtype=torch.long)

        outputs_start1, outputs_end1 = model1(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )
        
        outputs_start2, outputs_end2 = model2(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )
        
        outputs_start3, outputs_end3 = model3(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )
        
        outputs_start4, outputs_end4 = model4(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )
        
        outputs_start5, outputs_end5 = model5(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )
        outputs_start = (outputs_start1 + outputs_start2 + outputs_start3 + outputs_start4 + outputs_start5) / 5
        outputs_end = (outputs_end1 + outputs_end2 + outputs_end3 + outputs_end4 + outputs_end5) / 5
        
        outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
        outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()
#         jaccard_scores = []
        for px, tweet in enumerate(orig_tweet):
            selected_tweet = orig_selected[px]
            tweet_sentiment = sentiment[px]
            js, output_sentence = calculate_jaccard_score( #output_sentence
                original_tweet=tweet,
                target_string=selected_tweet,
                sentiment_val=tweet_sentiment,
                idx_start=np.argmax(outputs_start[px, :]),
                idx_end=np.argmax(outputs_end[px, :]),
                offsets=offsets[px]
            )
            final_output.append(output_sentence)
            jaccard_scores.append(js)


  0%|          | 0/3436 [00:00<?, ?it/s][A
  0%|          | 1/3436 [00:00<19:08,  2.99it/s][A
  0%|          | 2/3436 [00:00<17:59,  3.18it/s][A
  0%|          | 3/3436 [00:00<17:01,  3.36it/s][A
  0%|          | 4/3436 [00:01<16:20,  3.50it/s][A
  0%|          | 5/3436 [00:01<15:53,  3.60it/s][A
  0%|          | 6/3436 [00:01<15:35,  3.67it/s][A
  0%|          | 7/3436 [00:01<15:24,  3.71it/s][A
  0%|          | 8/3436 [00:02<15:15,  3.74it/s][A
  0%|          | 9/3436 [00:02<15:09,  3.77it/s][A
  0%|          | 10/3436 [00:02<15:05,  3.78it/s][A
  0%|          | 11/3436 [00:02<15:02,  3.80it/s][A
  0%|          | 12/3436 [00:03<15:01,  3.80it/s][A
  0%|          | 13/3436 [00:03<14:59,  3.81it/s][A
  0%|          | 14/3436 [00:03<14:45,  3.87it/s][A
  0%|          | 15/3436 [00:03<14:36,  3.90it/s][A
  0%|          | 16/3436 [00:04<14:29,  3.93it/s][A
  0%|          | 17/3436 [00:04<14:23,  3.96it/s][A
  1%|          | 18/3436 [00:04<14:19,  3.98it/s][A
  1%|     

In [15]:
sample = pd.read_csv("input/train.csv")
sample.loc[:, 'selected_text_model'] = final_output
sample.loc[:, 'jaccard'] = jaccard_scores
sample.to_csv("src/bins_1104/train_result_jaccard_filtered.csv", index=False)

In [3]:
tmp = pd.read_csv("src/roberta_base_1104/train_result_jaccard.csv")
# tmp_f = pd.read_csv("src/bins_1104/train_result_jaccard_filtered.csv")

In [4]:
tmp.head(5)

Unnamed: 0,textID,text,selected_text,sentiment,selected_text_model,jaccard
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"I`d have responded, if I were going",1.0
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,Sooo SAD,0.2
2,088c60f138,my boss is bullying me...,bullying me,negative,bullying,0.2
3,9642c003ef,what interview! leave me alone,leave me alone,negative,leave me alone,0.6
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"Sons of ****,",0.214286


In [25]:
tmp.loc[tmp[(tmp.jaccard < 0.8)].index, ['selected_text', 'selected_text_model', 'sentiment']].sentiment.value_counts()

positive    7792
negative    6932
Name: sentiment, dtype: int64

In [44]:
wrong_neutral={}
for i, j in tmp[tmp.sentiment == 'neutral'].loc[:, ['text', 'selected_text']].iterrows():
    if len(str(j[0]).split()) != len(str(j[1]).split()):
        wrong_neutral[i] = len(str(j[0]).split()) - len(str(j[1]).split())

In [74]:
tmp['cnt_wrong_neutral'] = tmp.index.map(wrong_neutral)

In [87]:
df_wrong_neutral = tmp.dropna().copy()

In [125]:
for i in df_wrong_neutral.index:
    res = (set(df_wrong_neutral.loc[i, 'selected_text_model'].split()) - set(df_wrong_neutral.loc[i, 'selected_text'].split()))
#     print(', '.join(list(res)))
    df_wrong_neutral.loc[i, 'wrong_neutral_words'] = ', '.join(list(res))

In [158]:
tmp[tmp.cnt_wrong_neutral.isna() & (tmp.sentiment=='neutral')]['selected_text'].str.match('x').sum()

1

In [160]:
from collections import Counter # 87 http

c=Counter()
for word in df_wrong_neutral.wrong_neutral_words.tolist():
     c[word] += 1

c.most_common(20) 

[('x', 19),
 ('?', 13),
 ('', 9),
 ('****', 8),
 (':', 8),
 ('lol', 7),
 ('-', 4),
 ('!', 4),
 ('<3', 4),
 ('http://tweet.sg, -', 3),
 (';)', 3),
 ('_carter', 3),
 (':/', 3),
 ('it', 2),
 ('_s', 2),
 ('_Lia', 2),
 ('#fb', 2),
 ('??', 2),
 ('_1210', 2),
 ('http://bit.ly/AF8JT', 2)]

In [65]:
tmp.groupby('sentiment')['jaccard'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
negative,7781.0,0.995939,0.055909,0.0,1.0,1.0,1.0,1.0
neutral,11118.0,0.999795,0.007239,0.571429,1.0,1.0,1.0,1.0
positive,8582.0,0.995801,0.05708,0.0,1.0,1.0,1.0,1.0


In [8]:
import sys
sys.path.insert(0, "/home/pchlq/workspace/models_nlp/sentencepiece-pb2.py")

In [9]:
import os
import sentencepiece as spm
import sentencepiece_pb2

In [4]:
sp = spm.SentencePieceProcessor()
model_path = "/home/pchlq/workspace/models_nlp/albert-xxlarge-v1/"
sp.load(os.path.join(model_path, "spiece.model"))

True

In [10]:
class SentencePieceTokenizer:
    def __init__(self, model_path):
        self.sp = spm.SentencePieceProcessor()
        self.sp.load(os.path.join(model_path, "spiece.model"))
    
    def encode(self, sentence):
        spt = sentencepiece_pb2.SentencePieceText()
        spt.ParseFromString(self.sp.encode_as_serialized_proto(sentence))
        offsets = []
        tokens = []
        for piece in spt.pieces:
            tokens.append(piece.id)
            offsets.append((piece.begin, piece.end))
        return tokens, offsets

In [11]:
spt = SentencePieceTokenizer(model_path)

In [17]:
spt.encode("neutral")[0][0]

8387

# score analysis

In [18]:
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [104]:
for i, j in tmp.iterrows():
    try: 
        tmp.loc[i, 'jaccard'] = (jaccard(j['selected_text'], j['selected_text_model']))
    except:
        tmp.loc[i, 'jaccard'] = np.nan
        print(i)

314


In [109]:
tmp.iloc[314, :]

textID                 fdb77c3752
text                          NaN
selected_text                 NaN
sentiment                 neutral
selected_text_model           nan
jaccard                       NaN
Name: 314, dtype: object

In [105]:
tmp.head()

Unnamed: 0,textID,text,selected_text,sentiment,selected_text_model,jaccard
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"I`d have responded, if I were going",1.0
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,Sooo SAD,1.0
2,088c60f138,my boss is bullying me...,bullying me,negative,bullying,0.5
3,9642c003ef,what interview! leave me alone,leave me alone,negative,leave me alone,1.0
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"Sons of ****,",1.0


In [107]:
tmp.sentiment.value_counts()

neutral     11118
positive     8582
negative     7781
Name: sentiment, dtype: int64

In [106]:
tmp.groupby('sentiment')['jaccard'].mean()

sentiment
negative    0.612771
neutral     0.976445
positive    0.599159
Name: jaccard, dtype: float64

# neutral

In [117]:
tmp.loc[tmp.sentiment=='neutral', ['text', 'selected_text', 'selected_text_model', 'jaccard']].sort_values(by='jaccard').head(20)

Unnamed: 0,text,selected_text,selected_text_model,jaccard
18142,wow.???? ??????,wow.,wow.???? ??????,0.0
24069,G`night!,G`night,G`night!,0.0
26230,check out review for the movie Fighting - htt...,Hilarious,check out review for the movie Fighting - htt...,0.0
13965,hï¿½rlich!,rlich!,hï¿½rlich!,0.0
25691,@_Cheshire_Cat_,_Cheshire_Cat_,@_Cheshire_Cat_,0.0
3263,_INTHEMAKING :aw,aw,_INTHEMAKING :aw,0.0
24210,BYEEEEE!!!,BYEEEEE!!,BYEEEEE!!!,0.0
12565,Is it bad that I`m sitting here watching a #P...,bad,Is it bad that I`m sitting here watching a #P...,0.045455
22458,"first up, make up for lost time with jelly. Ja...",Happeh,"first up, make up for lost time with jelly. J...",0.05
22815,I am the queen of losing things. Important thi...,losing,I am the queen of losing things. Important th...,0.071429


In [169]:
neutral = tmp.loc[tmp.sentiment=='neutral'].dropna().copy()

In [193]:
# regex = re.compile('[@_!#$%^&*()<>?/\|}{~:]')
regex = re.compile('[hï¿½]')
neutral[neutral.text.str.contains(regex)].sort_values(by='jaccard')[:20]

Unnamed: 0,textID,text,selected_text,sentiment,selected_text_model,jaccard
26230,01cf51125c,check out review for the movie Fighting - htt...,Hilarious,neutral,check out review for the movie Fighting - htt...,0.0
24069,bd085c659b,G`night!,G`night,neutral,G`night!,0.0
13965,9d57d8f6d6,hï¿½rlich!,rlich!,neutral,hï¿½rlich!,0.0
25691,f7286fdad8,@_Cheshire_Cat_,_Cheshire_Cat_,neutral,@_Cheshire_Cat_,0.0
12565,5d80bf1e55,Is it bad that I`m sitting here watching a #P...,bad,neutral,Is it bad that I`m sitting here watching a #P...,0.045455
22458,a99c5a9003,"first up, make up for lost time with jelly. Ja...",Happeh,neutral,"first up, make up for lost time with jelly. J...",0.05
4141,f1d8f49520,"Morning John, yes I do, however I have a diar...",sadly,neutral,"Morning John, yes I do, however I have a diar...",0.071429
22815,f782648201,I am the queen of losing things. Important thi...,losing,neutral,I am the queen of losing things. Important th...,0.071429
9907,fc3b643847,"Beer, garden, IPOD, Friday night, shame I`ve g...",shame,neutral,"Beer, garden, IPOD, Friday night, shame I`ve ...",0.090909
5053,cc443795be,Sick Right now very thankful for my chicken s...,Sick,neutral,Sick Right now very thankful for my chicken s...,0.090909


In [194]:
tmp.loc[27471, 'text']

'i`m defying gravity. and nobody in alll of oz, no wizard that there is or was, is ever gonna bring me down'

In [140]:
neutral.loc[neutral.text.str.match(r'\w+\?').index]

Unnamed: 0,textID,text,selected_text,sentiment,selected_text_model,jaccard
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"I`d have responded, if I were going",1.000000
5,28b57f3990,http://www.dothebouncy.com/smf - some shameles...,http://www.dothebouncy.com/smf - some shameles...,neutral,http://www.dothebouncy.com/smf - some shamele...,1.000000
7,50e14c0bb8,Soooo high,Soooo high,neutral,Soooo high,1.000000
8,e050245fbd,Both of you,Both of you,neutral,Both of you,1.000000
10,2339a9b08b,"as much as i love to be hopeful, i reckon the...","as much as i love to be hopeful, i reckon the ...",neutral,"as much as i love to be hopeful, i reckon the...",1.000000
...,...,...,...,...,...,...
27468,a753a93e45,"few grilled mushrooms and olives, feta cheese ...","few grilled mushrooms and olives, feta cheese ...",neutral,"few grilled mushrooms and olives, feta cheese...",1.000000
27469,ac92790d8b,94 more days till BH comes back to LA,94 more days till BH comes back to LA,neutral,94 more days till BH comes back to LA,1.000000
27471,15bb120f57,"i`m defying gravity. and nobody in alll of oz,...","i`m defying gravity. and nobody in alll of oz,...",neutral,i`m defying gravity. and nobody in alll of oz...,1.000000
27473,a208770a32,in spoke to you yesterday and u didnt respond...,in spoke to you yesterday and u didnt respond ...,neutral,in spoke to you yesterday and u didnt respond...,1.000000


# negative

In [187]:
negative = tmp.loc[tmp.sentiment=='negative'].dropna().copy()

In [192]:
negative.loc[(tmp.jaccard < .5), ['text', 'selected_text', 'selected_text_model', 'jaccard']].sort_values(by='jaccard')[20:50] # jac_1=722

Unnamed: 0,text,selected_text,selected_text_model,jaccard
14061,lol what bothers me is that i`m messing with ...,i`m messing with my metabolism,bothers,0.0
20771,i`m so sick bad throat and the WORST toothac...,e WORST,i`m so sick,0.0
11248,Is getting upset at work cus bindz and j are b...,they won`t let me go,upset,0.0
20611,WHAT THE HELL IS GOING ON?!?! Last night and t...,this morning SUCKED..,SUCKED...,0.0
20614,hate this **** cold. i can`t stop sneezing. an...,****,hate,0.0
24502,Drinking and smoking is very bad.---but im gro...,is very bad.-,bad.,0.0
24490,http://www.CultureShockMag.com shoutz 2 the...,na b nervvoouus,nervvoouuss,0.0
14002,doing my english essay (on r&j...wtf) that i s...,wtf),...wtf),0.0
24418,gahh!! i`m so tired right now.,gahh!!,tired,0.0
20677,*sigh* Off 2 bed 2 try 2 get these crummy 2 hr...,bite..,crummy,0.0
