In [1]:
import numpy as np 
import pandas as pd 

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import KFold, train_test_split
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt 
import matplotlib.cm as cm 
import seaborn as sns 

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer, word_tokenize

from collections import defaultdict

from lazypredict.Supervised import LazyClassifier

import imblearn

from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import flair 

from snorkel.labeling import labeling_function, PandasLFApplier, LFAnalysis, filter_unlabeled_dataframe
from snorkel.analysis import get_label_buckets
from snorkel.preprocess import preprocessor 
from snorkel.preprocess.nlp import SpacyPreprocessor
from snorkel.labeling.model import MajorityLabelVoter, LabelModel
from snorkel.utils import probs_to_preds



In [2]:
df = pd.read_csv('../data/processed/tanishq_data_clean_labelled.csv')
df.drop(['Unnamed: 0', 'Unnamed: 0.1'], inplace=True, axis=1)
lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer()
df['clean_tweet_token'] = df['clean_tweet_segmented'].apply(lemmatizer.lemmatize).apply(tokenizer.tokenize)
df['clean_tweet_token'] = df['clean_tweet_token'].str.join(' ')
df = df[['clean_tweet_token', 'sentiment']]


In [3]:
df_ground_truth = df[df['sentiment'].isin([0.0, 4.0])]
df_ground_truth['sentiment'] = df_ground_truth['sentiment'].replace({4.0: 1, 0.0: -1})
df_ground_truth_generator, df_ground_truth_discriminator = train_test_split(df_ground_truth, test_size=0.25, stratify=df_ground_truth['sentiment'])

In [4]:
df_unlabelled = df[df['sentiment'].isin([np.nan, 10.0])]
df_unlabelled.drop('sentiment', axis=1, inplace=True)
del df 

In [5]:
# Labeling function

# Textblob 
@preprocessor(memoize=True)
def textblob_sentiment(x):
    scores = TextBlob(x.clean_tweet_token)
    x.tb_polarity = scores.sentiment.polarity
    x.tb_subjectivity = scores.sentiment.subjectivity
    return x 

@labeling_function(pre=[textblob_sentiment])
def textblob_polarity(x):
    return 1 if x.tb_polarity > 0 else -1

# Vader
@preprocessor(memoize=True)
def vader_sentiment(x):
    analyzer = SentimentIntensityAnalyzer()
    scores = analyzer.polarity_scores(x.clean_tweet_token)
    x.vd_polarity = scores['compound']
    return x 

@labeling_function(pre=[vader_sentiment])
def vader_polarity(x):
    return 1 if x.vd_polarity > 0 else -1

# Flair
flair_sent = flair.models.TextClassifier.load('en-sentiment')
@preprocessor(memoize=True)
def flair_sentiment(x):
    s = flair.data.Sentence(x.clean_tweet_token)
    flair_sent.predict(s)
    x.fl_polarity =  s.get_label_names()[0].lower()
    return x 

@labeling_function(pre=[flair_sentiment])
def flair_polarity(x):
    return 1 if x.fl_polarity == 'positive' else -1



2021-02-05 00:03:30,580 loading file /Users/mamu867/.flair/models/sentiment-en-mix-distillbert_3.1.pt


In [6]:
lfs = [textblob_polarity, vader_polarity, flair_polarity]

applier = PandasLFApplier(lfs)
L_train = applier.apply(df_unlabelled.sample(3000, random_state=42))

100%|██████████| 3000/3000 [01:34<00:00, 31.58it/s]


In [7]:
LFAnalysis(L_train, lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
textblob_polarity,0,[1],0.17,0.09,0.0
vader_polarity,1,[1],0.18,0.1,0.0
flair_polarity,2,[1],0.03,0.02,0.0


In [8]:
majority_model = MajorityLabelVoter()
preds_train = majority_model.predict(L=L_train)

In [9]:
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, lr=0.001, log_freq=100, seed=123)

In [12]:
L_test = applier.apply(df=df_ground_truth_generator)

majority_accuracy = majority_model.score(L=L_test, Y=df_ground_truth_generator['sentiment'], tie_break_policy="random", metrics=['accuracy'])['accuracy']

print(f"\n{'Majority Vote Accuracy:':<25} {majority_accuracy * 100:0.1f}%")

label_model_accuracy = label_model.score(L=L_test,Y=df_ground_truth_generator['sentiment'], tie_break_policy="random", metrics=['accuracy'])['accuracy']

print(f"{'Label Model Accuracy:':<25} {label_model_accuracy * 100:0.1f}%")

100%|██████████| 275/275 [00:00<00:00, 24663.41it/s]
Majority Vote Accuracy:   100.0%
Label Model Accuracy:     100.0%



In [13]:
buckets = get_label_buckets(L_train[:, 0], L_train[:, 1], L_train[:, 2])
df_unlabelled.iloc[buckets[(1, 1, 1)]].sample(10)

Unnamed: 0,clean_tweet_token
791,boycott tanishq boycott tanishq boycott tanish...
1130,boycott amazon boycott tanishq jewelry boycott...
1552,tanishq hai ki sudhar ne ka naam hi nahin let ...
1349,boycott bollywood boycott kbc boycott bollywoo...
728,boycott tanishq jewelry boycott tan is q
3148,must read and don t forget to boycott tanishq
2416,boycott tanishq trends again tanishq ad in con...
3090,boycott tanishq
1457,boycott tanishq
2003,jewellery brand has again run into a controver...


In [12]:
L_train

array([[ 1, -1, -1],
       [ 1, -1, -1],
       [-1, -1, -1],
       ...,
       [-1,  1, -1],
       [-1, -1, -1],
       [-1, -1, -1]])