In [1]:
import numpy as np 
import pandas as pd 
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, precision_recall_curve, average_precision_score, f1_score, roc_auc_score, plot_precision_recall_curve, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN 
from sklearn.ensemble import VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC, LinearSVC, NuSVC

import xgboost
import lightgbm

import matplotlib.pyplot as plt 
import matplotlib.cm as cm 
import seaborn as sns 

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer, word_tokenize

from collections import defaultdict

from lazypredict.Supervised import LazyClassifier

import imblearn

from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import flair 

from snorkel.labeling import labeling_function, PandasLFApplier, LFAnalysis, filter_unlabeled_dataframe
from snorkel.analysis import get_label_buckets
from snorkel.preprocess import preprocessor 
from snorkel.preprocess.nlp import SpacyPreprocessor
from snorkel.labeling.model import MajorityLabelVoter, LabelModel
from snorkel.utils import probs_to_preds



### Steps we will perform in this exercise:

1. Label a small number of samples (df_ground_truth)
2. Use weak supervision (Generator) to label the unlabelled samples (df_unlabelled)
3. Use a supervised model based on ground truth labels and weak supervised labels to classify the sentiment (Discriminator)

## Weak supervision

To generate pseudolabel, we have 4 common types of labeling functions:

1. Hard-coded heuristics (in our problem, boycott tanishq is an obvious giveaway; however, (reject/don't support) boycott tanishq may be an indicaor of an opposite sentiment). We will try to label based on presence of this phrases in the tweet; otherwise we will abstain from labeling the tweet.
2. Syntactics: Spacy's dependency trees can be a very good starting point for generating some more labels.
3. Distant supervision: textblob, flair, vader sentiment analyzer.
4. External models: other models that can generate some good labels.

In [2]:
df = pd.read_csv('../data/processed/tanishq_data_clean_labelled.csv')
df.drop(['Unnamed: 0', 'Unnamed: 0.1'], inplace=True, axis=1)
lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer()
df['clean_tweet_token'] = df['clean_tweet_segmented'].apply(lemmatizer.lemmatize).apply(tokenizer.tokenize)
df['clean_tweet_token'] = df['clean_tweet_token'].str.join(' ')
df = df[['clean_tweet_token', 'sentiment']]


In [3]:
df_ground_truth = df[df['sentiment'].isin([0.0, 4.0])]
df_ground_truth['sentiment'] = df_ground_truth['sentiment'].replace({4.0: 1, 0.0: -1})
df_ground_truth_generator, df_ground_truth_discriminator = train_test_split(df_ground_truth, test_size=0.25, stratify=df_ground_truth['sentiment'])

In [4]:
df_unlabelled = df[df['sentiment'].isin([np.nan, 10.0])]
df_unlabelled.drop('sentiment', axis=1, inplace=True)
del df 

In [5]:
#pd.set_option('display.max_colwidth', 0)
#df_ground_truth[df_ground_truth['sentiment'] == -1.00]

In [6]:
# Labeling function

# Textblob 
@preprocessor(memoize=True)
def textblob_sentiment(x):
    scores = TextBlob(x.clean_tweet_token)
    x.tb_polarity = scores.sentiment.polarity
    x.tb_subjectivity = scores.sentiment.subjectivity
    return x 

@labeling_function(pre=[textblob_sentiment])
def textblob_polarity(x):
    return 1 if x.tb_polarity > 0 else -1

# Vader
@preprocessor(memoize=True)
def vader_sentiment(x):
    analyzer = SentimentIntensityAnalyzer()
    scores = analyzer.polarity_scores(x.clean_tweet_token)
    x.vd_polarity = scores['compound']
    return x 

@labeling_function(pre=[vader_sentiment])
def vader_polarity(x):
    return 1 if x.vd_polarity > 0 else -1

# Flair
flair_sent = flair.models.TextClassifier.load('en-sentiment')
@preprocessor(memoize=True)
def flair_sentiment(x):
    s = flair.data.Sentence(x.clean_tweet_token)
    flair_sent.predict(s)
    x.fl_polarity =  s.get_label_names()[0].lower()
    return x 

@labeling_function(pre=[flair_sentiment])
def flair_polarity(x):
    return 1 if x.fl_polarity == 'positive' else -1

# Positive sentiments about Tanishq
search = r"(support tanishq | ek at vam)"

@labeling_function()
def positive_tanishq(x):
    return 1 if re.search(search, x.clean_tweet_token, flags=re.I) else 0

# Negative sentiments about Tanishq
search = r"(boycott tanishq | boycott bollywood | boycott amazon | hindu | offended | local | not respect | tradition | hindus | teach | losing | trust)"

@labeling_function()
def negative_tanishq(x):
    return -1 if re.search(search, x.clean_tweet_token, flags=re.I) else 0




2021-02-18 16:16:32,511 loading file /Users/mamu867/.flair/models/sentiment-en-mix-distillbert_3.1.pt


In [7]:
df_ground_truth_generator

Unnamed: 0,clean_tweet_token,sentiment
652,kick out your ad film makers and directors or ...,-1.00
1212,why always sanatan culture on target and get d...,-1.00
598,time to boycott tanishq again boycott tanishq ...,-1.00
2195,so this dhanteras stop buying jewellery frm ta...,-1.00
1437,waiting for an ad by on bakr i d where the act...,-1.00
...,...,...
2150,boycott tanishq you are losing all your trust ...,-1.00
432,can you please stop this nonsense by one of yo...,-1.00
1112,anti hindu tanishq marketing their products us...,-1.00
266,you don t hold any responsibility sir for hurt...,-1.00


In [8]:
count_vec = CountVectorizer()
X_count_vec_gen = count_vec.fit_transform(df_ground_truth_generator['clean_tweet_token'])
X_count_vec_dis = count_vec.transform(df_ground_truth_discriminator['clean_tweet_token'])
y_gen = df_ground_truth_generator['sentiment'].values
y_dis = df_ground_truth_discriminator['sentiment'].values

In [9]:
lr = LogisticRegression(random_state=42)
knn = KNN()
dt = DecisionTreeClassifier(random_state=42)
xgb = xgboost.XGBClassifier()
lgb = lightgbm.LGBMClassifier(random_state=42)
lda = LinearDiscriminantAnalysis()
svc = SVC(probability=True, random_state=42)
lin_svc = LinearSVC(random_state=42)

classifiers = {'lr': lr, 'knn': knn, 'dt': dt,  'lda': lda, 'svc': svc, 'xgb': xgb, 'lgb': lgb, 'lin_svc': lin_svc}

for clf in classifiers:
    classifiers[clf].fit(X_count_vec_gen.toarray(), y_gen)

@labeling_function()
def lr_label(x):
    return classifiers['lr'].predict(count_vec.transform(x).toarray())

@labeling_function()
def knn_label(x):
    return classifiers['knn'].predict(count_vec.transform(x).toarray())

@labeling_function()
def dt_label(x):
    return classifiers['dt'].predict(count_vec.transform(x).toarray())

@labeling_function()
def xgb_label(x):
    return classifiers['xgb'].predict(count_vec.transform(x).toarray())

@labeling_function()
def lgb_label(x):
    return classifiers['lgb'].predict(count_vec.transform(x).toarray())

@labeling_function()
def lda_label(x):
    return classifiers['lda'].predict(count_vec.transform(x).toarray())

@labeling_function()
def svc_label(x):
    return classifiers['svc'].predict(count_vec.transform(x).toarray())

@labeling_function()
def lin_svc_label(x):
    return classifiers['lin_svc'].predict(count_vec.transform(x).toarray())


In [10]:
lfs = [textblob_polarity, vader_polarity, flair_polarity, positive_tanishq, negative_tanishq, lr_label, knn_label, dt_label, xgb_label, lgb_label, lda_label, svc_label, lin_svc_label]

applier = PandasLFApplier(lfs)
df_unlabelled = df_unlabelled.sample(1500, random_state=42)
L_train = applier.apply(df_unlabelled)

100%|██████████| 1500/1500 [01:11<00:00, 21.06it/s]


In [11]:
LFAnalysis(L_train, lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
textblob_polarity,0,[1],0.16,0.16,0.04
vader_polarity,1,[1],0.18,0.18,0.05
flair_polarity,2,[1],0.02,0.02,0.01
positive_tanishq,3,"[0, 1]",1.0,0.83,0.43
negative_tanishq,4,[0],0.48,0.48,0.43
lr_label,5,[1],0.0,0.0,0.0
knn_label,6,[],0.0,0.0,0.0
dt_label,7,[1],0.01,0.01,0.01
xgb_label,8,[1],0.0,0.0,0.0
lgb_label,9,[1],0.01,0.01,0.0


In [12]:
majority_model = MajorityLabelVoter()
preds_train = majority_model.predict(L=L_train)

In [13]:
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, lr=0.001, log_freq=100, seed=123)

In [14]:
L_test = applier.apply(df=df_ground_truth_generator[['clean_tweet_token']])

majority_accuracy = majority_model.score(L=L_test, Y=df_ground_truth_generator['sentiment'], tie_break_policy="random", metrics=['accuracy'])['accuracy']

print(f"\n{'Majority Vote Accuracy:':<25} {majority_accuracy * 100:0.1f}%")

label_model_accuracy = label_model.score(L=L_test,Y=df_ground_truth_generator['sentiment'], tie_break_policy="random", metrics=['accuracy'])['accuracy']

print(f"{'Label Model Accuracy:':<25} {label_model_accuracy * 100:0.1f}%")

100%|██████████| 275/275 [00:20<00:00, 13.13it/s]
Majority Vote Accuracy:   100.0%
Label Model Accuracy:     69.2%



In [15]:
buckets = get_label_buckets(L_train[:, 0], L_train[:, 1], L_train[:, 2])
df_unlabelled.iloc[buckets[(1, 1, 1)]].sample(10)

Unnamed: 0,clean_tweet_token
4833,from next year every hindu shud advice those p...
5516,do not waste you energy on anti hindus anti hi...
1755,celebrate diwali with this great bomb and give...
2726,the most popular hobbies around the world we w...
3805,so true let s boycott tanishq
4969,tanishq is brand of tata co amp i really respe...
8011,enjoy diwali with crackers it s hindu indian f...
6619,hindus please burst as many crackers as possib...
2173,there is no crackers ban in haryana hours up m...
7082,hey tanishq pr team yesterday we did gold shop...


In [16]:
y_ground_truth_pred = majority_model.predict(L_test)
print(classification_report(df_ground_truth_generator['sentiment'].values, y_ground_truth_pred))

              precision    recall  f1-score   support

        -1.0       1.00      0.02      0.04       262
         0.0       0.00      0.00      0.00         0
         1.0       0.05      1.00      0.10        13

    accuracy                           0.07       275
   macro avg       0.35      0.34      0.05       275
weighted avg       0.96      0.07      0.04       275



In [35]:
X_unlabelled = count_vec.transform(df_unlabelled['clean_tweet_token']).toarray()[preds_train != 0]
y_unlabelled = preds_train[preds_train != 0]
y_unlabelled[y_unlabelled == -1] = 0
X_train = np.concatenate([X_count_vec_gen.toarray(), X_unlabelled])
y_train = np.concatenate([y_gen, y_unlabelled])

In [40]:
clf_cv = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models_cv, predictions_cv = clf_cv.fit(X_train, X_count_vec_dis.toarray(), y_train, y_dis)

100%|██████████| 30/30 [00:52<00:00,  1.76s/it]


In [41]:
models_cv

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GaussianNB,0.34,0.65,0.66,0.44,0.13
SVC,0.22,0.59,0.59,0.28,2.15
DecisionTreeClassifier,0.39,0.58,0.59,0.51,0.18
RandomForestClassifier,0.2,0.57,0.57,0.25,0.63
BernoulliNB,0.35,0.56,0.56,0.46,0.09
SGDClassifier,0.32,0.54,0.57,0.43,0.4
CalibratedClassifierCV,0.1,0.52,0.52,0.09,30.38
CheckingClassifier,0.95,0.5,0.5,0.92,0.08
ExtraTreesClassifier,0.23,0.5,0.52,0.31,0.99
NearestCentroid,0.39,0.49,0.56,0.52,0.1
