In [1]:
import os 
import glob 

import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt 
import seaborn as sns 


## Directories 

In [None]:
data_dir = "../data"

## Read files 

In [None]:
filename = "April_NoMask_Tweets.csv"


df = pd.read_csv(os.path.join(data_dir, "NoMask_Tweets_Version_2", filename))

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.columns

In [None]:
df_eng = df[df["Language"] == "en"]

In [None]:

df_eng.shape

## BERT Features 

In [None]:
import feature_extraction_via_BERTTweet

BERT_feature2D = feature_extraction_via_BERTTweet.get_features(df_eng, 0)

In [None]:
BERT_feature2D.shape

In [None]:
def wrt_single_file(filename, wrt_dir):
    df = pd.read_csv(filename)
    latent_val_df = feature_extraction_via_BERTTweet.get_features(df, 0)

    # write file to disk  
    latent_filename = filename.split("/")[-1]
    latent_val_df.to_csv(os.path.join(wrt_dir, latent_filename), index=False, header=True)

    return 0 

In [None]:
filename = "April_NoMask_Tweets.csv"
wrt_dir = "../data/BERTTweet_Features"

wrt_single_file(os.path.join("../data/nomask_tweets_v2_eng", filename), wrt_dir)

In [None]:
filename = "August_NoMask_Tweets.csv"
wrt_dir = "../data/BERTTweet_Features"

wrt_single_file(os.path.join("../data/nomask_tweets_v2_eng", filename), wrt_dir)

## VADER for Text Polarity Identity

In [None]:
import nltk

nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# sentiment_model = SentimentIntensityAnalyzer()

# a = 'This was a good movie.'
# x = sentiment_model.polarity_scores(a)


import nltk

nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer


def get_sentiment(df):
    sentiment_model = SentimentIntensityAnalyzer()
    df['sentimentDict'] = df['Tweet Text'].apply(lambda tweet: sentiment_model.polarity_scores(tweet))
    
    df['vader_neg']  = df['sentimentDict'].apply(lambda score_dict: score_dict['neg'])
    df['vader_neu']  = df['sentimentDict'].apply(lambda score_dict: score_dict['neu'])
    df['vader_pos']  = df['sentimentDict'].apply(lambda score_dict: score_dict['pos'])
    df['vader_compound']  = df['sentimentDict'].apply(lambda score_dict: score_dict['compound'])


    return df.drop(columns=["sentimentDict"])

df = pd.read_csv("../data/stack_files/samp_raw_df_april.csv")
df2 = get_sentiment(df)
df2.head()

In [None]:
scores = sentiment_model.polarity_scores(df_eng["Tweet Text"].tolist()[0])

## Fold generation for cross validation 

In [None]:
import random 

def get_folds(df):
    L = list(range(1, 5)) 
    
    q = df.shape[0] // len(L) 
    r = df.shape[0] % len(L)
   
    
    folds = L * q + L[:r]
    
    random.seed(2020)
    random.shuffle(folds)
    
    df["folds"] = folds
    
    return df


df.head()
df3 = get_folds(df)
df3.head()

# df.shape[0] % 4

In [None]:
df_eng["Tweet Text"].tolist()[0]

In [None]:
for i in range(150):
    tweet = df_eng["Tweet Text"].tolist()[i]
    score = sentiment_model.polarity_scores(tweet)
    
    print("=" * 100)
    print(tweet)
    print(score)
    print("\n")
    
    
    

In [None]:
L = []

for i in range(150):
    tweet = df_eng["Tweet Text"].tolist()[i]
    print(tweet)
    
    label = input()
    L.append(label)
    print("Label:", label)
    

## TF-IDF 

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
documentA = 'the man went out for a walk'
documentB = 'the children sat around the fire'


vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([documentA, documentB])
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)



In [None]:
def get_TF_IDF_mat(list_of_tweets, list_of_IDs):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(list_of_tweets) # TODO - need to make it generalizable in future
    feature_names = vectorizer.get_feature_names()
    dense = vectors.todense()
    denselist = dense.tolist()
    
    TF_IDF_df = pd.DataFrame(denselist, columns=feature_names)
    TF_IDF_df["ID"] = list_of_IDs # n
    return TF_IDF_df

In [None]:
df.shape

In [None]:
df = pd.read_csv("../data/stack_files/samp_raw_df_april.csv")



In [None]:
df_TF_IDF = get_TF_IDF_mat(df["Tweet Text"], df["ID"])

In [None]:
df_TF_IDF.head()
df_TF_IDF.describe()

## LDA - Topic Modeling 

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import make_multilabel_classification


lda = LatentDirichletAllocation(n_components=4, random_state=2020) # n_jobs=None

lda_model = lda.fit_transform(df_TF_IDF.iloc[:, :df_TF_IDF.shape[1] - 1])

In [None]:
lda_model

In [None]:
def latent_da_v2(X_train, X_test, n_comps):
    lda = LatentDirichletAllocation(n_components=n_comps, random_state=2020) # n_jobs=None 

    return lda.fit_transform(X_train), lda.transform(X_test)

In [None]:
X_train, X_test = df_TF_IDF.iloc[:100, :-1], df_TF_IDF.iloc[100:, :-1]

In [None]:
X_test.shape

In [None]:
X_train2, X_test2 = latent_da_v2(X_train, X_test, 20)

In [None]:
X_train2

In [None]:
X_test2

## Label some data

In [None]:
df = pd.read_csv("../data/stack_files/balanced_pro_n_anti_mask_df_v4.csv")

In [None]:
df.columns

In [None]:
import numpy as np 
import pandas as pd 


def rater():
    df = pd.read_csv("../data/stack_files/balanced_pro_n_anti_mask_df_v4.csv")
    rating_df = pd.read_csv("../data/stack_files/balanced_pro_n_anti_mask_df_v4_w_ratings.csv")
    
    df2 = df.copy()
    df2["CAP6317_rating"] = [-999] * df.shape[0]
    print(rating_df.shape)

    L = []
    leng = 20 # df.shape[0]
    rater_mat = np.zeros((0, 2))
    left, right = rating_df.shape[0], df.shape[0]
    
    right = 5
    for i in range(left, right):
        print("i:", i)
        print("\n")
        print("=" * 80)
        print("Tweet text:", df.iloc[i, 0])


        print("-" * 80)
        print('Is it an ANTI_MASK tweet? y(1) or n(0)')
        x = input()

        if str(x) == "1" or str(x).lower() == "y":
#             rating = np.array([[df["ID"][i], 1]])
            rating = [[df["ID"][i], 1]
            rating_df.append(pd.DataFrame(rating, columns=["ID", "CAP6317_rating"]), ignore_index=True)  
#             rating_df = rating_df.append(pd.DataFrame(rating, columns=["ID", "CAP6317_rating"]), ignore_index=True)
        elif str(x) == "0" or str(x).lower() == "n":
#             rating = np.array([[df["ID"][i], 0]])
            rating = [[df["ID"][i], 1]
            rating_df.append(pd.DataFrame(rating, columns=["ID", "CAP6317_rating"]), ignore_index=True)
#             rating_df = rating_df.append(pd.DataFrame(rating, columns=["ID", "CAP6317_rating"]), ignore_index=True)
        else:
            print("Input is not accepted. Please, provide a valid response.")
            
#         rater_mat = np.append(rater_mat, rating, axis=0)
            
            
        if i % 3 == 0:
#             print(rater_mat)
#             rating_df = rating_df.append(pd.DataFrame(rater_mat, columns=["ID", "CAP6317_rating"]), ignore_index=True)
            print(rating_df)
            
#             rater_df = pd.DataFrame(rater_mat, columns=["ID", "CAP6317_rating"]) 
            rating_df.to_csv("../data/stack_files/balanced_pro_n_anti_mask_df_v4_w_ratings.csv", index=False, header=True)
                
rater()    

In [None]:
import numpy as np 
import pandas as pd 


def rater():
    df = pd.read_csv("../data/stack_files/balanced_pro_n_anti_mask_df_v4.csv")
    rating_df = pd.read_csv("../data/stack_files/balanced_pro_n_anti_mask_df_v4_w_ratings.csv")
#     print(rating_df)
    
    for i in range(rating_df.shape[0]):
        if rating_df.iloc[i, 1] == -999:
            print(rating_df.iloc[i, 1])
            print("i:", i)
            print("\n")
            print("=" * 80)
            print("Tweet text:", df.iloc[i, 0])

            print("-" * 80)
            print('Is it an ANTI_MASK tweet? y(1) or n(0)')
            x = input()

            if str(x) == "1" or str(x).lower() == "y":
                rating_df.iloc[i, -1] = 1
            elif str(x) == "0" or str(x).lower() == "n":
                rating_df.iloc[i, -1] = 0
            else:
                print("Input is not accepted. Please, provide a valid response.")

            if i % 3 == 0:
#                 print(rating_df)
                rating_df.to_csv("../data/stack_files/balanced_pro_n_anti_mask_df_v4_w_ratings.csv", index=False, header=True)
                
rater()   

-999
i: 0


Tweet text: RT @TheRightMelissa: @Acosta Neither will I. #nomask because it’s not about “public health” it’s about control. You already demonstrated th…
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
1
-999
i: 1


Tweet text: so true!!!  #nomasks https://t.co/4QVwCuALqd
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
1
-999
i: 2


Tweet text: Happy Labor Day wknd and Greetings from NBC Studios! quick video about precautions this holiday wknd and #COVID19 #vaccine candidates. 

#wearamask #PhysicalDistancing https://t.co/RPCRz76Jrt
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
0
-999
i: 3


Tweet text: RT @Twitter: The only thing we want going viral is this Tweet

#WearAMask
--------------------------------------------------------------------------------
Is it

1
-999
i: 22


Tweet text: RT @JUSTICETIME7: @LeahR77 Tina and Ben nail it. #NoMasks @GrrrGraphics are propaganda and MindWarfare® https://t.co/CF1IJOGhQd
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
1
-999
i: 23


Tweet text: Alternative Government https://t.co/76ipSX9WH4 #KBF #NoMasks #Covid19UK
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
1
-999
i: 24


Tweet text: RT @DrTedros: Thank you, #BTS for the uplifting #BTS_Dynamite and for reminding the #BTSARMY and the rest of us to #WearAMask and take care…
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
0
-999
i: 25


Tweet text: @portarican_RT I’m with you in spirit. So thankful for my fellow healthcare workers and all they are sacrificing to get people through a horrible illness.  
Stay safe. #WearADamnMask
--------

0
-999
i: 44


Tweet text: RT @TheRightMelissa: @Acosta Neither will I. #nomask because it’s not about “public health” it’s about control. You already demonstrated th…
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
1
-999
i: 45


Tweet text: Like for another cool surprise :)
#WearAMask
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
0
-999
i: 46


Tweet text: RT @EmilyAinCA: I feel like I would not be able to preventatively check myself into a hospital. #COVID19 #GOPSuperSpreaders #WearAMask
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
0
-999
i: 47


Tweet text: #SantaMonica will begin targeted enforcement for face coverings in business districts &amp; parks this week. 

Save $100 &amp; save your health by masking up 😷✌️🌴#WearAMask https://t.co/exVGKSjL4L
--------------

0
-999
i: 65


Tweet text: 6 #COVID19 facts the #MSMIsTheEnemyOfThePeople won't report.

#ChineseVirus 
#China 
#NoMasks 

https://t.co/V8PD84rFBI via @YouTube
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
1
-999
i: 66


Tweet text: RT @TheRightMelissa: @Acosta Neither will I. #nomask because it’s not about “public health” it’s about control. You already demonstrated th…
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
1
-999
i: 67


Tweet text: RT @BionicBirdAnna: My takeaway from this are these two screenshots. FFS please,  #StayHomeSaveLives #WearAMask https://t.co/sPbr2C9Usa htt…
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
0
-999
i: 68


Tweet text: Where is it written in our Constitution 
that you have the right to demand I
wear a covering over my face?  This
is n

0
-999
i: 87


Tweet text: @dominiquetaegon If there is a #police enforced instant #fine of £100 for not wearing a mask then being stopped and searched and then found in possession #knives or #drugs then must be at least a 5year sentence . #nomask #liberty
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
1
-999
i: 88


Tweet text: can we just stop this whole #NoMasks debate and just wear a mask. if you have a serious reason why (wether that’s a seen or hidden condition that the mask will negatively impact) then fair enough. if not just 🤫🤫 and wear a mask it’s not hard
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
0
-999
i: 89


Tweet text: Simple as that!!  #NoMasks #wwg1wgaWorldWide https://t.co/fNOtGCga4r
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
1
-999
i: 90


Twe

0
-999
i: 108


Tweet text: @NotHoodlum @ReesusP 🎶
You're so vain 
You probably think a facemask's about you 
Your so vain 
I'll bet you think protection's about 
Don't you?
Don't you?

#WearAMask
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
0
-999
i: 109


Tweet text: To those massive snowflakes who cancelled their Tory membership because of a mask but not because of Austerity, Xenophobia, Teacher, Nurse, Police or Health care worker bashing, fraud, contracts for their mates, wasting public money. You really are a spineless cunt.
#NoMasks
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
1
-999
i: 110


Tweet text: @SaraCarterDC @tedcruz @AmericanAir https://t.co/dPOUNcBZv0 the truth always comes out!! #nomask
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
1
-999
i: 111


0
-999
i: 129


Tweet text: @Fineneighborho2 @prettystronger @ariannahuff @KapeciaResists #WearAMaskSaveALife campaign is a public health measure,  not a topic for political debate https://t.co/0tF7TUbxuT
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
0
-999
i: 130


Tweet text: @JoeBiden says he’d be willing to shut country down to stop coronavirus if scientists recommended it. NO THANKS, JOE. #NoMaskMandate
#Trump2020LandslideVictory
https://t.co/j1Vo3piiro
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
1
-999
i: 131


Tweet text: Hundreds gather in #HydePark to protest against wearing masks

#WearAMaskSaveALife

 https://t.co/Lgz9Mi9zz7
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
0
-999
i: 132


Tweet text: 😷🙏🏻😷 🙏🏻😷 This is not a political stance it is a HUMAN stan

0
-999
i: 150


Tweet text: RT @BriansNewHeart: #Resisters

It is important to remember your safety comes first. When you #WearAMask, you are not only protecting yours…
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
0
-999
i: 151


Tweet text: Costco, Costco....
You dumb fucking cunts!!!!!!!!!!!!!!

#DoNotComply
#NoMask https://t.co/EwLldnz1RV
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
1
-999
i: 152


Tweet text: @GovMikeDeWine We’ve dropped below pandemic numbers!!!

Stop this foolishness!!

#NoMasks 
#Biden2020 🤪 https://t.co/rOoQm5MOYW
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
1
-999
i: 153


Tweet text: #nomask Charlie Baker orders people to wear masks in public, no matter how far they are from other people - The Boston Globe https://t.co/ojH4MDNFnc
--------

1
-999
i: 171


Tweet text: RT @sarasteinmd: Must read #COVID19 thread. Please Please #WearADamnMask https://t.co/9qbzwUa47T
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
0
-999
i: 172


Tweet text: RT @DrMartaPPrado: Protect others. Be safe. Do your part.
We are stronger together
#WearAMask https://t.co/M1rQ9ZtLTn
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
0
-999
i: 173


Tweet text: @animegirl12182 @WorstNi79499146 @SavannahLMaddox @KY_HenryClay No proof!! 99.98% survival rate...HCQ COCKTAIL CURE...so deadly you need a test to tell you u have it!! Plus, a positive could mean you have antibodies from THE COMMON COLD!  Get outta here, Karen! 🤨 #NoMasks
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
1
-999
i: 174


Tweet text: RT @JoyAnnReid: When Republicans won’t b

1
-999
i: 193


Tweet text: RT @and_kell: Why I'm so selfish...#nomask #novaccines 

#Vaccinated my children. Son suffered irrepable harm. Now an adult who can't speak…
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
1
-999
i: 194


Tweet text: RT @BirthplaceBBall: Our leader Coach Brock, along with Americas top coaches, sends a message about the fight against COVID. #wearamask 😷
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
0
-999
i: 195


Tweet text: RT @DrTedros: Thank you, @BTS_twt J-Hope for reminding #BTSARMY to #WearAMask as families gather together and celebrate #ChuSeok holiday th…
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
0
-999
i: 196


Tweet text: RT @EmmaRoseButler: Why is everyone losing their shit about having to cover their faces to protect vulnerabl

0
-999
i: 215


Tweet text: Love @SenatorBobHall 

"...without masks and without restrictions." 💯🙌🚫😷🚫 @GovAbbott #nomasks #nomandates #freetexas https://t.co/uAJHmpDba2
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
1
-999
i: 216


Tweet text: These two Americans remain at the top of their respective fields while getting chemotherapy for pancreatic cancer, both at an age when they are more vulnerable to COVID-19

Patients with a deadly disease really do put life in perspective for the relatively healthy

#WearAMask https://t.co/f63PiFbKBU
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
0
-999
i: 217


Tweet text: RT @DrEricDing: OMG! Who did this?!?! Best 2 minutes of your day, week, month, maybe year. 

#WearAMask #SleepingBeauty #COVID19 #Masks #Ma…
--------------------------------------------------------------------------------
Is it an ANT

1
-999
i: 236


Tweet text: TGIF everyone!!  Enjoy the day and weekend.  Be kind, be happy and take care of yourself .

#wendoverartgroup
#dallasmarketcenter
#tgifriday
#wearamask https://t.co/MkeJHqIxMs
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
0
-999
i: 237


Tweet text: @cjensen_MT Brainwashing to die for.

#WWG1GWA  #WearAMask 
#SoldiersForDollars
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
0
-999
i: 238


Tweet text: [They] want you living in fear.

[They] will not stop until [THEY] understand FEAR. 

Stand your ground. 

#NoMask
#NoVax
#StandYourGround
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
1
-999
i: 239


Tweet text: What no decorated face shield, Dr. Birx?  Interesting the interviewer isn't wearing a mask. Such💩 #NoMasks #NoVaccine. #FireBirx👇🤬 ht

0
-999
i: 258


Tweet text: SHHHH: no one tell @realDonaldTrump that the more supporters he endangers in his #NoMasks, no #SocialDistance rallies, the higher the chance many won’t “be here” to #vote for him in the #Election2020 🙏🏼🙏🏼🙏🏼#BidenHarrisToSaveAmerica #AmericasGreatestMistake #Biden2020
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
0
-999
i: 259


Tweet text: Malaysia morning news for June 2

via @aecnewstoday 

#Independent #Asean #journalism based in #Cambodia 

#StayHome #isolate #prepare #WearAMask

https://t.co/FHASMSKvaU https://t.co/s27lz469c4
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
0
-999
i: 260


Tweet text: RT @LikeFineWine63: Now I am about to throw the F up 🤮🤮🤮🤮😡😡😡😡
#NoMasks https://t.co/QopGdB8ifi
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0

0
-999
i: 279


Tweet text: Please retweet. Really good Mom. #WearAMask https://t.co/ORUPVgY8nY
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
0
-999
i: 280


Tweet text: The time to openly resist has come, any cop waving a fine in my face will get a fist full of five in the breathing gear!

#IWillNotComply #WeMustNotComply #NoLockdown #NoMasks #Resist  #UnitedWeResist https://t.co/hdZkNvoIEq
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
1
-999
i: 281


Tweet text: @JReinerMD The @stanford study that shows 30,000 COVID and 700 deaths attributed to his rallies.  #VoteLikeYourLifeDependsOnIt #StopVoterSuppression #WearAMask #USACoronaExceedsIndia
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
0
-999
i: 282


Tweet text: @ScottishFotoExp Stop the forehead scans, nonsense m

1
-999
i: 300


Tweet text: The Left's push for mandatory face masks is full throttle, even in my state of Indiana.

Governor Holcomb so far is not indicating he will ever issue an executive order to mandate. Doesn't matter, I will NEVER submit!

It's a violation of the 10th Amendment anyway!

#NoMasks
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
1
-999
i: 301


Tweet text: RT @DrRobDavidson: I #WearAMask for my entire shift in the ER. There are ZERO harmful effects of wearing a mask, and you could save someone…
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
0
-999
i: 302


Tweet text: @DVATW Absolutely. Politicians will milk this one for all its worth.

#NoMasks
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
1
-999
i: 303


Tweet text: A new study demonstrates that th

0
-999
i: 321


Tweet text: RT @johnlegend: Everyone, please #WearAMask and social distance this weekend and for  the foreseeable future until we beat this disease!  R…
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
0
-999
i: 322


Tweet text: This is brilliant!! #NoMasks #KeepBritainFree https://t.co/gqmZOQKVcp
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
1
-999
i: 323


Tweet text: Trump was busy coming up with nicknames for his rival tweets during that briefing. #TRE45ON #TraitorTrump #RepublicansForBiden #RepublicansAgainstTrump #WearAMask https://t.co/3DYmhTGJDW
--------------------------------------------------------------------------------
Is it an ANTI_MASK tweet? y(1) or n(0)
0
-999
i: 324


Tweet text: RT @Renee53211: Time to #EndTheShutdown and fully open our country. And #nomasks as they don’t work and they are another sign of o

In [None]:
df = pd.read_csv("../data/stack_files/balanced_pro_n_anti_mask_df_v4.csv")

df2 = df[["ID"]]
df2["CAP6317_rating"] = [-999] * df2.shape[0]

df2.shape

df2.to_csv("../data/stack_files/balanced_pro_n_anti_mask_df_v4_w_ratings.csv", index=False, header=True)

In [None]:
df2.dtypes

In [3]:
from nltk import agreement
rater1 = [1,1,1]
rater2 = [1,1,0]
rater3 = [0,1,1]

taskdata=[[0,str(i),str(rater1[i])] for i in range(0,len(rater1))]+[[1,str(i),str(rater2[i])] for i in range(0,len(rater2))]+[[2,str(i),str(rater3[i])] for i in range(0,len(rater3))]
ratingtask = agreement.AnnotationTask(data=taskdata)
print("kappa " +str(ratingtask.kappa()))
print("fleiss " + str(ratingtask.multi_kappa()))
print("alpha " +str(ratingtask.alpha()))
print("scotts " + str(ratingtask.pi()))

kappa -0.1666666666666667
fleiss -0.2000000000000003
alpha -0.1428571428571428
scotts -0.28571428571428603


In [5]:
taskdata

[[0, '0', '1'],
 [0, '1', '1'],
 [0, '2', '1'],
 [1, '0', '1'],
 [1, '1', '1'],
 [1, '2', '0'],
 [2, '0', '0'],
 [2, '1', '1'],
 [2, '2', '1']]

In [6]:
ratingtask = agreement.AnnotationTask(data=taskdata)

In [7]:
ratingtask

<nltk.metrics.agreement.AnnotationTask at 0x7f8f603d8f90>

In [8]:
from statsmodels.stats.inter_rater import fleiss_kappa

In [12]:
fleiss_kappa(np.array(taskdata).astype(int), method='fleiss')

AssertionError: 

In [13]:
from sklearn.metrics import cohen_kappa_score

coder1 = [1,0,2,0,1,1,2,0,1,1]
coder2 = [1,1,0,0,1,1,2,1,1,0]
score = cohen_kappa_score(coder1,coder2)

print('Cohen\'s Kappa:',score)

Cohen's Kappa: 0.3220338983050848


In [3]:
r_df = pd.read_csv(os.path.join("../data/stack_files/balanced_pro_n_anti_mask_df_v4_w_ratings.csv"))

In [16]:
r_df2 = r_df.iloc[:320, :]

In [17]:
r_df2.columns

Index(['ID', 'CAP6317_rating'], dtype='object')

In [2]:
df = pd.read_csv("../data/stack_files/balanced_pro_n_anti_mask_df_v4.csv")

In [19]:
df.shape

(3201, 24)

In [23]:
m_df = pd.merge(df[df["ID", "ground_truth"]], r_df, on=["ID"], sort=True)

KeyError: ('ID', 'ground_truth')

In [38]:
x = df[["ID", "ground_truth"]].iloc[:320, :].copy()

In [40]:
x.head()

Unnamed: 0,ID,ground_truth
0,0_June_0000412,0
1,0_October_0000734,0
2,1_September_0000681,1
3,1_September_0004729,1
4,0_September_0001105,0


In [27]:
r_df2.shape

(320, 2)

In [30]:
x.dtypes

ID              object
ground_truth     int64
dtype: object

In [31]:
r_df2.dtypes

ID                object
CAP6317_rating     int64
dtype: object

In [41]:
x["ground_truth"] = x[x["ground_truth"] == 0] = 100

In [42]:
x.head()

Unnamed: 0,ID,ground_truth
0,0_June_0000412,100
1,0_October_0000734,100
2,1_September_0000681,100
3,1_September_0004729,100
4,0_September_0001105,100


In [49]:
df_2 = df[["ID", "ground_truth"]]

df_2 = df_2.iloc[:320, :].copy()

In [50]:
df_2.head()

Unnamed: 0,ID,ground_truth
0,0_June_0000412,0
1,0_October_0000734,0
2,1_September_0000681,1
3,1_September_0004729,1
4,0_September_0001105,0


In [55]:
df_2["ground_truth"] = df_2["ground_truth"].replace([0], 100)

In [56]:
df_2.head()

Unnamed: 0,ID,ground_truth
0,0_June_0000412,5
1,0_October_0000734,5
2,1_September_0000681,5
3,1_September_0004729,5
4,0_September_0001105,5


In [4]:
r_df.shape

(3201, 2)

In [5]:
df.shape

(3201, 24)

In [18]:
m_df = pd.merge(r_df, df[["ID", "ground_truth"]].copy(), on=["ID"])

m_df["ground_truth"] = m_df["ground_truth"].astype(bool)

In [20]:
m_df["ground_truth"] = m_df["ground_truth"].replace({0: 1, 1: 0})

In [31]:
m_df.head()


m_df = m_df.iloc[:320, :].copy()

In [32]:
from sklearn.metrics import cohen_kappa_score

# coder1 = [1,0,2,0,1,1,2,0,1,1]
# coder2 = [1,1,0,0,1,1,2,1,1,0]
score = cohen_kappa_score(m_df["ground_truth"], m_df["CAP6317_rating"])

print('Cohen\'s Kappa:',score)

Cohen's Kappa: 0.87998420844848


In [25]:
m_df.head(20)

Unnamed: 0,ID,CAP6317_rating,ground_truth
0,0_June_0000412,1,1
1,0_October_0000734,1,1
2,1_September_0000681,0,0
3,1_September_0004729,0,0
4,0_September_0001105,1,1
5,1_July_0000853,0,0
6,1_June_0002499,0,0
7,1_July_0004618,0,0
8,1_July_0002832,0,0
9,0_September_0000180,1,1


In [26]:
from sklearn.metrics import confusion_matrix

In [33]:
confusion_matrix(m_df["ground_truth"], m_df["CAP6317_rating"])

array([[169,   1],
       [ 18, 132]])

In [28]:
m_df["ground_truth"].unique().tolist()

[1, 0]

In [29]:
m_df["CAP6317_rating"].unique().tolist()

[1, 0, -999]

In [30]:
m_df.shape

(3201, 3)

In [34]:
jx_df = pd.read_csv("../data/stack_files/balanced_pro_n_anti_mask_df_v4_labeled_jx.csv")

In [35]:
jx_df.shape

(3201, 25)

In [64]:
m_df2 = pd.merge(m_df, jx_df[["ID", "Sarcasm"]].copy(), on=["ID"])

In [38]:
jx_df.columns

Index(['Tweet Text', 'Sarcasm', 'Tweet Datetime', 'Tweet Id', 'User Id',
       'User Name', 'User Location', 'Tweet Coordinates', 'Place Info',
       'Country', 'Hashtags', 'Retweets', 'Favorites', 'Language', 'Source',
       'Replied Tweet Id', 'Replied Tweet User Id', 'month', 'ground_truth',
       'ID', 'folds', 'vader_neg', 'vader_neu', 'vader_pos', 'vader_compound'],
      dtype='object')

In [41]:
m_df2.head()

Unnamed: 0,ID,CAP6317_rating,ground_truth,Sarcasm
0,0_June_0000412,1,1,0.0
1,0_October_0000734,1,1,0.0
2,1_September_0000681,0,0,0.0
3,1_September_0004729,0,0,1.0
4,0_September_0001105,1,1,0.0


In [65]:
# m_df2["Sarcasm"] = m_df2["Sarcasm"].replace({0: 1, 1:0})
# m_df2["Sarcasm"] = m_df2["Sarcasm"].astype(int)

m_df2 = m_df2.iloc[:318, :].copy()
m_df2["Sarcasm"] = m_df2["Sarcasm"].astype(int)

m_df2.head(20)

Unnamed: 0,ID,CAP6317_rating,ground_truth,Sarcasm
0,0_June_0000412,1,1,0
1,0_October_0000734,1,1,0
2,1_September_0000681,0,0,0
3,1_September_0004729,0,0,1
4,0_September_0001105,1,1,0
5,1_July_0000853,0,0,0
6,1_June_0002499,0,0,1
7,1_July_0004618,0,0,0
8,1_July_0002832,0,0,0
9,0_September_0000180,1,1,0


In [48]:
m_df2.head()

Unnamed: 0,ID,CAP6317_rating,ground_truth,Sarcasm
0,0_June_0000412,1,1,0.0
1,0_October_0000734,1,1,0.0
2,1_September_0000681,0,0,0.0
3,1_September_0004729,0,0,1.0
4,0_September_0001105,1,1,0.0


In [66]:
print(confusion_matrix(m_df2["ground_truth"], m_df2["Sarcasm"].astype(bool)))


print(cohen_kappa_score(m_df2["CAP6317_rating"], m_df2["Sarcasm"].astype(int)))

[[152  17]
 [128  21]]
0.0446351931330472


In [53]:
m_df2["Sarcasm"].unique().tolist()

[0.0, 1.0, nan]