## Import Libraries and Frameworks

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import spacy
import pickle

from textblob import TextBlob
from spacytextblob.spacytextblob import SpacyTextBlob
import regex as re
from spacy.lang.en.stop_words import STOP_WORDS
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the spaCy pipeline from a local model directory (path is relative to
# the notebook's working directory). `nlp` is used by lemma_and_remove_stopwords.
nlp = spacy.load("../src/en_core_web_md/en_core_web_md-3.2.0")

## Pre-process functions

In [None]:
def move_links_and_punc(df, X_col='text'):
    """
    Extract link/punctuation metadata from a text column and add a cleaned copy.

    Modifies `df` directly and returns None. Missing values in `X_col` are
    replaced with empty strings first. Columns created:

        links               - list of urls (http... / www...) found in the text
        clean_<X_col>       - lower-cased text with urls, punctuation, digits and
                              characters outside printable ASCII removed, and
                              whitespace collapsed to single spaces
        has_questionmark    - True if the raw text contains '?'
        has_exclamationmark - True if the raw text contains '!'
        has_period          - True if the raw text contains '.'
        num_upper           - number of upper-case characters in the raw text
        has_digit           - True if the text contains a digit (checked after
                              urls and punctuation have been removed)
        char_count          - length of the cleaned text
        num_links           - number of urls found
        has_links           - 1 if at least one url was found, else 0

    Args
        df - pandas dataframe
        X_col - name of text column to be cleaned and extracted from
    """
    clean_col = 'clean_' + X_col
    urlregex = r'(http\S+|www\S+)'
    numregex = r'\d+'
    puncregex = r'[^\w\s]'

    # Replace missing text with an empty string. A dict passed to
    # Series.fillna is matched against index labels, so the previous
    # fillna({'data': ''}) left NaNs in place and the `in` checks below
    # failed on float values.
    df[X_col] = df[X_col].fillna('')

    # find links, then strip the urls out of the cleaned copy
    df['links'] = df[X_col].apply(lambda x: re.findall(urlregex, str(x)))
    df[clean_col] = df[X_col].replace(urlregex, '', regex=True).str.lower()

    # punctuation/case flags are taken from the raw text (urls included),
    # then punctuation is removed from the cleaned copy
    df['has_questionmark'] = df[X_col].apply(lambda x: '?' in x)
    df['has_exclamationmark'] = df[X_col].apply(lambda x: '!' in x)
    df['has_period'] = df[X_col].apply(lambda x: '.' in x)
    df['num_upper'] = df[X_col].apply(lambda text: sum(char.isupper() for char in text))
    df[clean_col] = df[clean_col].replace(puncregex, '', regex=True)

    # record whether any digit is present before digits are stripped
    df['has_digit'] = df[clean_col].apply(lambda text: any(char.isdigit() for char in text))
    df[clean_col] = df[clean_col].replace(numregex, '', regex=True)

    # blank out control characters and anything outside printable ASCII
    df[clean_col] = df[clean_col].apply(
        lambda x: "".join([" " if ord(i) < 32 or ord(i) > 126 else i for i in x]))
    # collapse runs of whitespace and trim both ends
    df[clean_col] = df[clean_col].apply(lambda x: " ".join(x.split()))

    df['char_count'] = [len(x) for x in df[clean_col]]
    df['num_links'] = [len(x) for x in df['links']]
    df['has_links'] = [1 if x > 0 else 0 for x in df['num_links']]

In [None]:
# Names of the metadata columns that move_links_and_punc adds to a dataframe
# (everything except 'links' and the cleaned text column).
metadata_cols = [
    'has_questionmark',
    'has_exclamationmark',
    'has_period',
    'num_upper',
    'has_digit',
    'char_count',
    'num_links',
    'has_links',
]

In [None]:
def lemma_and_remove_stopwords(df, X_col):
    """
    Replace each text in a dataframe column with a list of lemmas,
    skipping tokens whose text is in spaCy's STOP_WORDS.

    Modifies `df` directly: after the call every entry of `df[X_col]`
    is a list of strings instead of a single string.

    Args
        df - pandas dataframe
        X_col - name of text column to be lemmatized and filtered
    """
    # nlp.pipe parses the whole column as a stream; one token list per row.
    df[X_col] = [
        [str(token.lemma_) for token in parsed if token.text not in STOP_WORDS]
        for parsed in nlp.pipe(df[X_col])
    ]

In [None]:
# Sentiment polarity and subjectivity functions
def sentiment_polarity(text):
    """
    Compute the TextBlob sentiment polarity for every row of `text`.

    Args
        text - pandas series whose entries are iterables of string tokens

    Returns
        the result of `text.apply` over per-row one-element series
        holding the polarity score
    """
    def _polarity(tokens):
        # tokens are joined back into one space-separated string for TextBlob
        blob = TextBlob(' '.join(tokens))
        return pd.Series(blob.sentiment.polarity)

    return text.apply(_polarity)

def sentiment_subjectivity(text):
    """
    Compute the TextBlob sentiment subjectivity for every row of `text`.

    Args
        text - pandas series whose entries are iterables of string tokens

    Returns
        the result of `text.apply` over per-row one-element series
        holding the subjectivity score
    """
    def _subjectivity(tokens):
        # tokens are joined back into one space-separated string for TextBlob
        blob = TextBlob(' '.join(tokens))
        return pd.Series(blob.sentiment.subjectivity)

    return text.apply(_subjectivity)

In [None]:
# Convert an averaged helpfulness column into a binary label (1 = at or above the cut-off)
def binarizeHelpfulness(df, col, min=0.29, max=0.84):
    """
    Binarize `df[col]` in place: 1 where the value is >= `max`, 0 where it
    is below `max`. NaN values are left untouched. Returns None.

    0.84 and 0.29 are the cut-offs suggested by Twitter themselves.
    For the binarize function, we'll just set notes with helpfulness >= 0.84 as True.

    Args
        df - pandas dataframe
        col - name of the numeric column to binarize
        min - lower cut-off; kept in the signature but not used
        max - values at or above this cut-off become 1, all others 0
    """
    # Build both masks from the original values before writing anything.
    # Computing the second mask after the first write would compare the
    # freshly written 1s against `max` and zero them whenever max > 1.
    values = df[col]
    at_or_above = values >= max
    below = values < max

    df.loc[at_or_above, col] = 1
    df.loc[below, col] = 0

In [None]:
def timestamp(x):
    """
    Convert a 'YYYY-MM-DD HH:MM:SS' value to milliseconds since 1970-01-01.

    Both the parsed datetime and the epoch are naive, so the input is
    effectively treated as UTC.

    Args
        x - value whose str() is a timestamp in '%Y-%m-%d %H:%M:%S' format

    Returns
        float milliseconds since 1970-01-01 00:00:00, or None when the
        value can not be parsed in that format
    """
    try:
        dt = datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S')
    except ValueError:
        # strptime signals a format mismatch with ValueError
        return None
    # Same value as the deprecated datetime.utcfromtimestamp(0).
    epoch = datetime(1970, 1, 1)
    return (dt - epoch).total_seconds() * 1000.0

In [None]:
# Directory holding the input files and the pickled output, relative to
# the notebook's working directory.
data_path = '../data'

In [None]:
# Load the raw notes and ratings exports (tab-separated) from data_path.
notes = pd.read_csv(f'{data_path}/notes-2022-02-21.tsv', sep="\t")
ratings = pd.read_csv(f'{data_path}/ratings-2022-02-21.tsv', sep="\t")

In [None]:
len(notes)

25402

In [None]:
len(ratings)

189744

In [None]:
# Count notes per tweet and ratings per note; each result is a one-column
# frame named 'size', indexed by the grouping id.
notes_ = notes.groupby(['tweetId']).size().to_frame('size')
ratings_ = ratings.groupby(['noteId']).size().to_frame('size')

In [None]:
len(ratings_[ratings_['size'] > 5])

7641

In [None]:
len(notes_[notes_['size'] > 5])

144

In [None]:
len(ratings['noteId'].unique()) - len(ratings[ratings['noteId'].isin(notes['noteId'])]['noteId'].unique())

0

In [None]:
ratings.head()

Unnamed: 0,noteId,participantId,createdAtMillis,version,agree,disagree,helpful,notHelpful,helpfulnessLevel,helpfulOther,...,notHelpfulOpinionSpeculationOrBias,notHelpfulMissingKeyPoints,notHelpfulOutdated,notHelpfulHardToUnderstand,notHelpfulArgumentativeOrBiased,notHelpfulOffTopic,notHelpfulSpamHarassmentOrAbuse,notHelpfulIrrelevantSources,notHelpfulOpinionSpeculation,notHelpfulNoteNotNeeded
0,1352796878438424576,0D7ED07D5421118311EEED5E4ECF2968,1611860252442,1,1,0,1,0,,0,...,0,0,0,0,0,0,0,0,0,0
1,1352796878438424576,0466BA47B23AAAB301AA767C4C780E8D,1611523319844,1,1,0,1,0,,0,...,0,0,0,0,0,0,0,0,0,0
2,1352796878438424576,628C786C63B5A4D32E13C6C442E1863D,1611623203338,1,0,0,0,1,,0,...,0,0,0,0,0,0,0,0,0,0
3,1352796878438424576,EFD7E04E740224D2DDB42A2C910B62C1,1611852744990,1,1,0,1,0,,0,...,0,0,0,0,0,0,0,0,0,0
4,1352796878438424576,7585B8804A32416E91E51837F351F249,1611388222120,1,1,0,1,0,,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Add a `dt` column: createdAtMillis (divided by 1e3 to get seconds) as a
# naive datetime in the machine's local timezone.
notes['dt'] = notes.createdAtMillis.apply(lambda x: datetime.fromtimestamp(x / 1e3))

In [None]:
# Load the tweets file (comma-separated) from data_path.
tweets = pd.read_csv(f'{data_path}/noted-tweets.csv', sep=",")

In [None]:
tweets[tweets.text.isna()]

Unnamed: 0.1,Unnamed: 0,tweetId,username,userId,text,time,link
0,0,1353031711349739523,,,,,
8,8,1354905286264430608,,,,,
32,32,1361491641488572416,,,,,
43,43,1362436244425347076,,,,,
48,48,1358172874973003780,,,,,
...,...,...,...,...,...,...,...
19425,68,1487628841623298051,,,,,
19443,86,1490034424271958020,,,,,
19487,30,1474938979828973568,,,,,
19490,33,1481856765851492353,,,,,


In [None]:
ratings.columns

Index(['noteId', 'participantId', 'createdAtMillis', 'version', 'agree',
       'disagree', 'helpful', 'notHelpful', 'helpfulnessLevel', 'helpfulOther',
       'helpfulInformative', 'helpfulClear', 'helpfulEmpathetic',
       'helpfulGoodSources', 'helpfulUniqueContext', 'helpfulAddressesClaim',
       'helpfulImportantContext', 'helpfulUnbiasedLanguage', 'notHelpfulOther',
       'notHelpfulIncorrect', 'notHelpfulSourcesMissingOrUnreliable',
       'notHelpfulOpinionSpeculationOrBias', 'notHelpfulMissingKeyPoints',
       'notHelpfulOutdated', 'notHelpfulHardToUnderstand',
       'notHelpfulArgumentativeOrBiased', 'notHelpfulOffTopic',
       'notHelpfulSpamHarassmentOrAbuse', 'notHelpfulIrrelevantSources',
       'notHelpfulOpinionSpeculation', 'notHelpfulNoteNotNeeded'],
      dtype='object')

In [None]:
# Numeric rating columns that consolidate_files averages per note.
# 'helpful_quant' is a derived column and must be added to the ratings
# frame before these columns are selected.
rating_quant_cols = [
    'notHelpful',
    'helpfulOther',
    'helpful_quant',
    'helpfulInformative',
    'helpfulClear',
    'helpfulEmpathetic',
    'helpfulGoodSources',
    'helpfulUniqueContext',
    'helpfulAddressesClaim',
    'helpfulImportantContext',
    'notHelpfulOther',
    'notHelpfulIncorrect',
    'notHelpfulSourcesMissingOrUnreliable',
    'notHelpfulOpinionSpeculationOrBias',
    'notHelpfulMissingKeyPoints',
    'notHelpfulOutdated',
    'notHelpfulHardToUnderstand',
    'notHelpfulArgumentativeOrBiased',
    'notHelpfulOffTopic',
    'notHelpfulSpamHarassmentOrAbuse',
    'notHelpfulIrrelevantSources',
    'helpfulUnbiasedLanguage',
    'notHelpfulOpinionSpeculation',
    'notHelpfulNoteNotNeeded',
]

In [None]:
# Lookup tables turning the categorical note/rating labels into 0/1 values.
# Labels that are not listed map to NaN when used with Series.map.

# notes.validationDifficulty
val_map = {'CHALLENGING': 1, 'EASY': 0}

# notes.believable
believable_map = {'BELIEVABLE_BY_MANY': 1, 'BELIEVABLE_BY_FEW': 0}

# notes.classification
classification_map = {
    'MISINFORMED_OR_POTENTIALLY_MISLEADING': 1,
    'NOT_MISLEADING': 0,
}

# notes.harmful
harmful_map = {'CONSIDERABLE_HARM': 1, 'LITTLE_HARM': 0}

# ratings.helpfulnessLevel; 'SOMEWHAT_HELPFUL' is counted as helpful
helpfulnessLevel_map = {
    'HELPFUL': 1,
    'SOMEWHAT_HELPFUL': 1,
    'NOT_HELPFUL': 0,
}

In [None]:
# Numeric note columns that consolidate_files averages per tweet.
note_quant_cols = [
    # derived 0/1 columns; must be added to the notes frame beforehand
    'classification_quant',
    'believable_quant',
    'harmful_quant',
    'val_quant',
    # "misleading" reason flags
    'misleadingOther',
    'misleadingFactualError',
    'misleadingManipulatedMedia',
    'misleadingOutdatedInformation',
    'misleadingMissingImportantContext',
    'misleadingUnverifiedClaimAsFact',
    'misleadingSatire',
    # "not misleading" reason flags
    'notMisleadingOther',
    'notMisleadingFactuallyCorrect',
    'notMisleadingOutdatedButNotWhenWritten',
]

In [None]:
# Re-read the raw exports into notes_ / ratings_ for ad-hoc inspection.
# NOTE: this rebinds the names that earlier held the per-group size frames.
notes_ = pd.read_csv(f'{data_path}/notes-2022-02-21.tsv', sep="\t")
ratings_ = pd.read_csv(f'{data_path}/ratings-2022-02-21.tsv', sep="\t")

In [None]:
ratings_['helpfulnessLevel'].isna().sum()

36181

In [None]:
# Map helpfulnessLevel labels to 0/1 via helpfulnessLevel_map; rows with a
# missing or unlisted level become NaN.
ratings_['helpful_quant'] = ratings_.helpfulnessLevel.map(helpfulnessLevel_map) 

In [None]:
def consolidate_files(
    notes, ratings, tweets, start_date='01/01/2021', end_date='02/16/2022'):
    """
    Take in the data as provided by twitter and output
    a "ready for ML" version.

    Rows of all three frames are restricted to the open interval
    (start_date, end_date) on `createdAtMillis`. Rating columns are then
    averaged per note and note columns averaged per tweet, and the averages
    are joined onto the filtered notes / tweets. The averages are computed
    after filtering, so they depend on the chosen time window.
    `helpful_quant` is left as the mean rating; it is not binarized here.

    The inputs are not modified. The module-level lists `rating_quant_cols`
    and `note_quant_cols` select the columns that are averaged.

    Args
        notes - pandas dataframe of notes; needs noteId, tweetId,
            createdAtMillis and every column in note_quant_cols
        ratings - pandas dataframe of ratings; needs noteId,
            createdAtMillis and every column in rating_quant_cols
        tweets - pandas dataframe of tweets; needs tweetId and createdAtMillis
        start_date - string date of the format month/day/year of the start bound of the timeframe
        end_date - string date of the format month/day/year of the end bound of the timeframe

    Output
        notes_labeled - filtered notes indexed by noteId, joined with the
            per-note rating averages
        tweets_labeled - filtered tweets indexed by tweetId, joined with the
            per-tweet note averages
    """
    # Bounds in epoch milliseconds. strptime yields naive datetimes, so
    # .timestamp() interprets them in the machine's local timezone.
    start_ms = datetime.strptime(start_date, '%m/%d/%Y').timestamp() * 1000
    end_ms = datetime.strptime(end_date, '%m/%d/%Y').timestamp() * 1000

    def _within_window(frame):
        # Coerce rather than astype(int): a missing or unparseable timestamp
        # becomes NaN and is filtered out (NaN comparisons are False)
        # instead of raising, and large millisecond values can not overflow
        # a platform-sized int.
        created = pd.to_numeric(frame.createdAtMillis, errors='coerce')
        return frame[(created > start_ms) & (created < end_ms)].copy()

    # filter by time
    notes = _within_window(notes)
    ratings = _within_window(ratings)
    tweets = _within_window(tweets)

    # note: this is relative to start_date and end_date.
    # Doing this before filtering would give different results!
    average_notelabels = ratings[['noteId'] + rating_quant_cols].groupby('noteId').mean()
    average_tweetlabels = notes[['tweetId'] + note_quant_cols].groupby('tweetId').mean()

    # DataFrame.join defaults to a left join: notes/tweets without any
    # averaged labels are kept, with NaN in the joined columns.
    notes_labeled = notes.set_index('noteId').join(average_notelabels)
    tweets_labeled = tweets.set_index('tweetId').join(average_tweetlabels)

    return notes_labeled, tweets_labeled

In [None]:
# Cut-off dates (month/day/year), each used as the `end_date` of one snapshot.
# Only every second month is enabled; the months in between and all
# mid-month (15th) cut-offs are disabled.
dates = [
    '02/01/2021',
    '04/01/2021',
    '06/01/2021',
    '08/01/2021',
    '10/01/2021',
    '12/01/2021',
    '02/01/2022',
]

# Filled later with one 'notes <date>' and one 'tweets <date>' entry per cut-off.
dictionary = {}

# Input files, all located under data_path.
notes_filename = f'{data_path}/notes-2022-02-21.tsv'
ratings_filename = f'{data_path}/ratings-2022-02-21.tsv'
tweets_filename = f'{data_path}/noted-tweets.csv'

notes = pd.read_csv(notes_filename, sep="\t")
ratings = pd.read_csv(ratings_filename, sep="\t")
tweets = pd.read_csv(tweets_filename, sep=",")

print('checking for na values in note and tweet text')
print(sum(notes.summary.isna()))
print(sum(tweets.text.isna()))

# Drop rows without a note summary / tweet text. Explicit copies are taken
# because columns are added to both frames below; writing to a filtered
# slice of the original frame triggers pandas' SettingWithCopyWarning.
notes = notes[~notes.summary.isna()].copy()
tweets = tweets[~tweets.text.isna()].copy()

# create quantitative columns (binary) for the main 4 TWEET labels
notes['classification_quant'] = notes.classification.map(classification_map)
notes['believable_quant'] = notes.believable.map(believable_map)
notes['harmful_quant'] = notes.harmful.map(harmful_map)
notes['val_quant'] = notes.validationDifficulty.map(val_map)

# and a consolidated helpful_quant column that combines the 'helpful' and
# 'helpfulnessLevel' labels: use the mapped helpfulnessLevel where one is
# present, otherwise fall back to the 'helpful' flag.
ratings['helpful_quant'] = ratings.helpfulnessLevel.map(helpfulnessLevel_map)
no_level = ratings.helpfulnessLevel.isna()
ratings.loc[no_level, 'helpful_quant'] = ratings.loc[no_level, 'helpful']

print('how many of helpful quant are nan: ', ratings['helpful_quant'].isna().sum())

# Tweets carry a textual `time`; its first 19 characters
# ('YYYY-MM-DD HH:MM:SS') are converted to epoch milliseconds so tweets can
# be filtered on createdAtMillis like notes and ratings.
tweets['createdAtMillis'] = tweets['time'].apply(lambda x: timestamp(x[:19]))


def do_nlp(notes, tweets, do_remove_stopwords=True):
    """
    Clean the note summaries and tweet texts and add text features.

    Modifies both dataframes directly and returns None. On top of the
    columns created by move_links_and_punc, this adds:

        clean_summary / clean_text - list of tokens per row
        clean_txt                  - the tokens joined with single spaces
                                     (' ' when there are no tokens)
        Polarity, Subjectivity     - TextBlob sentiment scores of the tokens

    Args
        notes - pandas dataframe with a 'summary' text column
        tweets - pandas dataframe with a 'text' text column
        do_remove_stopwords - if True, lemmatize and drop stopwords with
            spaCy; if False, only split the cleaned text on whitespace
    """
    move_links_and_punc(notes, 'summary')
    move_links_and_punc(tweets, 'text')

    if do_remove_stopwords:
        lemma_and_remove_stopwords(notes, 'clean_summary')
        lemma_and_remove_stopwords(tweets, 'clean_text')
    else:
        # Everything below joins token lists with ' '. Without this split the
        # columns would still hold plain strings and the join would put a
        # space between every character.
        notes['clean_summary'] = notes['clean_summary'].str.split()
        tweets['clean_text'] = tweets['clean_text'].str.split()

    notes['clean_txt'] = notes['clean_summary'].apply(lambda x: ' '.join(x))
    tweets['clean_txt'] = tweets['clean_text'].apply(lambda x: ' '.join(x))

    # Turn empty string into whitespace character to avoid bug later
    notes.loc[notes.clean_txt == '', 'clean_txt'] = ' '
    tweets.loc[tweets.clean_txt == '', 'clean_txt'] = ' '

    # create polarity and subjectivity columns
    notes['Polarity'] = sentiment_polarity(notes['clean_summary'])
    notes['Subjectivity'] = sentiment_subjectivity(notes['clean_summary'])

    tweets['Polarity'] = sentiment_polarity(tweets['clean_text'])
    tweets['Subjectivity'] = sentiment_subjectivity(tweets['clean_text'])

# Clean the text and add the NLP feature columns once, on the full frames.
do_nlp(notes, tweets)


# Build one labelled snapshot per cut-off date and store both frames in
# `dictionary` under the keys 'notes <date>' and 'tweets <date>'.
for date in dates:
    print('processing ' + date)
    labeled_notes, labeled_tweets = consolidate_files(
        notes, ratings, tweets, end_date=date)

    print(len(labeled_notes), len(labeled_tweets))
    # Keep only notes that received an averaged helpfulness value and
    # tweets that received an averaged classification value.
    labeled_notes = labeled_notes[labeled_notes.helpful_quant.notna()]
    labeled_tweets = labeled_tweets[labeled_tweets.classification_quant.notna()]
    print('dropped na')

    dictionary['notes ' + date] = labeled_notes
    dictionary['tweets ' + date] = labeled_tweets

checking for na values in note and tweet text
2
2299
how many of helpful quant are nan:  0
processing 02/01/2021
1245 888
dropped na
processing 04/01/2021
5531 3797
dropped na
processing 06/01/2021
8345 5877
dropped na
processing 08/01/2021
11368 7845
dropped na
processing 10/01/2021
16611 11365
dropped na
processing 12/01/2021
21426 14273
dropped na
processing 02/01/2022
24800 16610
dropped na


### Pickle Dictionary Data for Use in Other Notebooks

In [None]:
# Serialize every snapshot in `dictionary` to a single pickle file under
# data_path. Pickle files should only be loaded back from trusted sources.
with open(f'{data_path}/processed.pickle', 'wb') as handle:
    pickle.dump(dictionary, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Pull the notes snapshot for the 12/01/2021 cut-off for a sanity check.
notes_example = dictionary['notes 12/01/2021']

In [None]:
# Pull the tweets snapshot for the 12/01/2021 cut-off for a sanity check.
tweets_example = dictionary['tweets 12/01/2021']

In [None]:
notes_example['helpful_quant'].isna().sum(), len(notes_example)


(0, 14736)

In [None]:
tweets_example['classification_quant'].isna().sum(), len(tweets_example)

(0, 14141)

In [None]:
tweets_example[tweets_example.classification_quant.isna()]

Unnamed: 0_level_0,Unnamed: 0,username,userId,text,time,link,createdAtMillis,links,clean_text,has_questionmark,...,misleadingOther,misleadingFactualError,misleadingManipulatedMedia,misleadingOutdatedInformation,misleadingMissingImportantContext,misleadingUnverifiedClaimAsFact,misleadingSatire,notMisleadingOther,notMisleadingFactuallyCorrect,notMisleadingOutdatedButNotWhenWritten
tweetId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [None]:
notes_example.helpful_quant.isna().sum()

0

In [None]:
notes_example.helpfulClear.isna().sum()

0

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=c0e62a2c-7f45-414e-8164-5bf51e09d482' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>