## Import Libraries and Frameworks

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import spacy
import pickle

from textblob import TextBlob
from spacytextblob.spacytextblob import SpacyTextBlob
import regex as re
from spacy.lang.en.stop_words import STOP_WORDS
from datetime import datetime

# Load spaCy's "en_core_web_sm" pipeline once at module level and add the
# 'spacytextblob' component to it; `nlp` is used by remove_stopwords below.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe('spacytextblob')

<spacytextblob.spacytextblob.SpacyTextBlob at 0x7fc21e4f4730>

## Pre-process functions

In [None]:
def get_mode(x):
    """
    Return the single most frequent value of x, or "UNCLEAR" when there is no unique mode

    Args
        x - values to summarise; handed straight to pd.Series.mode

    Output
        the mode when exactly one is reported, otherwise the string "UNCLEAR"
        (ties, and inputs for which no mode is reported at all)
    """
    candidates = pd.Series.mode(x)
    has_unique_mode = len(candidates) == 1
    return candidates[0] if has_unique_mode else "UNCLEAR"

In [None]:
def move_links_and_punc(df, X_col):
    """
    Create new columns 'links', 'word_count', 'num_links', 'has_links', and 'clean_' + X_col, a clean
    version of the text column that has urls, punctuation, and numbers removed.

    The dataframe is modified in place and nothing is returned. Missing values in X_col are
    replaced by empty strings.

    Args
        df - pandas dataframe
        X_col - name of text column to be cleaned and extracted from
    """

    urlregex = r'(http\S+|www\S+)'
    numregex = r'\d+'
    puncregex = r'[^\w\s]'
    clean_col = 'clean_' + X_col

    # Extract the links before any cleaning; str() keeps missing values from raising.
    df['links'] = df[X_col].apply(lambda x: re.findall(urlregex, str(x)))

    # Missing text becomes an empty string so that every string operation below is safe.
    # (A dict argument to Series.fillna is matched against index labels, so it must be a scalar.)
    df[X_col] = df[X_col].fillna('')

    cleaned = df[X_col].replace(urlregex, '', regex=True).str.lower()
    cleaned = cleaned.replace(puncregex, '', regex=True)
    cleaned = cleaned.replace(numregex, '', regex=True)
    # Anything outside printable ASCII (code points 32..126) is turned into a space ...
    cleaned = cleaned.apply(lambda text: "".join(ch if 32 <= ord(ch) <= 126 else " " for ch in text))
    # ... and runs of whitespace are collapsed to single spaces, trimming both ends.
    cleaned = cleaned.apply(lambda text: " ".join(text.split()))
    df[clean_col] = cleaned

    # Count whitespace-separated words, not characters.
    df['word_count'] = [len(text.split()) for text in df[clean_col]]
    df['num_links'] = [len(found) for found in df['links']]
    df['has_links'] = [1 if count > 0 else 0 for count in df['num_links']]

In [None]:
def remove_stopwords(df, X_col):
    """
    Replace a text column, in place, with per-row lists of lemmas whose tokens are not stopwords

    Each row of df[X_col] is run through the module-level spaCy pipeline `nlp`; tokens whose
    text appears in STOP_WORDS are discarded and the lemma of every remaining token is kept.

    Args
        df - pandas dataframe
        X_col - name of text column to be cleaned and extracted from
    """

    df[X_col] = [
        [str(token.lemma_) for token in parsed if token.text not in STOP_WORDS]
        for parsed in nlp.pipe(df[X_col])
    ]

In [None]:
# Sentiment polarity and subjectivity functions
def sentiment_polarity(text):
    """
    Compute the TextBlob sentiment polarity of every row of a token-list column

    Args
        text - pandas series whose values are iterables of strings (joined with spaces before scoring)

    Output
        the result of text.apply; each row's score is wrapped in a one-element pd.Series,
        which pandas is expected to expand into a single-column frame
    """

    def _polarity_of(tokens):
        blob = TextBlob(' '.join(tokens))
        return pd.Series(blob.sentiment.polarity)

    return text.apply(_polarity_of)

def sentiment_subjectivity(text):
    """
    Compute the TextBlob sentiment subjectivity of every row of a token-list column

    Args
        text - pandas series whose values are iterables of strings (joined with spaces before scoring)

    Output
        the result of text.apply; each row's score is wrapped in a one-element pd.Series,
        which pandas is expected to expand into a single-column frame
    """

    def _subjectivity_of(tokens):
        blob = TextBlob(' '.join(tokens))
        return pd.Series(blob.sentiment.subjectivity)

    return text.apply(_subjectivity_of)

In [None]:
# Wrap the tweets dataframe in a dict and pickle it to 'temp.pickle'.
# NOTE(review): `tweets` is only assigned in a later cell of this notebook, so this
# cell presumably relies on that cell having been run first — confirm execution order.
d = {'tweets':tweets}
with open('temp.pickle', 'wb') as handle:
    pickle.dump(d, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Convert the 'helpful' column into three classes (0, 1, 2) determined by each note's average helpfulness rating
def fixHelpfulness(df, col, min=0.29, max=0.84):
    """
    Convert the scores in df[col] into three classes, in place:
        2 - score >= max
        1 - min < score < max
        0 - score <= min
    Missing scores match none of the conditions and are left untouched.
    0.84 and 0.29 are the cut-offs suggested by Twitter themselves

    Args
        df - pandas dataframe, modified in place
        col - name of the column holding the scores
        min - lower cut-off; scores at or below it become 0
        max - upper cut-off; scores at or above it become 2
    """

    scores = df[col]

    # Build every mask from the ORIGINAL scores before writing anything back, so that a freshly
    # written label (0, 1 or 2) can never be re-classified by a later comparison.
    low = scores <= min
    # Explicit comparisons instead of between(..., inclusive=False): the boolean form of
    # `inclusive` is deprecated and rejected by newer pandas versions.
    middle = (scores > min) & (scores < max)
    high = scores >= max

    df.loc[low, col] = 0
    df.loc[middle, col] = 1
    # Written last so that ">= max" takes precedence if the cut-offs coincide.
    df.loc[high, col] = 2

In [None]:
def drop_rows_with_empty_features(X, y):
    """
    Drop every sample whose feature collection is empty, together with its label

    Args
        X - iterable of sized feature collections (anything len() works on)
        y - iterable of labels aligned with X

    Output
        X, y - two lists holding only the samples whose features were non-empty
    """
    is_empty = [len(element) == 0 for element in X]
    # Report how many rows are actually removed (True counts as 1), not the total row count.
    print("dropping ", sum(is_empty), " rows")
    X = [element for element, drop in zip(X, is_empty) if not drop]
    y = [element for element, drop in zip(y, is_empty) if not drop]
    return X, y

In [None]:
def get_mode(x):
    """
    Pick the unique mode of x, falling back to "UNCLEAR"

    Args
        x - values to summarise; handed straight to pd.Series.mode

    Output
        the mode if pd.Series.mode reports exactly one value, else the string "UNCLEAR"
    """
    modes = pd.Series.mode(x)
    # Anything other than exactly one mode (a tie, or none at all) is ambiguous.
    if len(modes) != 1:
        return "UNCLEAR"
    return modes[0]

In [None]:
def timestamp(x):
    """
    Convert a 'YYYY-MM-DD HH:MM:SS' value into milliseconds since 1970-01-01 00:00:00

    The value is parsed as a naive datetime and measured against the naive epoch, so the
    result does not depend on the machine's timezone.

    Args
        x - value to convert; it is passed through str() before parsing

    Output
        float number of milliseconds, or None when x does not match the expected format
    """
    try:
        dt = datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S')
    except ValueError:
        # strptime signals a format mismatch with ValueError; callers get None for such values.
        return None
    # Same value datetime.utcfromtimestamp(0) produced, without the deprecated call.
    epoch = datetime(1970, 1, 1)
    return (dt - epoch).total_seconds() * 1000.0

In [None]:
# Load the notes table from the tab-separated file 'notes.tsv' for interactive inspection.
notes = pd.read_csv('notes.tsv', sep="\t")
#ratings = pd.read_csv(ratings_filename, sep="\t")

In [None]:
# Display the 'summary' column of the notes table.
notes.summary

0       This is a image of a Tweet that never occured,...
1       BLM as an organization is not a terrorist orga...
2       This post claims BLM as an organization does n...
3       This claim is false. Carter, Ford and Nixon al...
4       Neither BLM nor antifa were involved in the at...
                              ...                        
8708    This image has been doctored. Ashraf Mohi, the...
8709    This is an old photo of a child who lives in R...
8710    This is an old photo of a child who lives in R...
8711    This is an old photo of a child who lives in R...
8712    The photo is from a Najwan Simri Diab's &quot;...
Name: summary, Length: 8713, dtype: object

In [None]:
# Load the ratings table from the tab-separated file 'ratings.tsv' for interactive inspection.
ratings = pd.read_csv('ratings.tsv', sep="\t")


In [None]:
# Display the ratings dataframe.
ratings

Unnamed: 0,noteId,participantId,createdAtMillis,agree,disagree,helpful,notHelpful,helpfulOther,helpfulInformative,helpfulClear,...,notHelpfulOther,notHelpfulIncorrect,notHelpfulSourcesMissingOrUnreliable,notHelpfulOpinionSpeculationOrBias,notHelpfulMissingKeyPoints,notHelpfulOutdated,notHelpfulHardToUnderstand,notHelpfulArgumentativeOrInflammatory,notHelpfulOffTopic,notHelpfulSpamHarassmentOrAbuse
0,1352796878438424576,7644DF3FD853416F0C96933CCC1BA9B7,1611796572477,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,1
1,1352796878438424576,7585B8804A32416E91E51837F351F249,1611388222120,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1352796878438424576,628C786C63B5A4D32E13C6C442E1863D,1611623203338,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1352796878438424576,0D7ED07D5421118311EEED5E4ECF2968,1611860252442,1,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,1352796878438424576,EFD7E04E740224D2DDB42A2C910B62C1,1611852744990,1,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24540,1398776688566280193,D6AF93837A1935E2421A23BD9185ED35,1622333282757,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
24541,1398786944755191818,AECA621BD49DBD17B585EFF22D682450,1622332838727,0,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
24542,1398787894425313284,AECA621BD49DBD17B585EFF22D682450,1622332816300,0,1,0,1,0,0,0,...,0,0,0,1,1,0,0,0,0,0
24543,1398789399811342338,AECA621BD49DBD17B585EFF22D682450,1622332797727,0,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [None]:
# Load the tweets table from the comma-separated file 'noted-tweets.csv' for interactive inspection.
tweets = pd.read_csv('noted-tweets.csv', sep=",")

In [None]:
# Show how often each tweet text length (in characters) occurs.
tweets.text.str.len().value_counts()

177.0    694
23.0     241
280.0    141
279.0    118
304.0    109
        ... 
435.0      1
354.0      1
325.0      1
327.0      1
834.0      1
Name: text, Length: 350, dtype: int64

In [None]:
# Show the tweet rows whose 'text' value is missing.
tweets[tweets.text.isna()]

Unnamed: 0.1,Unnamed: 0,tweetId,username,userId,text,time,link
0,0,1353031711349739523,,,,,
8,8,1354905286264430608,,,,,
32,32,1361491641488572416,,,,,
43,43,1362436244425347076,,,,,
48,48,1358172874973003780,,,,,
...,...,...,...,...,...,...,...
12589,86,1436454765811179537,,,,,
12601,98,1437549713667067905,,,,,
12658,55,1440058488785231880,,,,,
12688,85,1440002167499378689,,,,,


In [None]:
def consolidate_files(notes_filename, ratings_filename, tweets_filename, start_date='01/01/2021', end_date='01/01/2022'):
    """
    Take in the data as provided by twitter and output
    a "ready for ML" version.

    Only notes, ratings and tweets whose createdAtMillis lies strictly between start_date and
    end_date are kept. Both bounds are interpreted as midnight UTC, matching the way tweet
    times are converted by timestamp().

    Args
        notes_filename - string filename of the notes file (tab-separated)
        ratings_filename - string filename of the ratings file (tab-separated)
        tweets_filename - string filename of the tweets file (comma-separated)
        start_date - string date of the format month/day/year of the start bound of the timeframe
        end_date - string date of the format month/day/year of the end bound of the timeframe

    Output
        notesWithRatings - pandas dataframe indexed by noteId joining each note with the mean of
                           the numeric columns of its ratings
        tweetsWithNotes - pandas dataframe indexed by tweetId joining each tweet with the modal
                          classification/believable/harmful/validationDifficulty of its notes
    """

    # read files
    notes = pd.read_csv(notes_filename, sep="\t")
    ratings = pd.read_csv(ratings_filename, sep="\t")
    tweets = pd.read_csv(tweets_filename, sep=",")

    # adjust timeframes: epoch milliseconds measured against the naive epoch, so the bounds
    # do not shift with the timezone of the machine running the notebook
    epoch = datetime(1970, 1, 1)
    start_millis = (datetime.strptime(start_date, '%m/%d/%Y') - epoch).total_seconds() * 1000
    end_millis = (datetime.strptime(end_date, '%m/%d/%Y') - epoch).total_seconds() * 1000

    def within_timeframe(frame):
        # Keep the rows created strictly inside (start_millis, end_millis).
        millis = frame.createdAtMillis.astype(int)
        return frame[(millis > start_millis) & (millis < end_millis)]

    # tweets carry a textual time; only its first 19 characters are parsed by timestamp()
    tweets.dropna(inplace=True, how='any')
    tweets['createdAtMillis'] = tweets['time'].apply(lambda x: timestamp(str(x)[:19]))
    # timestamp() yields None for unparseable times; drop those rows so astype(int) cannot fail
    tweets = tweets.dropna(subset=['createdAtMillis'])

    notes = within_timeframe(notes)
    ratings = within_timeframe(ratings)
    tweets = within_timeframe(tweets)

    # consolidate dataframes
    # numeric_only=True: ratings also hold non-numeric columns, which cannot be averaged
    average_ratings = ratings.groupby('noteId').mean(numeric_only=True)
    note_labels = notes[['tweetId', 'classification', 'believable', 'harmful', 'validationDifficulty']]
    tweet_id_to_mode = note_labels.groupby(['tweetId']).agg(get_mode)
    tweetsWithNotes = tweets.set_index('tweetId').join(tweet_id_to_mode, lsuffix="_tweet", rsuffix="_note", how='inner')
    notesWithRatings = notes.set_index('noteId').join(average_ratings, lsuffix="_note", rsuffix="_rating")

    tweetsWithNotes.dropna(inplace=True, how='any')
    notesWithRatings.dropna(inplace=True, how='any')

    # clean up text
    move_links_and_punc(notesWithRatings, 'summary')
    remove_stopwords(notesWithRatings, 'clean_summary')

    move_links_and_punc(tweetsWithNotes, 'text')
    remove_stopwords(tweetsWithNotes, 'clean_text')

    # create polarity and subjectivity columns
    notesWithRatings['polarity'] = sentiment_polarity(notesWithRatings['clean_summary'])
    notesWithRatings['subjectivity'] = sentiment_subjectivity(notesWithRatings['clean_summary'])

    tweetsWithNotes['polarity'] = sentiment_polarity(tweetsWithNotes['clean_text'])
    tweetsWithNotes['subjectivity'] = sentiment_subjectivity(tweetsWithNotes['clean_text'])

    # convert helpfulness score into Twitter specified divisions
    fixHelpfulness(notesWithRatings, 'helpful')

    # create additional columns for nlp use: the token lists joined back into plain strings
    notesWithRatings['clean_summary_as_str'] = notesWithRatings['clean_summary'].apply(lambda x: ' '.join(x))
    tweetsWithNotes['clean_text_as_str'] = tweetsWithNotes['clean_text'].apply(lambda x: ' '.join(x))

    return notesWithRatings, tweetsWithNotes

### Pickle Dictionary Data for Use in Other Notebooks

In [None]:
# End dates for the snapshots: the 1st and the 15th of every month from February
# through September 2021, in chronological order.
dates = [
    f'{month:02d}/{day}/2021'
    for month in range(2, 10)
    for day in ('01', '15')
]

# Maps 'notes <end date>' / 'tweets <end date>' to the frames consolidated up to that date.
dictionary = {}
for date in dates:
    print(f'processing {date}')
    notesWithRatings, tweetsWithNotes = consolidate_files(
        'notes-2.tsv',
        'ratings-2.tsv',
        'noted-tweets.csv',
        end_date=date,
    )
    dictionary[f'notes {date}'] = notesWithRatings
    dictionary[f'tweets {date}'] = tweetsWithNotes

processing 02/01/2021
processing 02/15/2021
processing 03/01/2021
processing 03/15/2021
processing 04/01/2021
processing 04/15/2021
processing 05/01/2021
processing 05/15/2021
processing 06/01/2021
processing 06/15/2021
processing 07/01/2021
processing 07/15/2021
processing 08/01/2021
processing 08/15/2021
processing 09/01/2021
processing 09/15/2021


In [None]:
# Persist the dictionary of processed dataframes to 'processed-3.pickle' using the
# highest pickle protocol available.
with open('processed-3.pickle', 'wb') as handle:
    pickle.dump(dictionary, handle, protocol=pickle.HIGHEST_PROTOCOL)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=c0e62a2c-7f45-414e-8164-5bf51e09d482' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>