Goal: to derive information from tweet

Descriptive analysis of the text:
- using packages like NLTK, SpaCy 
- possible new features: 
    - .# of words,  # of chars, avg, length of words, most common words, occurrences of chars (e.g., ‘?’, ‘!’), most common smileys, most common multiple smiley occurrences, etc.). 

In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', '..')))
from preprocessing.preprocessing import *
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

### load data

In [2]:
df = load_dataset(subset_name="climatevisions_2019_popular")
df = preprocess_dataset(df, selected_columns=None)

Items: tweet_data
(5000, 9)


In [3]:
df.head()

Unnamed: 0,created_at,img_name,language,like_count,quote_count,referenced_tweets,retweet_count,text,tweet_id
0,2019-07-22T12:38:24.000Z,id_1153283149360762880_2019-07-22.jpg,en,b'82582',b'3918',,b'50280',the UN released a 740 page report compiled ove...,1153283149360762880
1,2019-08-20T09:28:39.000Z,id_1163744643600637952_2019-08-20.jpg,en,b'69820',b'2456',,b'51781',"The Amazon Rainforest, one of the wettest plac...",1163744643600637952
2,2019-04-28T18:51:22.000Z,id_1122574040936452097_2019-04-28.jpg,en,b'69235',b'87',,b'11051',just learned about climate change https://t.co...,1122574040936452097
3,2019-10-28T13:10:13.000Z,id_1188805167958974465_2019-10-28.jpg,en,b'65465',b'70',,b'6124',Climate change caused this. https://t.co/JG2Ly...,1188805167958974465
4,2019-03-19T16:30:00.000Z,id_1108042949449969666_2019-03-19.jpg,en,b'62852',b'976',,b'9145',#GreenNewDeal haters’ plan to address Climate ...,1108042949449969666


In [4]:
tweet_text = df['text']
tweet_text.describe()


count                                                  5000
unique                                                 5000
top       the UN released a 740 page report compiled ove...
freq                                                      1
Name: text, dtype: object

In [36]:
import nltk
from nltk.corpus import stopwords
import string
import re
import pandas as pd

def preprocess_text(series, remove_stopwords=True, remove_punctuation=True):
    # Remove URLs -> all tweet texts contain link to tweet
    series = series.apply(lambda text: re.sub(r'http\S+', '', text))
    
    series = series.apply(lambda text: nltk.word_tokenize(text))
    if remove_stopwords:
        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        stop_words.update(['s', '’', "''", '“', "'s", '”'])
        
        series = series.apply(lambda words: [word for word in words if word.lower() not in stop_words])

    if remove_punctuation:
        # Remove punctuation
        series = series.apply(lambda words: [word for word in words if word not in string.punctuation])

    return series

# Usage example:
series = pd.Series(["This is a sample text with stopwords and punctuation! I do banana & peanut butter) https://t.co/BnAg4x5lGS",
                    "Another example text with URLs and special characters!"])
preprocessed_series = preprocess_text(series)
print(preprocessed_series[0])


['sample', 'text', 'stopwords', 'punctuation', 'banana', 'peanut', 'butter']


### general metrics


In [37]:
tweet_text_no_punctuation = preprocess_text(tweet_text)
tweet_text_punctuation = preprocess_text(tweet_text, remove_punctuation=False)

# Calculate the number of words
num_words = tweet_text_no_punctuation.apply(len).sum()

# Calculate the number of characters
num_chars = tweet_text_no_punctuation.apply(lambda text: sum(len(word) for word in text)).sum()

# Calculate the average length of words
avg_word_length = num_chars / num_words

# Calculate the average length of a tweet text
avg_text_length = tweet_text_no_punctuation.apply(len).mean()

# Calculate the most common words
word_freq = FreqDist([word for text in tweet_text_no_punctuation for word in text])
most_common_words = word_freq.most_common(10)  # Change the number to get more or fewer common words

# Calculate the occurrences of characters
char_occurrences = {char: sum(text.count(char) for text in tweet_text_punctuation) for char in ['?', '!']}  # Add more characters as needed



# Calculate the most common smileys
smileys = [':)', ':D', ':(', ':P']  # Add more smileys as needed
smiley_freq = FreqDist([word for text in tweet_text_punctuation for word in text if word in smileys])
most_common_smileys = smiley_freq.most_common(5)  # Change the number to get more or fewer common smileys

# Calculate the most common multiple smiley occurrences
multiple_smiley_occurrences = {smiley: sum(text.count(smiley * 2) for text in tweet_text_punctuation) for smiley in smileys}

# Print the results
print("Number of words:", num_words)
print("Number of characters:", num_chars)
print("Average length of words:", avg_word_length)
print("Average length of tweet:", avg_text_length)
print("Most common words:", most_common_words)
print("Occurrences of characters:", char_occurrences)
print("Most common smileys:", most_common_smileys)
print("Most common multiple smiley occurrences:", multiple_smiley_occurrences)


Number of words: 97894
Number of characters: 656430
Average length of words: 6.705518213577951
Average length of tweet: 19.5788
Most common words: [('climate', 3370), ('change', 3125), ('Climate', 765), ('ClimateChange', 635), ('climatechange', 615), ('people', 374), ('Change', 361), ('world', 340), ('today', 312), ('action', 303)]
Occurrences of characters: {'?': 603, '!': 1097}
Most common smileys: []
Most common multiple smiley occurrences: {':)': 0, ':D': 0, ':(': 0, ':P': 0}


### tweet specific metrics

In [211]:
# methods:
import emoji
import regex

def extract_emojis(text):
  emoji_list = []
  emojis_iterator = emoji.analyze(text)
  for e in emojis_iterator:
    v = e.value
    emoji_list.append(v.emoji)
    # print(v.emoji)
    # print(e.data) --> access data as alias
  return emoji_list

In [219]:
import emoji

df_results = pd.DataFrame()
df_results['text'] = preprocess_text(tweet_text, False, False)

# get number of exclamation marks in each tweet
df_results["exclamation_marks"] = df_results['text'].apply(lambda text: text.count('!'))

# get number of question marks in each tweet
df_results["question_marks"] = df_results['text'].apply(lambda text: text.count('?'))

# get number of hashtags in each tweet
df_results["hashtags"] = df_results['text'].apply(lambda text: text.count('#'))

# get number of mentions in each tweet
df_results["mentions"] = df_results['text'].apply(lambda text: text.count('@'))

df_results["no_of_emojis"] = df_results['text'].apply(lambda text: emoji.emoji_count(text))

df_results["emojis"] = df_results['text'].apply(lambda text:  [emoji for sublist in extract_emojis(text) for emoji in sublist]) # extract_emojis(text))

df_results["no_of_distinct_emojis"] =  df_results['emojis'].apply(lambda emojis: len(set(emojis)))

df_results.head()

# just for double checking if sth exists
df_results[df_results['no_of_emojis'] !=  df_results['no_of_distinct_emojis']].head()

Unnamed: 0,text,exclamation_marks,question_marks,hashtags,mentions,no_of_emojis,emojis,no_of_distinct_emojis
1500,"[Me, :, Hi, @, BetoORourke, ,, I, ’, m, ..., B...",3,1,3,1,4,"[🚪, 🚪, 🚪, 🚪]",1
1859,"[Canada, is, facing, a, climate, emergency.The...",0,0,1,0,2,"[✅, ✅]",1
2214,"[Justin, Trudeau, is, talking, about, climate,...",0,0,3,0,2,"[➡, ➡]",1
2464,"[We, can, turn, the, tide, on, biodiversity, l...",0,0,1,0,4,"[✅, ✅, 🌳, ✅]",2
2878,"[Today, ,, Gov, ., Cooper, will, testify, befo...",0,0,0,0,2,"[📽, 📽]",1


In [223]:
df_results.iloc[4996]

text                     [🚨, Es, wird, Zeit, zu, handeln, !, München, r...
exclamation_marks                                                        2
question_marks                                                           0
hashtags                                                                 4
mentions                                                                 0
no_of_emojis                                                             2
emojis                                                              [🚨, 🚨]
no_of_distinct_emojis                                                    1
Name: 4996, dtype: object

In [221]:
df_results.iloc[78] # TODO: sometimes emojis are wrongly tokenized and then not recognized as emojis

text                     [Slide, for, thermal, print, ➡️🌡️, ., This, is...
exclamation_marks                                                        0
question_marks                                                           0
hashtags                                                                 0
mentions                                                                 0
no_of_emojis                                                             3
emojis                                                           [💃, 🏺, 🌍]
no_of_distinct_emojis                                                    3
Name: 78, dtype: object

### tests
delete later



In [172]:
import emoji
import regex

def extract_emojis(text):
  emoji_list = []
  # EMOJIS = emoji.EMOJI_DATA
  # data = regex.findall(r'\X', text)
  # flags = regex.findall(u'[\U0001F1E6-\U0001F1FF]', text) 
  # print(flags)
  # for word in data:
  #   if any(char in EMOJIS for char in word):
  #       emoji_list.append(word)
  # return emoji_list + flags
  # emoji package has difficulties with flags


  # emoji_list = [word for word in text.split() if str(word.encode('unicode-escape'))[2] == '\\' ]

  emojis_iterator = emoji.analyze(text)
  for e in emojis_iterator:
    v = e.value
    emoji_list.append(v.emoji)
    #print(v.emoji)
    # print(e.data) --> access data as alias
  # for e in emoji_list:
  #   print(f"{len(e)}  {e}")
  return emoji_list

#  return ''.join(c for c in s if c in EMOJIS)

In [126]:
test_em_string = "🇦🇺 😂 😂 h d 🤟🏾 q 🤟🏻 👆🏻 🏳️‍🌈 🌎 ➡ 🏺 💃 ➡️🌡️ 🎹  👩🏼‍❤️‍💋‍👩🏽  🫃🏻"

In [168]:
print(test_em_string)

🇦🇺 😂 😂 h d 🤟🏾 q 🤟🏻 👆🏻 🏳️‍🌈 🌎 ➡ 🏺 💃 ➡️🌡️ 🎹  👩🏼‍❤️‍💋‍👩🏽  🫃🏻


In [173]:
x = extract_emojis(test_em_string)  # tweet_text[4996])

In [174]:
print(x)

['🇦🇺', '😂', '😂', '🤟🏾', '🤟🏻', '👆🏻', '🏳️\u200d🌈', '🌎', '➡', '🏺', '💃', '➡️', '🌡️', '🎹', '👩🏼\u200d❤️\u200d💋\u200d👩🏽', '\U0001fac3🏻']


In [176]:
emoji.emojize(test_em_string)    

'🇦🇺 😂 😂 h d 🤟🏾 q 🤟🏻 👆🏻 🏳️\u200d🌈 🌎 ➡ 🏺 💃 ➡️🌡️ 🎹  👩🏼\u200d❤️\u200d💋\u200d👩🏽  \U0001fac3🏻'

In [177]:
emoji.emoji_count(test_em_string)

16

In [186]:
len(emoji.distinct_emoji_list(test_em_string))

15

In [84]:
emoji.emoji_list(test_em_string)

[{'match_start': 0, 'match_end': 1, 'emoji': '😂'},
 {'match_start': 2, 'match_end': 3, 'emoji': '😂'},
 {'match_start': 8, 'match_end': 10, 'emoji': '🤟🏾'},
 {'match_start': 13, 'match_end': 15, 'emoji': '🤟🏻'},
 {'match_start': 16, 'match_end': 18, 'emoji': '👆🏻'},
 {'match_start': 19, 'match_end': 21, 'emoji': '🇦🇺'},
 {'match_start': 22, 'match_end': 26, 'emoji': '🏳️\u200d🌈'},
 {'match_start': 27, 'match_end': 28, 'emoji': '🌎'},
 {'match_start': 29, 'match_end': 30, 'emoji': '➡'},
 {'match_start': 31, 'match_end': 32, 'emoji': '🏺'},
 {'match_start': 33, 'match_end': 34, 'emoji': '💃'},
 {'match_start': 35, 'match_end': 37, 'emoji': '➡️'},
 {'match_start': 37, 'match_end': 39, 'emoji': '🌡️'}]

In [86]:
emoji.demojize(test_em_string)

':face_with_tears_of_joy: :face_with_tears_of_joy: h d :love-you_gesture_medium-dark_skin_tone: q :love-you_gesture_light_skin_tone: :backhand_index_pointing_up_light_skin_tone: :Australia: :rainbow_flag: :globe_showing_Americas: :right_arrow: :amphora: :woman_dancing: :right_arrow::thermometer:'

ideas:
- flags as nation bezug
- count of emojis used
- count of distinct emojis used
- add NER
