#### This script can be used to calculate statistics and characteristics of #hashtags and words used in the dataset built for my BAC2

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
#import emoji
import re
from collections import Counter
import itertools
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import string
from wordcloud import WordCloud

[nltk_data] Downloading package stopwords to /home/mhecht/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mhecht/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Function definitions

In [2]:
def extract_hashtags(text):
    hashtag_list = []
    for tweets in text:
        for word in tweets.split():
            if word[0] == '#':
                if len(word[1:].split('#')) > 1:
                    for w in word[1:].split('#'): 
                        if w != '#':
                            hashtag_list.append(re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', w).strip('.,"').lower()) 
                else:
                    hashtag_list.append(re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', word[1:]).strip('.,"').lower())#word[1:])
    return hashtag_list

In [3]:
def extract_words(text):
    word_list = []
    for tweets in text:
        # removing urls
        tweets = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', tweets.lower())
        # manually remove following punctuation
        tweets = re.sub(r'’|”|‘|“', ' ',tweets)
        # removing punctiation
        tweets = tweets.translate(str.maketrans('','',string.punctuation))        
        # tokenize
        text_tokens = word_tokenize(tweets)
        tokens_without_sw = [word for word in text_tokens if not word in stopwords.words('german')]
        for word in tokens_without_sw:
            # adding the word to the list
            word_list.append(word)
    return word_list

In [4]:
# get_frequencies: used to determine frequencies of words in the corpus occuring in the dataset 
def get_frequencies(corpus, words):
    freq_dict = {}
    for word in corpus:
        freq_dict[word] = words.get(word, 0)
    return freq_dict        

In [5]:
# sort_dict: sort dictionary according to the values
def sort_dict(dictionary, reverse=False):
    sorted_tuples = sorted(dictionary.items(), key=lambda item: item[1], reverse=reverse)
    sorted_dict = {k: v for k, v in sorted_tuples}
    return sorted_dict

#### Data import

In [12]:
import os
print(os.getcwd())
data_path = 'final_dataset.csv'
data = pd.read_csv(data_path, sep=',', encoding='utf-8')
data.head()

/home/workdrive/mhecht/hatespeech


Unnamed: 0.1,Unnamed: 0,text,hate,dataset
0,0,"Meine Mutter hat mir erzählt, dass mein Vater ...",0,germeval2018
1,1,@Tom174_ @davidbest95 Meine Reaktion; |LBR| Ni...,0,germeval2018
2,2,"#Merkel rollt dem Emir von #Katar, der islamis...",0,germeval2018
3,3,„Merle ist kein junges unschuldiges Mädchen“ K...,0,germeval2018
4,4,@umweltundaktiv Asylantenflut bringt eben nur ...,1,germeval2018


#### Get Sum of each class

In [13]:
print((data.hate == 0).sum())
print((data.hate == 1).sum())


17282
27849


#### Create Wordcloud

In [10]:
text = data['hate' == 1])
# Create and generate a word cloud image:
wordcloud = WordCloud(max_font_size=50, max_words=10000, background_color="white").generate(text)

# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

SyntaxError: invalid syntax (1970379109.py, line 1)

#### Loop over all tweets to generate dictionaries containing hashtags and all words (excluding stopwords and urls)

In [11]:
all_hashtags = []
tasks =[]
all_words = []


#print(label)
text = data['text']
hashtags = extract_hashtags(text)

#print(hashtags)
wordcloud = WordCloud(max_font_size=50, max_words=10000, background_color="white").generate(str(hashtags))
#plt.imshow(wordcloud, interpolation='bilinear')
#plt.axis("off")
#plt.show()
hashtags_dict = {}
for c in range(len(hashtags)):
    hashtags_dict[hashtags[c]] = hashtags.count(hashtags[c])

sorted_hashtags_dict = sort_dict(hashtags_dict, reverse=True)

df = pd.DataFrame()
df = df.from_dict(sorted_hashtags_dict,orient='index')
df[:10]
#print("\n")

#all words
# words = extract_words(text)
# words_dict = {}
# for c in range(len(words)):
#     words_dict[words[c]] = words.count(words[c])

# sorted_words_dict = sort_dict(words_dict, reverse=True)

# #all_hashtags.append(sorted_hashtags_dict)
# tasks.append(label)
# all_words.append(sorted_words_dict)


Unnamed: 0,0
afd,874
merkel,552
spd,504
cdu,320
deutschland,290
berlin,204
asylanten,163
bundestag,158
flüchtlinge,143
fcsp,132


## Hashtags
#### Plots: all hashtags in tweets

In [None]:
all_hashtags

In [None]:
n = 40
for x in range(len(all_hashtags)):
    mylist = [key for key, val in dict(itertools.islice(all_hashtags[x].items(), n)).items() for _ in range(val)]
    plt.hist(mylist, np.arange(n+1)-0.5)
    plt.xticks(rotation=90)
    plt.title("language: {}; class: {}".format(tasks[x][0],tasks[x][1]))
    plt.show()


## All Words
#### Plots: all words in tweets

In [None]:
n = 40
for x in range(len(all_words)):
    mylist = [key for key, val in dict(itertools.islice(all_words[x].items(), 0, n)).items() for _ in range(val)]
    plt.hist(mylist, np.arange(n+1)-0.5)
    plt.xticks(rotation=90)
    plt.title("language: {}; class: {}".format(tasks[x][0],tasks[x][1]))
    plt.show()
