# Import libraries

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
# Plotly imports
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

# Other imports
from sklearn.decomposition import LatentDirichletAllocation
from matplotlib import pyplot as plt
%matplotlib inline

plt.style.use("fivethirtyeight")

# Data processing

## Read data

In [2]:
import json
import os

data_list = []

for file in os.listdir("../"):
    if file.endswith(".txt"):
        with open("../" + file, "r", encoding="UTF-8") as f:
            data_string = f.read().replace("\n", "")
            loaded_json = json.loads(data_string)
            data_list.append(loaded_json)

In [3]:
# Merge data
pd_data = pd.DataFrame({"created_at": [], "tweet_id": [], "content": [], "author_id": []})

for data_dict in data_list:
    data = data_dict["data"]
    for data_entry in data:
        data_entry_renamed = {"created_at": data_entry["created_at"], "tweet_id": data_entry["id"], "content": data_entry["text"], "author_id": data_entry["author_id"]}
        pd_data = pd_data.append(data_entry_renamed, ignore_index=True)

In [4]:
pd_data

Unnamed: 0,created_at,tweet_id,content,author_id
0,2022-07-05T18:50:20.000Z,1544393254929661952,Das Pforzheimer Nachtleben blüht wieder auf. V...,806082807562960896
1,2022-07-05T18:31:02.000Z,1544388399817277440,"@Kranzschwinger Indem sie froh snd, dass es ni...",257134702
2,2022-07-05T18:01:17.000Z,1544380912313503746,RT @BNN_BaNeuNa: Hat die gute alte Leitplanke ...,1256684331386515457
3,2022-07-05T17:44:51.000Z,1544376779137327106,"RT @simonschre: Eisenbahnen, die ich heute zum...",2419492255
4,2022-07-05T17:20:08.000Z,1544370557076004866,"Eisenbahnen, die ich heute zum ersten Mal gefa...",1960909934
...,...,...,...,...
495,2022-07-10T09:58:04.000Z,1546071248156909571,@pznews Die Schlossberghöfe in Pforzheim spalt...,356565974
496,2022-07-10T09:49:29.000Z,1546069086790189056,@pznews Die Schlossberghöfe in Pforzheim spalt...,356565974
497,2022-07-10T09:42:38.000Z,1546067360875937793,Alle Informationen unter dieser Url: https://t...,936719821064097797
498,2022-07-10T09:38:37.000Z,1546066351294488576,Vom 14. - 16. Juli 2022 findet die inzwischen ...,3235981942


## Data cleaning

### Remove duplicates

In [5]:
pd_data = pd_data.drop_duplicates()
pd_data_original = pd_data.copy() # Will not be changed anymore

### Detect language and filter non german tweets, also remove http links

In [6]:
import re
import langid

tweet_ids_to_remove = []

for index, row in pd_data.iterrows():
    content = row["content"]
    content_language = langid.classify(content)[0]
    # Remove non german tweets
    if content_language != "de":
        tweet_ids_to_remove.append(row["tweet_id"])
    # Remove http links because removing them through stopwords doesn't work
    new_content = re.sub(r"\bhttp\w+", "", content)
    pd_data.loc[index]["content"] = new_content

for tweet_id in tweet_ids_to_remove:
    pd_data = pd_data[pd_data["tweet_id"] != tweet_id]

### Clean content

In [7]:
# Lowercase
pd_data["content"] = pd_data["content"].str.lower()

In [8]:
# Stopword source https://raw.githubusercontent.com/solariz/german_stopwords/master/german_stopwords_full.txt
custom_stopwords = []
with open("stopwords.txt", 'r', encoding="UTF-8") as file:
    for line in file:
        custom_stopwords.append(line.replace("\n", ""))

additional_stopwords = ["rt", "fast", "deren", "mehr", "http", "co", "8cn3awvt9v", "eigentlich", "j3adilwkef"]
# Remove Pforzheim because it is obvious that all tweets should be about Pforzheim
additional_stopwords.append("pforzheim")
additional_stopwords.append("pf")
additional_stopwords.append("pforzheimer")

custom_stopwords = custom_stopwords + additional_stopwords

# Entity analysis

In [9]:
import re

author_dict = {}
hashtag_dict = {}

for index, row in pd_data_original.iterrows():
    # Collect authors, increment occurrences
    this_author_id = row["author_id"]
    if this_author_id not in author_dict:
        author_dict[this_author_id] = 1
    else:
        author_dict[this_author_id] += 1
    # Collect hashtags
    content = row["content"]
    this_hashtags = re.findall(r"#(\w+)", content)
    for this_hashtag in this_hashtags:
        this_hashtag = this_hashtag.lower()
        if this_hashtag not in hashtag_dict:
            hashtag_dict[this_hashtag] = 1
        else:
            hashtag_dict[this_hashtag] += 1

# Sort dictionaries
author_dict = sorted(author_dict.items(), key=lambda x: x[1], reverse=True)
hashtag_dict = sorted(hashtag_dict.items(), key=lambda x: x[1], reverse=True)

In [10]:
author_dict

[('1420498158577242116', 33),
 ('24165538', 16),
 ('1184022676488314880', 11),
 ('747113754006065152', 8),
 ('806082807562960896', 7),
 ('1523376024876167172', 7),
 ('936719821064097797', 6),
 ('105591210', 6),
 ('1305951228539658240', 5),
 ('1375959707689615364', 5),
 ('1118474756590796800', 5),
 ('2419492255', 4),
 ('123258168', 4),
 ('1411270748266807296', 4),
 ('878964397103775744', 4),
 ('872760537284833280', 4),
 ('1520024378016206848', 3),
 ('36973702', 3),
 ('15170704', 3),
 ('356565974', 3),
 ('1525082147501092865', 3),
 ('19447047', 3),
 ('1485182559306891265', 3),
 ('902085877', 3),
 ('1960909934', 2),
 ('15929362', 2),
 ('1427678298352541699', 2),
 ('1202872538281467904', 2),
 ('1041350253713088512', 2),
 ('18119691', 2),
 ('1078912669', 2),
 ('1160220612033310720', 2),
 ('3049086521', 2),
 ('1489157743860723718', 2),
 ('433223957', 2),
 ('259787794', 2),
 ('973940217508958208', 2),
 ('1500031172491026432', 2),
 ('1534550535596023808', 2),
 ('4185773171', 2),
 ('1361415758'

In [11]:
hashtag_dict

[('pforzheim', 87),
 ('news', 34),
 ('blaulicht', 33),
 ('badenwürttemberg', 12),
 ('pforzheimgram', 11),
 ('schwarzwald', 7),
 ('karlsruhe', 6),
 ('a8', 6),
 ('hypromag', 6),
 ('rareearths', 6),
 ('u5', 5),
 ('training', 4),
 ('schule', 4),
 ('kampfkunst', 4),
 ('job', 4),
 ('fahrradmordor', 4),
 ('fifa22', 4),
 ('karrieremodus', 4),
 ('freiheit', 4),
 ('erdbeben', 4),
 ('stanztechnik', 3),
 ('stuttgart', 3),
 ('mustread', 3),
 ('cemdogan', 2),
 ('diamonds', 2),
 ('jewelry', 2),
 ('paulinakurka', 2),
 ('phillipdeml', 2),
 ('startups', 2),
 ('sports', 2),
 ('school', 2),
 ('stellenangebot', 2),
 ('feuerwehr', 2),
 ('magdeburg', 2),
 ('nowplaying', 2),
 ('uffbasse', 2),
 ('zeugengesucht', 2),
 ('wiesenbach', 2),
 ('kleinergmbh', 2),
 ('kleiner', 2),
 ('enzkreis', 2),
 ('trending', 2),
 ('duitsland', 2),
 ('nagold', 2),
 ('dbregio_bw', 2),
 ('selfdefense', 2),
 ('schwert', 2),
 ('bartitsu', 2),
 ('selbstverteidigung', 2),
 ('workshops', 2),
 ('sport', 2),
 ('stz', 2),
 ('pznews', 2),
 ('

# Topic analysis - get the 5 most prominent topics

### Tokenize, stopword filtering and lemmatize

In [12]:
from DataAnalyticsIU.helper_functions import LemmaCountVectorizer
from nltk.corpus import stopwords

content_text_list = list(pd_data.content.values)

# german_stop_words = frozenset(custom_stopwords)
german_stop_words = stopwords.words('german')
german_stop_words = german_stop_words + custom_stopwords

# Min_df is chosen to exclude words with very low frequency
tf_vectorizer = LemmaCountVectorizer(min_df=2,
                                     stop_words=german_stop_words,
                                     decode_error='ignore')
data_vectorized = tf_vectorizer.fit_transform(content_text_list)


Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['dat', 'inf', 'zb'] not in stop_words.



frozenset({'die', 'oberen', 'überaus', 'insgesamter', 'ausgenommener', 'dieses', 'deine', 'duerftet', 'vollständigen', 'etliche', 'auf', 'immer', 'durftest', 'befragter', 'euren', 'erscheinen', 'gleichem', 'am', 'deinem', 'gleiches', 'bevor', 'voelligen', 'womoeglich', 'neulich', 'such', 'off', 'persoenlich', 'allg.', 'waehrend', 'aeusserst', 'folgend', 'bislang', 'gängige', 'aehnlich', 'berichtet', 'manches', 'reagierte', 'weiterhin', 'werdet', 'ganzes', 'jeglicher', 'dazwischen', 'gruendlich', 'koennt', 'mochte', 'fortsetzt', 'startet', 'letztens', 'diesseitiger', 'unsaegliche', 'vorbei', 'hätten', 'ungleiches', 'mancher', 'woraus', 'pforzheimer', 'anderweitiges', 'mitten', 'moeglichste', 'besseren', 'eigenst', 'möglichsten', 'eine', 'entsprechende', 'genug', 'bestimmte', 'deines', 'öfter', 'vollstaendig', 'aeussersten', 'sagten', 'häufigeren', 'je', 'geteilte', 'ehestes', 'erstere', 'ihretwegen', 'geworden', 'woran', 'wievieler', 'ans', 'besser', 'zur', 'ich', 'dank', 'voellig', 'du

In [13]:
# Visualize word occurrences after preprocessing
feature_names = tf_vectorizer.get_feature_names()
count_vec = np.asarray(data_vectorized.sum(axis=0)).ravel()
zipped = list(zip(feature_names, count_vec))
x, y = (list(x) for x in zip(*sorted(zipped, key=lambda x: x[1], reverse=True)))
# Now I want to extract out on the top 15 and bottom 15 words
Y = np.concatenate([y[0:15], y[-16:-1]])
X = np.concatenate([x[0:15], x[-16:-1]])

# Plotting the Plot.ly plot for the Top 50 word frequencies
data = [go.Bar(
            x = x[0:50],
            y = y[0:50],
            marker= dict(colorscale='Jet',
                         color = y[0:50]
                        ),
            text='Word counts'
    )]

layout = go.Layout(
    title='Top 50 Word frequencies after Preprocessing'
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-bar')


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.



### Implement Latent Dirichlet Allocation

In [14]:
from sklearn.model_selection import GridSearchCV

# Do GridSearch to find the best possible n_components parameter

search_params = {'n_components': [5, 7, 9, 10], 'learning_decay': [.5, .7, .9], 'learning_offset': [1, 3, 5, 7, 9, 10], 'max_iter': [5, 7, 10]}

# Init the Model
lda = LatentDirichletAllocation()

# Comment out to improve performance
# Init Grid Search Class
#model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
#model.fit(data_vectorized)

# Best Model
#best_lda_model = model.best_estimator_

# Model Parameters
#print("Best Model's Params: ", model.best_params_)

In [15]:
# Implement algorithm with best parameters
lda = LatentDirichletAllocation(n_components=5, max_iter=7,
                                learning_method = 'online',
                                learning_offset = 3,
                                learning_decay=0.5,
                                random_state = 6432143)

In [16]:
lda.fit(data_vectorized)

In [17]:
from DataAnalyticsIU.helper_functions import print_top_words

n_top_words = 40
print("\nTopics in LDA model: ")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model: 

Topic #0:pol news blaulicht enzkreis fds zeugen gesucht verletzt cw schwer einbruch haus unbekannter fahrradmordor verkehrsunfall bw tiefenbronn stunde fahrradfahrer auto stück booster u12schutz nummer offlabelu5 karlsruhe mehrfamilienhaus verursacht csd polizei km natenom berlin firma haft verantwortlichen abschiebeknast verhindert freudenstadt baiersbronn

Topic #1:pforzheimgram darmstadt badenwürttemberg essen url ausgesucht informationen schwarzwald gt a8 pznews karlsruhe schwert polizeipf stadt_pforzheim ste7130 natenom zeitung hs_pforzheim polizei kampfkunst fecht sport training hut schule fehler kurz paperback wandern hipsterl0s brücke impfstelle heilbronn earlytwix frau booster verdauungstrasse michael_karalus vermute

Topic #2:kleinleise twittertreffen entstehen wartezeiten fragen 2022 uhr 07 juli uffc hbf fifa22 16 liga club create 18 05 12 samstag karrieremodus 10 u5 aufstieg stehen amp woche dienstag boost impfen 09 angebote kulinarische nagold lasse


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.

