# Import libraries

In [428]:
import pandas as pd
import numpy as np
%matplotlib inline
# Plotly imports
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

# Other imports
from sklearn.decomposition import LatentDirichletAllocation
from matplotlib import pyplot as plt
%matplotlib inline

plt.style.use("fivethirtyeight")

# Data processing

## Read data

In [429]:
import json
import os

data_list = []

for file in os.listdir("../"):
    if file.endswith(".txt"):
        with open("../" + file, "r", encoding="UTF-8") as f:
            data_string = f.read().replace("\n", "")
            loaded_json = json.loads(data_string)
            data_list.append(loaded_json)

In [430]:
# Merge data
pd_data = pd.DataFrame({"created_at": [], "tweet_id": [], "content": [], "author_id": []})

for data_dict in data_list:
    data = data_dict["data"]
    for data_entry in data:
        data_entry_renamed = {"created_at": data_entry["created_at"], "tweet_id": data_entry["id"], "content": data_entry["text"], "author_id": data_entry["author_id"]}
        pd_data = pd_data.append(data_entry_renamed, ignore_index=True)

In [431]:
pd_data

Unnamed: 0,created_at,tweet_id,content,author_id
0,2022-07-05T18:50:20.000Z,1544393254929661952,Das Pforzheimer Nachtleben blüht wieder auf. V...,806082807562960896
1,2022-07-05T18:31:02.000Z,1544388399817277440,"@Kranzschwinger Indem sie froh snd, dass es ni...",257134702
2,2022-07-05T18:01:17.000Z,1544380912313503746,RT @BNN_BaNeuNa: Hat die gute alte Leitplanke ...,1256684331386515457
3,2022-07-05T17:44:51.000Z,1544376779137327106,"RT @simonschre: Eisenbahnen, die ich heute zum...",2419492255
4,2022-07-05T17:20:08.000Z,1544370557076004866,"Eisenbahnen, die ich heute zum ersten Mal gefa...",1960909934
...,...,...,...,...
395,2022-07-09T05:52:29.000Z,1545647057087651841,RT @wayfu: @Ethylendiamin vermute Essen &gt; D...,281413387
396,2022-07-09T05:50:55.000Z,1545646660004495360,RT @wayfu: @Ethylendiamin vermute Essen &gt; D...,1478025287245967365
397,2022-07-09T05:50:49.000Z,1545646637585932288,RT @wayfu: @Ethylendiamin vermute Essen &gt; D...,531325229
398,2022-07-09T05:49:15.000Z,1545646240372768768,RT @kleinleise: Heute kommen etliche von Ihnen...,756611547825377280


## Data cleaning

### Remove duplicates

In [432]:
pd_data = pd_data.drop_duplicates()
pd_data_original = pd_data.copy() # Will not be changed anymore

### Detect language and filter non german tweets, also remove http links

In [433]:
import re
import langid

tweet_ids_to_remove = []

for index, row in pd_data.iterrows():
    content = row["content"]
    content_language = langid.classify(content)[0]
    # Remove non german tweets
    if content_language != "de":
        tweet_ids_to_remove.append(row["tweet_id"])
    # Remove http links because removing them through stopwords doesn't work
    new_content = re.sub(r"\bhttp\w+", "", content)
    pd_data.loc[index]["content"] = new_content

for tweet_id in tweet_ids_to_remove:
    pd_data = pd_data[pd_data["tweet_id"] != tweet_id]

### Clean content

In [434]:
# Lowercase
pd_data["content"] = pd_data["content"].str.lower()

In [435]:
# Stopword source https://raw.githubusercontent.com/solariz/german_stopwords/master/german_stopwords_full.txt
custom_stopwords = []
with open("stopwords.txt", 'r', encoding="UTF-8") as file:
    for line in file:
        custom_stopwords.append(line.replace("\n", ""))

additional_stopwords = ["rt", "fast", "deren", "mehr", "http", "co", "8cn3awvt9v", "eigentlich", "j3adilwkef"]
# Remove Pforzheim because it is obvious that all tweets should be about Pforzheim
additional_stopwords.append("pforzheim")
additional_stopwords.append("pf")
additional_stopwords.append("pforzheimer")

custom_stopwords = custom_stopwords + additional_stopwords

# Entity analysis

In [436]:
import re

author_dict = {}
hashtag_dict = {}

for index, row in pd_data_original.iterrows():
    # Collect authors, increment occurrences
    this_author_id = row["author_id"]
    if this_author_id not in author_dict:
        author_dict[this_author_id] = 1
    else:
        author_dict[this_author_id] += 1
    # Collect hashtags
    content = row["content"]
    this_hashtags = re.findall(r"#(\w+)", content)
    for this_hashtag in this_hashtags:
        this_hashtag = this_hashtag.lower()
        if this_hashtag not in hashtag_dict:
            hashtag_dict[this_hashtag] = 1
        else:
            hashtag_dict[this_hashtag] += 1

# Sort dictionaries
author_dict = sorted(author_dict.items(), key=lambda x: x[1], reverse=True)
hashtag_dict = sorted(hashtag_dict.items(), key=lambda x: x[1], reverse=True)

In [437]:
author_dict

[('1420498158577242116', 22),
 ('24165538', 14),
 ('1184022676488314880', 11),
 ('806082807562960896', 7),
 ('747113754006065152', 7),
 ('1523376024876167172', 5),
 ('936719821064097797', 5),
 ('1305951228539658240', 4),
 ('1411270748266807296', 4),
 ('105591210', 4),
 ('878964397103775744', 4),
 ('2419492255', 3),
 ('123258168', 3),
 ('36973702', 3),
 ('356565974', 3),
 ('1375959707689615364', 3),
 ('1525082147501092865', 3),
 ('1118474756590796800', 3),
 ('1960909934', 2),
 ('15929362', 2),
 ('1427678298352541699', 2),
 ('1202872538281467904', 2),
 ('15170704', 2),
 ('1041350253713088512', 2),
 ('18119691', 2),
 ('1078912669', 2),
 ('3049086521', 2),
 ('433223957', 2),
 ('259787794', 2),
 ('973940217508958208', 2),
 ('1500031172491026432', 2),
 ('1534550535596023808', 2),
 ('4185773171', 2),
 ('1361415758', 2),
 ('898620970012991488', 2),
 ('702483013750366208', 2),
 ('2606233777', 2),
 ('1504855210648391684', 2),
 ('79946862', 2),
 ('1485182559306891265', 2),
 ('902085877', 2),
 ('7

In [438]:
hashtag_dict

[('pforzheim', 78),
 ('news', 23),
 ('blaulicht', 22),
 ('badenwürttemberg', 11),
 ('pforzheimgram', 10),
 ('karlsruhe', 6),
 ('schwarzwald', 6),
 ('a8', 6),
 ('u5', 5),
 ('hypromag', 5),
 ('rareearths', 5),
 ('job', 4),
 ('freiheit', 4),
 ('erdbeben', 4),
 ('training', 3),
 ('schule', 3),
 ('kampfkunst', 3),
 ('karrieremodus', 3),
 ('stuttgart', 3),
 ('mustread', 3),
 ('cemdogan', 2),
 ('diamonds', 2),
 ('jewelry', 2),
 ('paulinakurka', 2),
 ('phillipdeml', 2),
 ('startups', 2),
 ('sports', 2),
 ('school', 2),
 ('stellenangebot', 2),
 ('feuerwehr', 2),
 ('magdeburg', 2),
 ('stanztechnik', 2),
 ('uffbasse', 2),
 ('zeugengesucht', 2),
 ('wiesenbach', 2),
 ('fahrradmordor', 2),
 ('enzkreis', 2),
 ('trending', 2),
 ('duitsland', 2),
 ('nagold', 2),
 ('dbregio_bw', 2),
 ('fifa22', 2),
 ('stz', 2),
 ('pznews', 2),
 ('9euroticket', 2),
 ('fantasy', 2),
 ('fiction', 2),
 ('amazon', 2),
 ('kindle', 2),
 ('historicalfantasy', 2),
 ('look4books', 2),
 ('books', 2),
 ('paperbacks', 2),
 ('paperba

# Topic analysis - get the 5 most prominent topics

### Tokenize, stopword filtering and lemmatize

In [439]:
from DataAnalyticsIU.helper_functions import LemmaCountVectorizer
from nltk.corpus import stopwords

content_text_list = list(pd_data.content.values)

# german_stop_words = frozenset(custom_stopwords)
german_stop_words = stopwords.words('german')
german_stop_words = german_stop_words + custom_stopwords

# Min_df is chosen to exclude words with very low frequency
tf_vectorizer = LemmaCountVectorizer(min_df=2,
                                     stop_words=german_stop_words,
                                     decode_error='ignore')
data_vectorized = tf_vectorizer.fit_transform(content_text_list)


Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['dat', 'inf', 'zb'] not in stop_words.



In [440]:
# Visualize word occurrences after preprocessing
feature_names = tf_vectorizer.get_feature_names()
count_vec = np.asarray(data_vectorized.sum(axis=0)).ravel()
zipped = list(zip(feature_names, count_vec))
x, y = (list(x) for x in zip(*sorted(zipped, key=lambda x: x[1], reverse=True)))
# Now I want to extract out on the top 15 and bottom 15 words
Y = np.concatenate([y[0:15], y[-16:-1]])
X = np.concatenate([x[0:15], x[-16:-1]])

# Plotting the Plot.ly plot for the Top 50 word frequencies
data = [go.Bar(
            x = x[0:50],
            y = y[0:50],
            marker= dict(colorscale='Jet',
                         color = y[0:50]
                        ),
            text='Word counts'
    )]

layout = go.Layout(
    title='Top 50 Word frequencies after Preprocessing'
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-bar')


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.



### Implement Latent Dirichlet Allocation

In [441]:
from sklearn.model_selection import GridSearchCV

# Do GridSearch to find the best possible n_components parameter

search_params = {'n_components': [5, 7, 9, 10], 'learning_decay': [.5, .7, .9], 'learning_offset': [1, 3, 5, 7, 9, 10], 'max_iter': [5, 7, 10]}

# Init the Model
lda = LatentDirichletAllocation()

# Comment out to improve performance
# Init Grid Search Class
#model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
#model.fit(data_vectorized)

# Best Model
#best_lda_model = model.best_estimator_

# Model Parameters
#print("Best Model's Params: ", model.best_params_)

In [442]:
# Implement algorithm with best parameters
lda = LatentDirichletAllocation(n_components=5, max_iter=7,
                                learning_method = 'online',
                                learning_offset = 3,
                                learning_decay=0.5,
                                random_state = 6432143)

In [443]:
lda.fit(data_vectorized)

In [444]:
from DataAnalyticsIU.helper_functions import print_top_words

n_top_words = 40
print("\nTopics in LDA model: ")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model: 

Topic #0:kleinleise fragen wartezeiten entstehen twittertreffen 07 22 info bad erdbeben nähere warten beruhigt eingebildet 16 2022 uhr woche pfbits u5 dienstag 10 samstag 12 09 boost 18 impfen 05 wildbad dienersebastian gefahren systemwechselstelle kassel lexiikon zuckerfresse liberalochris trending grad uneigentlich

Topic #1:news pol blaulicht enzkreis haus gesucht job aktuell gt karlsruhe zeugen darmstadt calw bewerben km booster polizeipf polizei richtung nix gefahren impfstelle kreis stadt_pforzheim gewackelt wind erdbeben cw einbruch stadt nagold 27 gewahrsam essen fds amp natenom körperverletzung firma earlytwix

Topic #2:erdbeben gewackelt brückenbau stuttgart radfahrer dauert 9euroticket sinn gespürt nachrichten doxograf fahrradmordor ecke natenom freiheit paperback liga juli haft bw verantwortlichen verhindert berlin abschiebeknast frau fifa22 karrieremodus uffc book mustread polizei werd antira_netzwerk exhibitionist belästigt hbf trending verzweifelt