# Import libraries

In [378]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Plotly imports
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

# Other imports
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from matplotlib import pyplot as plt
%matplotlib inline

plt.style.use("fivethirtyeight")

# Data processing

## Read data

In [379]:
import json

with open("../data_05_07.txt", 'r', encoding="UTF-8") as file:
    data_string_05_07 = file.read().replace('\n', '')

with open("../data_07_07.txt", 'r', encoding="UTF-8") as file:
    data_string_07_07 = file.read().replace('\n', '')

with open("../data_09_07.txt", 'r', encoding="UTF-8") as file:
    data_string_09_07 = file.read().replace('\n', '')

data_05_07 = json.loads(data_string_05_07)
data_07_07 = json.loads(data_string_07_07)
data_09_07 = json.loads(data_string_09_07)

data_list = [data_05_07, data_07_07, data_09_07]

In [380]:
# Merge data
pd_data = pd.DataFrame({"created_at": [], "tweet_id": [], "content": [], "author_id": []})

for data_dict in data_list:
    data = data_dict["data"]
    for data_entry in data:
        data_entry_renamed = {"created_at": data_entry["created_at"], "tweet_id": data_entry["id"], "content": data_entry["text"], "author_id": data_entry["author_id"]}
        pd_data = pd_data.append(data_entry_renamed, ignore_index=True)


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated a

In [381]:
pd_data

Unnamed: 0,created_at,tweet_id,content,author_id
0,2022-07-05T18:50:20.000Z,1544393254929661952,Das Pforzheimer Nachtleben bl√ºht wieder auf. V...,806082807562960896
1,2022-07-05T18:31:02.000Z,1544388399817277440,"@Kranzschwinger Indem sie froh snd, dass es ni...",257134702
2,2022-07-05T18:01:17.000Z,1544380912313503746,RT @BNN_BaNeuNa: Hat die gute alte Leitplanke ...,1256684331386515457
3,2022-07-05T17:44:51.000Z,1544376779137327106,"RT @simonschre: Eisenbahnen, die ich heute zum...",2419492255
4,2022-07-05T17:20:08.000Z,1544370557076004866,"Eisenbahnen, die ich heute zum ersten Mal gefa...",1960909934
...,...,...,...,...
295,2022-07-08T09:06:38.000Z,1545333527998353409,Badenser halt! @ Pforzheim Hauptbahnhof https:...,1485182559306891265
296,2022-07-08T09:00:12.000Z,1545331908736950272,"@tuessl @CargoHey Na dann gute Fahrt, weit ist...",239812843
297,2022-07-08T08:45:49.000Z,1545328289572593664,#News #Blaulicht POL-Pforzheim: (Enzkreis) Neu...,1420498158577242116
298,2022-07-08T08:45:37.000Z,1545328237840142338,Werkzaamheden ‚öíÔ∏è in #Duitsland op de #A8 Karls...,259787794


## Data cleaning

### Remove duplicates

In [382]:
pd_data = pd_data.drop_duplicates()
pd_data_original = pd_data.copy() # Will not be changed anymore

### Detect language and filter non german tweets

In [383]:
import langid

tweet_ids_to_remove = []

for index, row in pd_data.iterrows():
    content = row['content']
    content_language = langid.classify(content)[0]
    if content_language != "de":
        tweet_ids_to_remove.append(row["tweet_id"])
        # print(content)

for tweet_id in tweet_ids_to_remove:
    pd_data = pd_data[pd_data["tweet_id"] != tweet_id]

### Clean content

In [384]:
# Lowercase
pd_data["content"] = pd_data["content"].str.lower()

In [385]:
# Stopword source https://raw.githubusercontent.com/solariz/german_stopwords/master/german_stopwords_full.txt
custom_stopwords = []
with open("stopwords.txt", 'r', encoding="UTF-8") as file:
    for line in file:
        custom_stopwords.append(line.replace("\n", ""))

additional_stopwords = ["rt", "fast", "deren", "mehr", "http", "co", "8cn3awvt9v", "eigentlich"]

custom_stopwords = custom_stopwords + additional_stopwords

# Entity analysis

In [386]:
import re

author_dict = {}
hashtag_dict = {}

for index, row in pd_data_original.iterrows():
    # Collect authors, increment occurrences
    this_author_id = row["author_id"]
    if this_author_id not in author_dict:
        author_dict[this_author_id] = 1
    else:
        author_dict[this_author_id] += 1
    # Collect hashtags
    content = row["content"]
    this_hashtags = re.findall(r"#(\w+)", content)
    for this_hashtag in this_hashtags:
        this_hashtag = this_hashtag.lower()
        if this_hashtag not in hashtag_dict:
            hashtag_dict[this_hashtag] = 1
        else:
            hashtag_dict[this_hashtag] += 1

# Sort dictionaries
author_dict = sorted(author_dict.items(), key=lambda x: x[1], reverse=True)
hashtag_dict = sorted(hashtag_dict.items(), key=lambda x: x[1], reverse=True)

In [387]:
author_dict

[('1420498158577242116', 22),
 ('1184022676488314880', 10),
 ('24165538', 10),
 ('806082807562960896', 7),
 ('1523376024876167172', 5),
 ('747113754006065152', 4),
 ('105591210', 4),
 ('1305951228539658240', 3),
 ('936719821064097797', 3),
 ('1411270748266807296', 3),
 ('1525082147501092865', 3),
 ('878964397103775744', 3),
 ('2419492255', 2),
 ('1960909934', 2),
 ('1202872538281467904', 2),
 ('123258168', 2),
 ('36973702', 2),
 ('18119691', 2),
 ('1078912669', 2),
 ('3049086521', 2),
 ('433223957', 2),
 ('259787794', 2),
 ('973940217508958208', 2),
 ('1375959707689615364', 2),
 ('1500031172491026432', 2),
 ('1534550535596023808', 2),
 ('4185773171', 2),
 ('1361415758', 2),
 ('898620970012991488', 2),
 ('1118474756590796800', 2),
 ('1485182559306891265', 2),
 ('257134702', 1),
 ('1256684331386515457', 1),
 ('15929362', 1),
 ('1209154096785174528', 1),
 ('826575192738656257', 1),
 ('1463295105108885506', 1),
 ('1483419150739517440', 1),
 ('390273160', 1),
 ('718066051280465920', 1),
 ('

In [388]:
hashtag_dict

[('pforzheim', 58),
 ('news', 22),
 ('blaulicht', 22),
 ('badenw√ºrttemberg', 7),
 ('pforzheimgram', 6),
 ('u5', 5),
 ('hypromag', 5),
 ('rareearths', 5),
 ('a8', 4),
 ('freiheit', 4),
 ('karlsruhe', 3),
 ('schwarzwald', 3),
 ('job', 3),
 ('cemdogan', 2),
 ('diamonds', 2),
 ('jewelry', 2),
 ('paulinakurka', 2),
 ('phillipdeml', 2),
 ('startups', 2),
 ('training', 2),
 ('schule', 2),
 ('kampfkunst', 2),
 ('stellenangebot', 2),
 ('feuerwehr', 2),
 ('magdeburg', 2),
 ('stanztechnik', 2),
 ('uffbasse', 2),
 ('zeugengesucht', 2),
 ('wiesenbach', 2),
 ('enzkreis', 2),
 ('trending', 2),
 ('duitsland', 2),
 ('nagold', 2),
 ('dbregio_bw', 2),
 ('karrieremodus', 2),
 ('stuttgart', 2),
 ('polizeipforzheim', 1),
 ('exhbitionistpforzheim', 1),
 ('entertainment', 1),
 ('sports', 1),
 ('school', 1),
 ('cnc', 1),
 ('einrichter', 1),
 ('√§', 1),
 ('resilienz', 1),
 ('unternehmen', 1),
 ('ressourcen', 1),
 ('ihk', 1),
 ('weingarte', 1),
 ('weingarten', 1),
 ('usa', 1),
 ('tuerkei', 1),
 ('tuerkiye', 1),

# Topic analysis - get the 5 most prominent topics

### Tokenize, stopword filtering and lemmatize

In [389]:
from DataAnalyticsIU.helper_functions import LemmaCountVectorizer
from nltk.corpus import stopwords

content_text_list = list(pd_data.content.values)

# german_stop_words = frozenset(custom_stopwords)
german_stop_words = stopwords.words('german')
german_stop_words = german_stop_words + custom_stopwords

# Min_df is chosen to exclude words with very low frequency
tf_vectorizer = LemmaCountVectorizer(min_df=2,
                                     stop_words=german_stop_words,
                                     decode_error='ignore')
tf = tf_vectorizer.fit_transform(content_text_list)


Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['dat', 'inf', 'zb'] not in stop_words.



In [390]:
# Visualize word occurrences after preprocessing

feature_names = tf_vectorizer.get_feature_names()
count_vec = np.asarray(tf.sum(axis=0)).ravel()
zipped = list(zip(feature_names, count_vec))
x, y = (list(x) for x in zip(*sorted(zipped, key=lambda x: x[1], reverse=True)))
# Now I want to extract out on the top 15 and bottom 15 words
Y = np.concatenate([y[0:15], y[-16:-1]])
X = np.concatenate([x[0:15], x[-16:-1]])

# Plotting the Plot.ly plot for the Top 50 word frequencies
data = [go.Bar(
            x = x[0:50],
            y = y[0:50],
            marker= dict(colorscale='Jet',
                         color = y[0:50]
                        ),
            text='Word counts'
    )]

layout = go.Layout(
    title='Top 50 Word frequencies after Preprocessing'
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-bar')


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.



### Implement Latent Dirichlet Allocation

In [391]:
# 5 Topics as specified in the assignment
lda = LatentDirichletAllocation(n_components=5, max_iter=10,
                                learning_method = 'online',
                                learning_offset = 10,
                                random_state = 6432143)

In [392]:
lda.fit(tf)

In [393]:
from DataAnalyticsIU.helper_functions import print_top_words

n_top_words = 40
print("\nTopics in LDA model: ")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model: 

Topic #0:http pforzheim 2022 pforzheimgram uhr pforzheimer a8 07 badenw√ºrttemberg informationen ausgesucht url woche pznews zeitung u5 trending doxograf betonteilen br√ºcken freiheit panne passt 10 natenom region hs_pforzheim offlabelu5 u12schutz booster nummer job 18 impfen 16 boost 09 dienstag polizei 12

Topic #1:pforzheim http schwarzwald wandern westweg haft verantwortlichen abschiebeknast bw berlin verhindert schwert werd antira_netzwerk gelesen nw4bbqloam unterstmatt sch√∂n klausuraufsichten uneigentlich woche training kampfkunst sport schule hut fecht michael_karalus corona lebe ausreichend gl√ºck karlsruhe sucht lernen wartezeiten twittertreffen fragen entstehen kleinleise

Topic #2:fragen entstehen twittertreffen wartezeiten kleinleise stadt_pforzheim polizeipf earlytwix liga natenom fifa22 karrieremodus uffc j3adilwkef karlsruhe eutingen niefern hw_ba gastgeber fu√üball frau teamplay polizeilandesmeisterschaft fitness zust√§ndigkeitsbereich 45 grad b