# Data Preparation

Inhaltsverzeichnis

In [43]:
# Load IPython's autoreload extension and switch it to mode 2, so imported
# modules (e.g. the project's `src` package) are reloaded before each cell runs.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [44]:
# 0. Load dataset

from src import utils
import pandas as pd

# Read the raw tweets via the project helper and keep the first element of what
# it returns as the working frame `df`.
# NOTE(review): load_pkl presumably unpickles the file — only load pickle files
# that come from a trusted source.
df = utils.load_pkl(path='../data/raw/twitter_tweets_raw.pkl')[0]

## Data Cleaning

In [45]:
# 1. Drop all rows whose text duplicates an earlier row (modifies df in place)
df.drop_duplicates(subset=['rawContent'], inplace=True)

# Verify success: print the number of remaining duplicates, if there are any
if df['rawContent'].duplicated().any():
    print(f"{len(df[df['rawContent'].duplicated()])} Duplikate gefunden.")

In [46]:
# 2. Drop all non-English posts

# Select every row whose 'lang' value is not "en", then remove those rows from
# df in place by their index labels.
# NOTE(review): dropping by label assumes the index labels are unique; with
# repeated labels, English rows sharing a label would be dropped too — confirm.
non_english_posts = df.query('lang != "en"')
df.drop(index=non_english_posts.index, inplace=True)

# Verify success: if any remaining row is not marked "en", print those rows
if not df['lang'].eq('en').all():
    print(df.query('lang != "en"'))

In [47]:
# 3. Update the date column

# Parse 'date' as datetimes and remove the timezone information via
# tz_localize(None), leaving timezone-naive timestamps.
# NOTE(review): presumably the raw timestamps are timezone-aware — confirm.
df['date'] = pd.to_datetime(df['date']).dt.tz_localize(None)

In [48]:
# 4. Drop irrelevant columns
#
# Only 'date', 'rawContent' and 'url' are kept. Selecting the wanted columns is
# sufficient on its own: it discards every other column regardless of its name,
# so no hand-maintained list of columns to drop is needed (such a list raises a
# KeyError as soon as one of the listed columns is missing from the raw data).
# .copy() makes the result an independent frame, detached from the wider frame
# it was selected from.
df = df[['date', 'rawContent', 'url']].copy()

In [49]:
# Setting 'url' as the index and resetting it again moves 'url' to the first
# column and replaces the previous row labels with a fresh default integer
# index (the previous index values are discarded by set_index).
df.set_index('url', inplace=True)
df.reset_index(inplace=True)
# Write the cleaned intermediate dataset to disk.
# NOTE(review): to_feather presumably requires the default integer index
# created above — confirm against the pandas documentation.
df.to_feather('../data/intermediate/twitter_tweets_intermediate.feather')

---

## Preprocessing Pipeline

In [50]:
from src import utils
import pandas as pd

# Load the intermediate dataset written at the end of the data-cleaning section
df = pd.read_feather('../data/intermediate/twitter_tweets_intermediate.feather')
# Display the first four rows as a quick visual check
df.head(4)

Unnamed: 0,url,date,rawContent
0,https://twitter.com/YueDongCS/status/164159107...,2023-03-30 23:59:46,My condolences and sad that the #NLP and #AI c...
1,https://twitter.com/Mlearning_ai/status/164159...,2023-03-30 23:59:43,Hiring Now: The Top Jobs of the Future Fueled ...
2,https://twitter.com/HackerAran7/status/1641591...,2023-03-30 23:59:43,What’s the hack. #stem #science #stemeducation...
3,https://twitter.com/Stemble_/status/1641590942...,2023-03-30 23:59:14,"🚀 Mark your calendars, Apple enthusiasts! 🗓️\n..."


In [51]:
# Remove URLs

import pandas as pd
import re

# Helper for stripping URLs from a text
def remove_urls(text):
    """Return ``text`` with every http(s) URL removed.

    A URL is matched from its ``http://`` or ``https://`` scheme up to the next
    whitespace character. Requiring the full scheme keeps ordinary words that
    merely start with "http" (e.g. "httpd") intact.

    Args:
        text: String to clean.

    Returns:
        The string with all matched URLs replaced by the empty string; the
        whitespace around a removed URL is left untouched.
    """
    return re.sub(r'https?://\S+', '', text)

# Apply URL removal to every entry of the 'rawContent' column
df['rawContent'] = df['rawContent'].apply(remove_urls)

In [52]:
# Expand contractions (Contraction Mapping)

import pandas as pd
import contractions

# Helper that expands the contractions of a single text
def fix_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

# Run the helper over every entry of the 'rawContent' column
df['rawContent'] = df['rawContent'].apply(fix_contractions)

In [53]:
# Tokenization

import pandas as pd
import nltk

# Download the 'punkt' tokenizer data via NLTK
nltk.download('punkt')

# Define the tokenizer function (an alias for NLTK's word_tokenize)
tokenize = nltk.tokenize.word_tokenize

# Tokenize every tweet in the DataFrame: each 'rawContent' entry is replaced by
# the result of word_tokenize for that text
df['rawContent'] = df['rawContent'].apply(tokenize)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [54]:
# Lowercase

import pandas as pd

# Helper that converts all tokens of one tweet to lower case
def lowercase(tokens):
    """Return a new list holding the lower-cased form of each token.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        List with ``token.lower()`` for every token, in the original order.
    """
    lowered = []
    for word in tokens:
        lowered.append(word.lower())
    return lowered

# Apply the lowercase transformation to the token lists stored in 'rawContent'
df['rawContent'] = df['rawContent'].apply(lowercase)

In [55]:
# Punctuation Removal

import pandas as pd
import string

# Punctuation characters: ASCII punctuation plus the typographic apostrophe
# (U+2019, written as an escape so it does not depend on the file encoding)
# and the backtick / quote tokens.
punct = string.punctuation + "\u2019" + "``" + "`" + "''" + "'"

# The individual punctuation characters as a set. Testing `token in punct`
# against the string would be a substring test, which lets multi-character
# punctuation tokens such as "..." or "!!" slip through.
_punct_chars = frozenset(punct)

# Helper for removing punctuation tokens
def remove_punct(tokens):
    """Drop every token that consists solely of punctuation characters.

    Tokens that contain at least one non-punctuation character (e.g. "n't",
    "e-mail") are kept unchanged. Empty tokens are dropped as well.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        List of the remaining tokens, in the original order.
    """
    return [token for token in tokens
            if not all(char in _punct_chars for char in token)]

# Apply punctuation removal to the token lists stored in 'rawContent'
df['rawContent'] = df['rawContent'].apply(remove_punct)

In [56]:
# Stopword Removal

import pandas as pd
import nltk
from nltk.corpus import stopwords

# Download the stopword corpus via NLTK
nltk.download('stopwords')

# List of English stopwords
stop_words = stopwords.words('english')

# Hash-based copy of the list, so the membership test inside remove_stopwords
# does not have to scan the whole list for every single token
_stop_word_set = frozenset(stop_words)

# Helper for removing stopwords
def remove_stopwords(tokens):
    """Return the tokens that are not contained in the English stopword list.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        List of the remaining tokens, in the original order.
    """
    return [token for token in tokens if token not in _stop_word_set]

# Apply stopword removal to the token lists stored in 'rawContent'
df['rawContent'] = df['rawContent'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [57]:
# Emoji Removal
import pandas as pd
import emoji

def remove_emoji(tokens):
    """Return the tokens that contain no character listed in ``emoji.EMOJI_DATA``.

    A token is dropped as a whole as soon as one of its characters is a key of
    ``emoji.EMOJI_DATA``; all other tokens are kept in their original order.
    """
    kept = []
    for word in tokens:
        has_emoji = False
        for symbol in word:
            if symbol in emoji.EMOJI_DATA:
                has_emoji = True
                break
        if not has_emoji:
            kept.append(word)
    return kept

# Apply emoji removal to the token lists stored in 'rawContent'
df['rawContent'] = df['rawContent'].apply(remove_emoji)

In [58]:
# Lemmatization

import nltk
from nltk.stem import WordNetLemmatizer

# Download the WordNet data via NLTK
nltk.download('wordnet')

# Lemmatizer instance shared by all calls of lemmatize()
lemmatizer = WordNetLemmatizer()

def lemmatize(tokens):
    """Return a list with the WordNet lemma of every token.

    Each token is passed to ``lemmatizer.lemmatize`` without a part-of-speech
    argument, so the lemmatizer's default part of speech applies.
    """
    lemma_of = lemmatizer.lemmatize
    return list(map(lemma_of, tokens))

# Apply lemmatization to the token lists stored in 'rawContent'
df['rawContent'] = df['rawContent'].apply(lemmatize)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [59]:
# Export the processed dataset in two formats

# Feather copy of the processed frame
df.to_feather('../data/processed/twitter_tweets_processed.feather')
# CSV copy, written without the index column.
# NOTE(review): 'rawContent' holds token lists at this point, so the CSV
# presumably stores their string representation — confirm that downstream
# readers parse it accordingly.
df.to_csv('../data/processed/twitter_tweets_processed.csv', index=False)

---