# Data Preparation

Inhaltsverzeichnis

In [43]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [44]:
# 0. Load the dataset

from src import utils
import pandas as pd

# Location of the raw tweet dump
raw_pickle_path = '../data/raw/twitter_tweets_raw.pkl'

# load_pkl's result is indexed; only its first element becomes the working frame
loaded_objects = utils.load_pkl(path=raw_pickle_path)
df = loaded_objects[0]

## Data Cleaning

In [45]:
# 1. Drop every row whose 'rawContent' repeats that of an earlier row
df.drop_duplicates(subset=['rawContent'], inplace=True)

# Sanity check: report how many duplicated texts are still present, if any
leftover_duplicates = df['rawContent'].duplicated()
if leftover_duplicates.any():
    print(f"{len(df[leftover_duplicates])} Duplikate gefunden.")

In [46]:
# 2. Remove every post whose 'lang' value is not "en"

non_english_posts = df[df['lang'] != 'en']
df.drop(index=non_english_posts.index, inplace=True)

# Sanity check: print whatever non-English rows are still left
still_non_english = df[df['lang'] != 'en']
if len(still_non_english) > 0:
    print(still_non_english)

In [47]:
# 3. Normalise the 'date' column: parse to datetime, then strip the timezone

parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates.dt.tz_localize(None)

In [48]:
# 4. Drop irrelevant columns
# Keep by whitelist instead of dropping by blacklist: selecting the three
# wanted columns discards everything else regardless of which extra columns
# the raw dump happens to contain. An explicit drop list raises KeyError as
# soon as one of its entries is missing from the frame.
# .copy() gives an independent frame rather than a selection tied to the
# wide raw frame.
df = df[['date', 'rawContent', 'url']].copy()

In [49]:
# set_index replaces the current index with 'url'; reset_index then turns 'url'
# back into the leading column and leaves a fresh default index behind.
df = df.set_index('url').reset_index()

# Persist the cleaned frame for the preprocessing stage
intermediate_target = '../data/intermediate/twitter_tweets_intermediate.feather'
df.to_feather(intermediate_target)

---

## Preprocessing Pipeline

In [1]:
import pandas as pd
import contractions
import nltk
import string
import emoji
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources used below: the punkt tokenizer, the stopword
# lists and WordNet
for nltk_package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_package)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Read the intermediate dataset back in and preview its first four rows
intermediate_path = '../data/intermediate/twitter_tweets_intermediate.feather'
df = pd.read_feather(intermediate_path)
df.head(4)

Unnamed: 0,url,date,rawContent
0,https://twitter.com/YueDongCS/status/164159107...,2023-03-30 23:59:46,My condolences and sad that the #NLP and #AI c...
1,https://twitter.com/Mlearning_ai/status/164159...,2023-03-30 23:59:43,Hiring Now: The Top Jobs of the Future Fueled ...
2,https://twitter.com/HackerAran7/status/1641591...,2023-03-30 23:59:43,What’s the hack. #stem #science #stemeducation...
3,https://twitter.com/Stemble_/status/1641590942...,2023-03-30 23:59:14,"🚀 Mark your calendars, Apple enthusiasts! 🗓️\n..."


In [3]:
# URL Removal

# Matches "http" followed by every character up to the next whitespace
_URL_PATTERN = re.compile(r'http\S+')


def remove_urls(text):
    """Return ``text`` with every match of ``http\\S+`` deleted.

    Only the matched characters are removed; surrounding whitespace stays
    untouched.
    """
    return _URL_PATTERN.sub('', text)

# Strip URLs from the raw tweet text ('rawContent') into a new working column
df['_nonurl'] = df['rawContent'].map(remove_urls)

In [5]:
# Expand contractions (Contraction Mapping)

# Helper that expands contractions
def fix_contractions(text):
    """Pass ``text`` through ``contractions.fix`` and return the result."""
    expanded = contractions.fix(text)
    return expanded

# Expand contractions in the URL-free text, then retire the '_nonurl' column
df['_nonurl_fixedcontractions'] = df['_nonurl'].map(fix_contractions)
df.drop(columns=['_nonurl'], inplace=True)

In [6]:
# Tokenization

# The tokenizer is NLTK's word_tokenize
tokenize = nltk.tokenize.word_tokenize

# Tokenize every tweet in the frame, then retire the untokenized column
df['_nonurl_fixedcontractions_tokenized'] = df['_nonurl_fixedcontractions'].map(tokenize)
df.drop(columns=['_nonurl_fixedcontractions'], inplace=True)

In [7]:
# Lowercase

# Lower-cases every token of a token list
def lowercase(tokens):
    """Return a new list with the lower-cased form of each token in ``tokens``."""
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered

# Lower-case the token lists, then retire the mixed-case token column
df['_nonurl_fixedcontractions_tokenized_lowercase'] = df['_nonurl_fixedcontractions_tokenized'].map(lowercase)
df.drop(columns=['_nonurl_fixedcontractions_tokenized'], inplace=True)

In [8]:
# Punctuation Removal

# Characters that count as punctuation: ASCII punctuation plus the typographic
# apostrophe. The extra backtick/quote entries are redundant for per-character
# lookups but are kept so that any other code reading `punct` sees an
# unchanged value.
punct = string.punctuation + "’" + "``" +"`" + "''" +"'"


def remove_punct(tokens):
    """Drop every token that consists solely of punctuation characters.

    A plain ``token not in punct`` is a substring test against one long
    string, so multi-character punctuation tokens such as "..." or "--"
    are never found in it and slip through. Checking each character
    individually removes those tokens as well.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        list: The tokens that contain at least one non-punctuation
        character, in their original order. Empty strings are dropped.
    """
    return [token for token in tokens if not all(char in punct for char in token)]

# Filter punctuation tokens out of the token lists, then retire the previous column
df['_nonurl_fixedcontractions_tokenized_lowercase_nonpunctation'] = df['_nonurl_fixedcontractions_tokenized_lowercase'].map(remove_punct)
df.drop(columns=['_nonurl_fixedcontractions_tokenized_lowercase'], inplace=True)

In [9]:
# Stopword Removal

# English stop-word list from NLTK
stop_words = stopwords.words('english')

# Hashed snapshot of the list for membership tests: the filter below runs once
# per token of every tweet, and a list lookup scans the entire stop-word list
# each time. NOTE: this is built once, so later changes to `stop_words` are
# only picked up when this cell is re-run.
_stop_word_set = frozenset(stop_words)


def remove_stopwords(tokens):
    """Return the tokens that are not English stop words.

    Args:
        tokens: Iterable of string tokens. The comparison is exact, so
            tokens are matched as they are (no case folding happens here).

    Returns:
        list: The tokens not contained in the stop-word set, in their
        original order.
    """
    return [token for token in tokens if token not in _stop_word_set]

# Filter stop words out of the token lists, then retire the previous column
df['_nonurl_fixedcontractions_tokenized_lowercase_nonpunctation_nonstopwords'] = df['_nonurl_fixedcontractions_tokenized_lowercase_nonpunctation'].map(remove_stopwords)
df.drop(columns=['_nonurl_fixedcontractions_tokenized_lowercase_nonpunctation'], inplace=True)

In [10]:
# Emoji Removal

def remove_emoji(tokens):
    """Return the tokens in which no single character is a key of ``emoji.EMOJI_DATA``.

    A token with at least one such character is dropped as a whole,
    including any other characters it contains.
    """
    kept = []
    for token in tokens:
        has_emoji_char = False
        for char in token:
            if char in emoji.EMOJI_DATA:
                has_emoji_char = True
                break
        if not has_emoji_char:
            kept.append(token)
    return kept
    
# Filter emoji-bearing tokens out of the token lists, then retire the previous column
df['_nonurl_fixedcontractions_tokenized_lowercase_nonpunctation_nonstopwords_nonemoji'] = df['_nonurl_fixedcontractions_tokenized_lowercase_nonpunctation_nonstopwords'].map(remove_emoji)
df.drop(columns=['_nonurl_fixedcontractions_tokenized_lowercase_nonpunctation_nonstopwords'], inplace=True)

In [11]:
# Lemmatization

# One WordNet lemmatizer instance, reused for every token
lemmatizer = WordNetLemmatizer()


def lemmatize(tokens):
    """Return the ``lemmatizer.lemmatize`` result for each token, in order."""
    return list(map(lemmatizer.lemmatize, tokens))

# Lemmatize the token lists into the final 'preprocessed_text' column,
# then retire the last intermediate column
df['preprocessed_text'] = df['_nonurl_fixedcontractions_tokenized_lowercase_nonpunctation_nonstopwords_nonemoji'].map(lemmatize)
df.drop(columns=['_nonurl_fixedcontractions_tokenized_lowercase_nonpunctation_nonstopwords_nonemoji'], inplace=True)

In [12]:
# Export

# Write the processed frame twice: as Feather and as CSV (CSV without the index)
feather_target = '../data/processed/twitter_tweets_processed.feather'
csv_target = '../data/processed/twitter_tweets_processed.csv'

df.to_feather(feather_target)
df.to_csv(csv_target, index=False)

---