# Data Preparation

Inhaltsverzeichnis

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all modules before each execution,
# so edits to the project's own modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [10]:
# 0. Lade Datensatz

from src import utils
import pandas as pd

# Load the collected posts from the raw pickle; only the first element of the
# loaded object is used.
# NOTE(review): utils.load_pkl presumably unpickles the file - only load pickles from a trusted source.
list_of_tweets = utils.load_pkl(path='../data/raw/twitter_tweets_raw.pkl')[0]

# Build a DataFrame from the loaded posts (presumably one row per post - confirm).
df = pd.DataFrame(list_of_tweets)

## Data Cleaning

In [11]:
# 1. Drop every post whose text repeats an earlier one (the first occurrence is kept)
df.drop_duplicates(subset='rawContent', keep='first', inplace=True)

# Sanity check: report how many duplicated texts are still present, if any
if df.duplicated(subset='rawContent').any():
    print(f"{df.duplicated(subset='rawContent').sum()} Duplikate gefunden.")

In [12]:
# 2. Drop every post that is not tagged as English

non_english_posts = df.loc[df['lang'] != 'en']
df.drop(index=non_english_posts.index, inplace=True)

# Sanity check: show any rows that still carry a non-English language tag
if df['lang'].ne('en').any():
    print(df.loc[df['lang'] != 'en'])

In [13]:
# 3. Normalise the date column: parse it as datetime and strip the timezone
# information (tz_localize(None) drops the tz and keeps the wall-clock time).
# NOTE(review): presumably the incoming timestamps are tz-aware (UTC) - confirm.

df['date'] = pd.to_datetime(df['date']).dt.tz_localize(None)

In [14]:
# 4. Keep only the columns needed downstream.
# Selecting the wanted columns replaces the former hard-coded drop list: the
# resulting frame is the same, but the cell no longer raises a KeyError when it
# is re-run or when the scraped data lacks one of the previously listed columns.
# .copy() makes the result an independent frame, so later column assignments
# cannot trigger pandas' SettingWithCopyWarning.
df = df[['date', 'rawContent', 'url']].copy()

In [7]:
# Persist the cleaned frame as an intermediate pickle for the preprocessing stage.
# NOTE(review): presumably written to ../data/intermediate/twitter_tweets_intermediate.pkl - confirm against utils.safe_as_pkl.
utils.safe_as_pkl(df, filename='twitter_tweets_intermediate', path='../data/intermediate')

---

## Preprocessing Pipeline

In [2]:
from src import utils
import pandas as pd

# Load the intermediate pickle; only the first element of the loaded object is used.
# NOTE(review): the name list_of_tweets suggests a list, but the object saved by the
# cleaning stage was a DataFrame - confirm what load_pkl(...)[0] actually returns.
list_of_tweets = utils.load_pkl(path='../data/intermediate/twitter_tweets_intermediate.pkl')[0]

# Wrap the loaded data in a DataFrame and preview the first four rows
df = pd.DataFrame(list_of_tweets)
df.head(4)

Unnamed: 0,date,rawContent,url
0,2023-03-30 23:59:46,My condolences and sad that the #NLP and #AI c...,https://twitter.com/YueDongCS/status/164159107...
1,2023-03-30 23:59:43,Hiring Now: The Top Jobs of the Future Fueled ...,https://twitter.com/Mlearning_ai/status/164159...
2,2023-03-30 23:59:43,What‚Äôs the hack. #stem #science #stemeducation...,https://twitter.com/HackerAran7/status/1641591...
3,2023-03-30 23:59:14,"üöÄ Mark your calendars, Apple enthusiasts! üóìÔ∏è\n...",https://twitter.com/Stemble_/status/1641590942...


In [3]:
# Löschen von URLs

import pandas as pd
import re

# Helper that strips URLs from a text
def remove_urls(text):
    """Return ``text`` with every run of non-whitespace characters that starts with ``http`` removed."""
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

# Apply the URL removal to every entry of the 'rawContent' column
df['rawContent'] = df['rawContent'].apply(remove_urls)

In [4]:
# Expand contractions (Contraction Mapping)

import pandas as pd
import contractions

# Helper that expands contractions in a text
def fix_contractions(text):
    """Return ``text`` as rewritten by ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

# Run the helper over every entry of the 'rawContent' column
df['rawContent'] = df['rawContent'].apply(fix_contractions)

In [5]:
# Tokenization

import pandas as pd
import nltk

# Download the 'punkt' tokenizer data (NLTK reports it as up to date when already present)
nltk.download('punkt')

# Use NLTK's word_tokenize as the tokenizer function
tokenize = nltk.tokenize.word_tokenize

# Tokenize every tweet; afterwards 'rawContent' holds a list of tokens per row
df['rawContent'] = df['rawContent'].apply(tokenize)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Lowercase

import pandas as pd

# Helper that lower-cases every token of a token list
def lowercase(tokens):
    """Return a new list holding the lower-cased form of each token in ``tokens``."""
    lowered = []
    for word in tokens:
        lowered.append(word.lower())
    return lowered

# Apply the lowercase transformation to the token lists in the 'rawContent' column
df['rawContent'] = df['rawContent'].apply(lowercase)

In [7]:
# Punctuation Removal

import pandas as pd
import string

# Punctuation to strip: ASCII punctuation, the typographic apostrophe (U+2019)
# and the quote tokens produced by the tokenizer (`` and '').
# The apostrophe is written as an escape so the literal survives re-encoding of the notebook.
punct = string.punctuation + "\u2019" + "``" + "`" + "''" + "'"

# Individual punctuation characters. Testing `token in punct` directly would be
# a substring test against one long string, which lets multi-character
# punctuation tokens such as "..." or "--" slip through.
_PUNCT_CHARS = frozenset(punct)


# Helper that removes punctuation tokens from a token list
def remove_punct(tokens):
    """Return the tokens of ``tokens`` that are not pure punctuation.

    A token is dropped when every one of its characters is a punctuation
    character (this includes the empty token). Tokens that merely contain
    punctuation, such as "don't" or "u.s.", are kept unchanged.
    """
    return [
        token
        for token in tokens
        if not all(char in _PUNCT_CHARS for char in token)
    ]

# Apply the punctuation removal to the token lists in the 'rawContent' column
df['rawContent'] = df['rawContent'].apply(remove_punct)

In [8]:
# Stopword Removal

import pandas as pd
import nltk
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus
nltk.download('stopwords')

# English stop words as provided by NLTK (kept as a list under its original name)
stop_words = stopwords.words('english')

# Set snapshot of the stop words: a membership test against the list scans it
# for every single token, a set lookup does not. Rebuild this set if
# stop_words is modified afterwards.
_stop_word_set = frozenset(stop_words)


# Helper that removes stop words from a token list
def remove_stopwords(tokens):
    """Return the tokens of ``tokens`` that are not English stop words.

    The comparison is exact and case-sensitive, so tokens are expected to be
    lower-cased already.
    """
    return [token for token in tokens if token not in _stop_word_set]


# Apply the stop-word removal to the token lists in the 'rawContent' column
df['rawContent'] = df['rawContent'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Emoji Removal
import pandas as pd
import emoji

def remove_emoji(tokens):
    """Return the tokens of ``tokens`` that contain no character listed in ``emoji.EMOJI_DATA``.

    A token is dropped as a whole as soon as one of its characters is an emoji.
    """
    def _has_emoji(token):
        # True as soon as a single character of the token is a known emoji
        for char in token:
            if char in emoji.EMOJI_DATA:
                return True
        return False

    return [token for token in tokens if not _has_emoji(token)]

# Apply the emoji removal to the token lists in the 'rawContent' column
df['rawContent'] = df['rawContent'].apply(remove_emoji)

In [27]:
# Lemmatization

import nltk
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet data
nltk.download('wordnet')

# Create the lemmatizer instance shared by all calls below
lemmatizer = WordNetLemmatizer()

def lemmatize(tokens):
    """Return a list with ``lemmatizer.lemmatize`` applied to each token of ``tokens``."""
    lemmas = []
    for word in tokens:
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

# Replace each token list in 'rawContent' with its lemmatized version
df['rawContent'] = df['rawContent'].apply(lemmatize)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [30]:
# Preview the first four rows after the preprocessing steps above
df.head(4)

Unnamed: 0,date,rawContent,url
0,2023-03-30 23:59:46,"[condolence, sad, nlp, ai, community, lose, br...",https://twitter.com/YueDongCS/status/164159107...
1,2023-03-30 23:59:43,"[hiring, top, job, future, fueled, generative,...",https://twitter.com/Mlearning_ai/status/164159...
2,2023-03-30 23:59:43,"[hack, stem, science, stemeducation, education...",https://twitter.com/HackerAran7/status/1641591...
3,2023-03-30 23:59:14,"[mark, calendar, apple, enthusiast, june, 5, a...",https://twitter.com/Stemble_/status/1641590942...


---