In [21]:
import pandas as pd
import numpy as np

In [10]:
def print_distribution(df, col):
    """Print the class balance of a binary label column.

    Counts the rows where ``df[col]`` equals 1 (positive) and 0 (negative)
    and prints both counts followed by the negative-to-positive ratio.

    Args:
        df: DataFrame holding the label column.
        col: Name of the label column.

    Returns:
        None. The three summary lines are written to stdout.
    """
    positive = np.count_nonzero(df[col] == 1)
    negative = np.count_nonzero(df[col] == 0)
    print("- Positive:", positive)
    print("- Negative:", negative)
    # Without any positive row the ratio is undefined; report that instead of
    # failing on a division by zero.
    ratio = negative / positive if positive else "n/a"
    print("- Verhältnis:", ratio)

### Erstellung neuer Train und Testdatensatz

In [11]:
# Load the original labelled dataset; the first CSV column (shown as `id` in
# the output) is used as the index.
df_train_old = pd.read_csv("twitter_hate-speech-alt/train.csv", index_col=0)
# Preview the first rows.
df_train_old.head()

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,@user when a father is dysfunctional and is s...
2,0,@user @user thanks for #lyft credit i can't us...
3,0,bihday your majesty
4,0,#model i love u take with u all the time in ...
5,0,factsguide: society now #motivation


In [12]:
# Class balance of the full labelled dataset (label 1 vs. label 0).
print_distribution(df_train_old, "label")

- Positive: 2242
- Negative: 29720
- Verhältnis: 13.256021409455842


In [13]:
# Draw a random 70% of the rows as the new training split; the fixed
# random_state keeps the split reproducible. The sample is not stratified
# by label, so the class ratio is checked right after.
train = df_train_old.sample(frac=0.7, random_state=200)
print_distribution(train, "label")

- Positive: 1574
- Negative: 20799
- Verhältnis: 13.214104193138501


In [17]:
# Preview the new training split.
train.head()

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
8886,0,@user #cinemaaawards final rehearsals!! geari...
909,0,istg this is the best cheese ta but dayum expe...
27613,0,this was amazing. the weather was not. #musica...
15999,0,yes! #talented #sexy âcriminal mindsâ ca...
23817,0,want to be while being #successful? see how ...


In [14]:
# The remaining rows form the test split: drop every index label that was
# sampled into `train`.
# NOTE(review): relies on the `id` index being unique — confirm.
test = df_train_old.drop(train.index)
print_distribution(test, "label")

- Positive: 668
- Negative: 8921
- Verhältnis: 13.354790419161677


In [18]:
# Preview the new test split.
test.head()

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
4,0,#model i love u take with u all the time in ...
5,0,factsguide: society now #motivation
6,0,[2/2] huge fan fare and big talking before the...
16,0,ouch...junior is angryð#got7 #junior #yugyo...
18,1,retweet if you agree!


In [16]:
# Persist both splits. Note that the target directory ("twitter_hate-speech")
# differs from the source directory ("twitter_hate-speech-alt").
train.to_csv("twitter_hate-speech/train.csv")
test.to_csv("twitter_hate-speech/test.csv")

### Bereinigung und Vorverarbeitung

In [49]:
from ftfy import fix_encoding

from src.functions.clean_data_generic_functions import to_lowercase, expand_shortcuts, handle_userhandles, \
    handle_hashtags, extract_emojis, replace_emojis, replace_text_smileys, remove_url_from_tweet, remove_punctuation, \
    remove_special_characters, remove_digits, remove_word_from_column, lemmatize, remove_stop_words, \
    remove_most_frequent_words, remove_least_frequent_words, remove_duplicates, remove_na_from_column

#### Bereinigung und Vorverarbeitung: TRAIN

##### Geeignet für: Maschinelles Lernen
Es werden alle implementierten Cleaning- und Preprocessing-Schritte ausgeführt, AUSSER: Auflösen von Negationen (nicht implementiert), Entfernen von Emojis.

In [64]:
# Reload the training split from disk and work on a copy so the loaded frame
# stays untouched.
df_origin_train = pd.read_csv('./twitter_hate-speech/train.csv', index_col=0)
df_clean_base_train = df_origin_train.copy()

In [65]:
# Remove rows that are exact duplicates in all columns (the index is not compared).
df_clean_base_train.drop_duplicates(inplace=True)
# Repair mis-encoded text (mojibake) in every tweet with ftfy's fix_encoding.
df_clean_base_train["tweet"] = df_clean_base_train['tweet'].apply(fix_encoding)

In [66]:
def clean_dataframe_train(base_df):
    """Run the complete cleaning pipeline for the classical-ML training data.

    Works on a copy of ``base_df``. The text being cleaned lives in a new
    ``tweet_cleaned`` column that starts as a copy of ``tweet``. Every helper
    (imported from ``src.functions.clean_data_generic_functions``) receives
    the current frame and returns the frame handed to the next step. Progress
    is printed as ``Cleaning Step <i>/<total>: <label>``.

    Args:
        base_df: DataFrame with a ``tweet`` column; it is not modified.

    Returns:
        The DataFrame returned by the last executed step.
    """
    column = 'tweet_cleaned'

    # Ordered pipeline as (progress label, action). An action of ``None`` marks
    # a step that is only logged, so the numbering stays aligned with the other
    # pipeline variants. The total is derived from this table, not hard-coded.
    steps = [
        ("to_lowercase", lambda df: to_lowercase(df, column)),
        ("expand_shortcuts", lambda df: expand_shortcuts(df, column)),
        ("remove_negations - SKIP", None),  # no implementation available
        ("handle_userhandles", lambda df: handle_userhandles(df, column)),
        ("handle_hashtags", lambda df: handle_hashtags(df, column)),
        ("extract_emojis", lambda df: extract_emojis(df, column)),
        ("replace_emojis", lambda df: replace_emojis(df, column)),
        ("replace_smileys", lambda df: replace_text_smileys(df, column)),
        ("remove_emojis - SKIP", None),
        ("remove_url_from_tweet", lambda df: remove_url_from_tweet(df, column)),
        ("remove_punctuation", lambda df: remove_punctuation(df, column)),
        ("remove_special_characters", lambda df: remove_special_characters(df, column)),
        ("remove_digits", lambda df: remove_digits(df, column)),
        ("remove_word_from_column: amp",
         lambda df: remove_word_from_column(df=df, column_name=column, word="amp")),
        ("lemmatize", lambda df: lemmatize(df, column)),
        ("remove_stop_words", lambda df: remove_stop_words(df, column)),
        ("remove_most_frequent_words", lambda df: remove_most_frequent_words(df, column)),
        ("remove_least_frequent_words", lambda df: remove_least_frequent_words(df, column)),
        ("remove_duplicates", lambda df: remove_duplicates(df, column)),
        ("remove_nans", lambda df: remove_na_from_column(df=df, column_name=column)),
    ]
    total = len(steps)

    df_cleaned = base_df.copy()
    df_cleaned[column] = df_cleaned['tweet']

    print("Start Cleaning")
    for number, (label, action) in enumerate(steps, start=1):
        print(f"Cleaning Step {number}/{total}: {label}")
        if action is not None:
            df_cleaned = action(df_cleaned)
    print("All Cleaning done")

    return df_cleaned

In [68]:
# Run the ML cleaning pipeline on the de-duplicated, encoding-fixed training frame.
df_cleaned_train = clean_dataframe_train(df_clean_base_train)

Start Cleaning
Cleaning Step 1/20: to_lowercase
Cleaning Step 2/20: expand_shortcuts
Cleaning Step 3/20: remove_negations - SKIP
Cleaning Step 4/20: handle_userhandles
Cleaning Step 5/20: handle_hashtags
Cleaning Step 6/20: extract_emojis
Cleaning Step 7/20: replace_emojis
Cleaning Step 8/20: replace_smileys
Cleaning Step 9/20: remove_emojis - SKIP
Cleaning Step 10/20: remove_url_from_tweet
Cleaning Step 11/20: remove_punctuation
Cleaning Step 12/20: remove_special_characters
Cleaning Step 13/20: remove_digis
Cleaning Step 14/20: remove_word_from_column: amp
Cleaning Step 15/20: lemmatize
Cleaning Step 16/20: remove_stop_words
Cleaning Step 17/20: remove_most_frequent_words
Cleaning Step 18/20: remove_least_frequent_words
Cleaning Step 19/20: remove_duplicates
Cleaning Step 20/20: remove_nans
All Cleaning done


In [69]:
# Preview the cleaned training data (original tweet next to `tweet_cleaned`).
df_cleaned_train.head()

Unnamed: 0_level_0,label,tweet,tweet_cleaned,user_handle,hashtags,emojis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
8886,0,@user #cinemaaawards final rehearsals!! gearing up for the evening!! #butterflies #stage ! hope u all like it,final rehearsal gear evening butterfly stage hope like,1,"[#cinemaaawards, #butterflies, #stage]",
909,0,istg this is the best cheese ta but dayum expensive,good cheese ta expensive,0,[],
27613,0,this was amazing. the weather was not. #musical #london #matilda #westend #weekend …,amazing weather musical london weekend,0,"[#musical, #london, #matilda, #westend, #weekend]",
15999,0,yes! #talented #sexy ‘criminal minds’ casts @user as series regular for season 12 via @user,yes talented sexy ' criminal mind ' cast series regular season via,2,"[#talented, #sexy]",
23817,0,want to be while being #successful? see how #worklifebalance helps in by @user @user,want successful see help,2,"[#successful, #worklifebalance]",


In [70]:
# Persist the cleaned training data for the classical-ML models.
df_cleaned_train.to_csv('./twitter_hate-speech/train_cleaned.csv')

##### Geeignet für: Deep Learning (RNN)
Es werden nicht alle implementierten Cleaning- und Preprocessing-Schritte ausgeführt.

NICHT: Auflösen Negationen (nicht implementiert), Emojis Entfernen, Lemmatisierung, Stopwords entfernen, Most/Least frequent words entfernen

In [71]:
# Reload the training split for the RNN variant and work on a copy so the
# loaded frame stays untouched.
df_origin_train_rnn = pd.read_csv('./twitter_hate-speech/train.csv', index_col=0)
df_clean_base_train_rnn = df_origin_train_rnn.copy()

In [72]:
# Remove rows that are exact duplicates in all columns (the index is not compared).
df_clean_base_train_rnn.drop_duplicates(inplace=True)
# Repair mis-encoded text (mojibake) in every tweet with ftfy's fix_encoding.
df_clean_base_train_rnn["tweet"] = df_clean_base_train_rnn['tweet'].apply(fix_encoding)

In [73]:
def clean_dataframe_train_rnn(base_df):
    """Run the reduced cleaning pipeline for the RNN training data.

    Same structure as the full pipeline, but lemmatisation, stop-word removal
    and most/least-frequent-word removal are only logged as skipped. Works on
    a copy of ``base_df``; the text being cleaned lives in a new
    ``tweet_cleaned`` column that starts as a copy of ``tweet``. Every helper
    (imported from ``src.functions.clean_data_generic_functions``) receives
    the current frame and returns the frame handed to the next step. Progress
    is printed as ``Cleaning Step <i>/<total>: <label>``.

    Args:
        base_df: DataFrame with a ``tweet`` column; it is not modified.

    Returns:
        The DataFrame returned by the last executed step.
    """
    column = 'tweet_cleaned'

    # Ordered pipeline as (progress label, action). An action of ``None`` marks
    # a step that is only logged, so the numbering stays aligned with the other
    # pipeline variants. The total is derived from this table, not hard-coded.
    steps = [
        ("to_lowercase", lambda df: to_lowercase(df, column)),
        ("expand_shortcuts", lambda df: expand_shortcuts(df, column)),
        ("remove_negations - SKIP", None),  # no implementation available
        ("handle_userhandles", lambda df: handle_userhandles(df, column)),
        ("handle_hashtags", lambda df: handle_hashtags(df, column)),
        ("extract_emojis", lambda df: extract_emojis(df, column)),
        ("replace_emojis", lambda df: replace_emojis(df, column)),
        ("replace_smileys", lambda df: replace_text_smileys(df, column)),
        ("remove_emojis - SKIP", None),
        ("remove_url_from_tweet", lambda df: remove_url_from_tweet(df, column)),
        ("remove_punctuation", lambda df: remove_punctuation(df, column)),
        ("remove_special_characters", lambda df: remove_special_characters(df, column)),
        ("remove_digits", lambda df: remove_digits(df, column)),
        ("remove_word_from_column: amp",
         lambda df: remove_word_from_column(df=df, column_name=column, word="amp")),
        ("lemmatize  - SKIP", None),
        ("remove_stop_words  - SKIP", None),
        ("remove_most_frequent_words  - SKIP", None),
        ("remove_least_frequent_words  - SKIP", None),
        ("remove_duplicates", lambda df: remove_duplicates(df, column)),
        ("remove_nans", lambda df: remove_na_from_column(df=df, column_name=column)),
    ]
    total = len(steps)

    df_cleaned = base_df.copy()
    df_cleaned[column] = df_cleaned['tweet']

    print("Start Cleaning")
    for number, (label, action) in enumerate(steps, start=1):
        print(f"Cleaning Step {number}/{total}: {label}")
        if action is not None:
            df_cleaned = action(df_cleaned)
    print("All Cleaning done")

    return df_cleaned

In [74]:
# Run the reduced (RNN) cleaning pipeline on the prepared training frame.
df_cleaned_train_rnn = clean_dataframe_train_rnn(df_clean_base_train_rnn)

Start Cleaning
Cleaning Step 1/20: to_lowercase
Cleaning Step 2/20: expand_shortcuts
Cleaning Step 3/20: remove_negations - SKIP
Cleaning Step 4/20: handle_userhandles
Cleaning Step 5/20: handle_hashtags
Cleaning Step 6/20: extract_emojis
Cleaning Step 7/20: replace_emojis
Cleaning Step 8/20: replace_smileys
Cleaning Step 9/20: remove_emojis - SKIP
Cleaning Step 10/20: remove_url_from_tweet
Cleaning Step 11/20: remove_punctuation
Cleaning Step 12/20: remove_special_characters
Cleaning Step 13/20: remove_digis
Cleaning Step 14/20: remove_word_from_column: amp
Cleaning Step 15/20: lemmatize  - SKIP
Cleaning Step 16/20: remove_stop_words  - SKIP
Cleaning Step 17/20: remove_most_frequent_words  - SKIP
Cleaning Step 18/20: remove_least_frequent_words  - SKIP
Cleaning Step 19/20: remove_duplicates
Cleaning Step 20/20: remove_nans
All Cleaning done


In [75]:
# Preview the RNN-cleaned training data.
df_cleaned_train_rnn.head()

Unnamed: 0_level_0,label,tweet,tweet_cleaned,user_handle,hashtags,emojis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
8886,0,@user #cinemaaawards final rehearsals!! gearing up for the evening!! #butterflies #stage ! hope u all like it,cinemaaawards final rehearsals gearing up for the evening butterflies stage hope You all like it,1,"[#cinemaaawards, #butterflies, #stage]",
909,0,istg this is the best cheese ta but dayum expensive,istg this is the best cheese ta but dayum expensive,0,[],
27613,0,this was amazing. the weather was not. #musical #london #matilda #westend #weekend …,this was amazing the weather was not musical london matilda westend weekend …,0,"[#musical, #london, #matilda, #westend, #weekend]",
15999,0,yes! #talented #sexy ‘criminal minds’ casts @user as series regular for season 12 via @user,yes talented sexy ‘criminal minds’ casts as series regular for season via,2,"[#talented, #sexy]",
23817,0,want to be while being #successful? see how #worklifebalance helps in by @user @user,want to be while being successful see how worklifebalance helps in by,2,"[#successful, #worklifebalance]",


In [76]:
# Persist the cleaned training data for the RNN models.
df_cleaned_train_rnn.to_csv('./twitter_hate-speech/train_cleaned_rnn.csv')

#### Bereinigung und Vorverarbeitung: TEST

##### Geeignet für: Maschinelles Lernen
Es werden alle implementierten Cleaning- und Preprocessing-Schritte ausgeführt, AUSSER denjenigen, bei denen Zeilen gelöscht werden (remove_duplicates wird übersprungen).

In [77]:
# Reload the test split from disk and work on a copy so the loaded frame
# stays untouched.
df_origin_test = pd.read_csv('./twitter_hate-speech/test.csv', index_col=0)
df_clean_base_test = df_origin_test.copy()

In [78]:
# Remove rows that are exact duplicates in all columns (the index is not compared).
# NOTE(review): this deletes test rows, although the section text says that
# row-deleting steps are left out for the test data — confirm this is intended.
df_clean_base_test.drop_duplicates(inplace=True)
# Repair mis-encoded text (mojibake) in every tweet with ftfy's fix_encoding.
df_clean_base_test["tweet"] = df_clean_base_test['tweet'].apply(fix_encoding)

In [79]:
def clean_dataframe_test(base_df):
    """Run the classical-ML cleaning pipeline for the test data.

    Same steps as the training pipeline, except that ``remove_duplicates`` is
    only logged as skipped. Works on a copy of ``base_df``; the text being
    cleaned lives in a new ``tweet_cleaned`` column that starts as a copy of
    ``tweet``. Every helper (imported from
    ``src.functions.clean_data_generic_functions``) receives the current frame
    and returns the frame handed to the next step. Progress is printed as
    ``Cleaning Step <i>/<total>: <label>``.

    Args:
        base_df: DataFrame with a ``tweet`` column; it is not modified.

    Returns:
        The DataFrame returned by the last executed step.
    """
    column = 'tweet_cleaned'

    # Ordered pipeline as (progress label, action). An action of ``None`` marks
    # a step that is only logged, so the numbering stays aligned with the other
    # pipeline variants. The total is derived from this table, not hard-coded.
    steps = [
        ("to_lowercase", lambda df: to_lowercase(df, column)),
        ("expand_shortcuts", lambda df: expand_shortcuts(df, column)),
        ("remove_negations - SKIP", None),  # no implementation available
        ("handle_userhandles", lambda df: handle_userhandles(df, column)),
        ("handle_hashtags", lambda df: handle_hashtags(df, column)),
        ("extract_emojis", lambda df: extract_emojis(df, column)),
        ("replace_emojis", lambda df: replace_emojis(df, column)),
        ("replace_smileys", lambda df: replace_text_smileys(df, column)),
        ("remove_emojis - SKIP", None),
        ("remove_url_from_tweet", lambda df: remove_url_from_tweet(df, column)),
        ("remove_punctuation", lambda df: remove_punctuation(df, column)),
        ("remove_special_characters", lambda df: remove_special_characters(df, column)),
        ("remove_digits", lambda df: remove_digits(df, column)),
        ("remove_word_from_column: amp",
         lambda df: remove_word_from_column(df=df, column_name=column, word="amp")),
        ("lemmatize", lambda df: lemmatize(df, column)),
        ("remove_stop_words", lambda df: remove_stop_words(df, column)),
        ("remove_most_frequent_words", lambda df: remove_most_frequent_words(df, column)),
        ("remove_least_frequent_words", lambda df: remove_least_frequent_words(df, column)),
        ("remove_duplicates - SKIP", None),
        ("remove_nans", lambda df: remove_na_from_column(df=df, column_name=column)),
    ]
    total = len(steps)

    df_cleaned = base_df.copy()
    df_cleaned[column] = df_cleaned['tweet']

    print("Start Cleaning")
    for number, (label, action) in enumerate(steps, start=1):
        print(f"Cleaning Step {number}/{total}: {label}")
        if action is not None:
            df_cleaned = action(df_cleaned)
    print("All Cleaning done")

    return df_cleaned

In [80]:
# Run the ML cleaning pipeline (without remove_duplicates) on the prepared test frame.
df_cleaned_test = clean_dataframe_test(df_clean_base_test)

Start Cleaning
Cleaning Step 1/20: to_lowercase
Cleaning Step 2/20: expand_shortcuts
Cleaning Step 3/20: remove_negations - SKIP
Cleaning Step 4/20: handle_userhandles
Cleaning Step 5/20: handle_hashtags
Cleaning Step 6/20: extract_emojis
Cleaning Step 7/20: replace_emojis
Cleaning Step 8/20: replace_smileys
Cleaning Step 9/20: remove_emojis - SKIP
Cleaning Step 10/20: remove_url_from_tweet
Cleaning Step 11/20: remove_punctuation
Cleaning Step 12/20: remove_special_characters
Cleaning Step 13/20: remove_digis
Cleaning Step 14/20: remove_word_from_column: amp
Cleaning Step 15/20: lemmatize
Cleaning Step 16/20: remove_stop_words
Cleaning Step 17/20: remove_most_frequent_words
Cleaning Step 18/20: remove_least_frequent_words
Cleaning Step 19/20: remove_duplicates - SKIP
Cleaning Step 20/20: remove_nans
All Cleaning done


In [81]:
# Preview the cleaned test data.
df_cleaned_test.head()

Unnamed: 0_level_0,label,tweet,tweet_cleaned,user_handle,hashtags,emojis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4,0,#model i love u take with u all the time in ur📱!!! 😙😎👄👅💦💦💦,model take time mobile phone kiss sunglass mouth tongue sweat droplet sweat droplet sweat droplet,0,[#model],":mobile_phone:,:kissing_face_with_smiling_eyes:,:smiling_face_with_sunglasses:,:mouth:,:tongue:,:sweat_droplets:,:sweat_droplets:,:sweat_droplets:"
5,0,factsguide: society now #motivation,factsguide society motivation,0,[#motivation],
6,0,[2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. #allshowandnogo,huge fan big leave chaos pay get,0,[#allshowandnogo],
16,0,ouch...junior is angry😐#got7 #junior #yugyoem #omg,angry get junior omg,0,"[#got7, #junior, #yugyoem, #omg]",:neutral_face:
18,1,retweet if you agree!,retweet agree,0,[],


In [82]:
# Persist the cleaned test data for the classical-ML models.
df_cleaned_test.to_csv('./twitter_hate-speech/test_cleaned.csv')

##### Geeignet für: Deep Learning (RNN)
Es werden nicht alle implementierten Cleaning- und Preprocessing-Schritte ausgeführt.

NICHT: Duplikate entfernen, Auflösen Negationen (nicht implementiert), Emojis Entfernen, Lemmatisierung, Stopwords entfernen, Most/Least frequent words entfernen

In [83]:
# Reload the test split for the RNN variant and work on a copy so the loaded
# frame stays untouched.
df_origin_test_rnn = pd.read_csv('./twitter_hate-speech/test.csv', index_col=0)
df_clean_base_test_rnn = df_origin_test_rnn.copy()

In [84]:
# Remove rows that are exact duplicates in all columns (the index is not compared).
# NOTE(review): the section text lists duplicate removal as NOT performed for
# this variant, yet rows are dropped here — confirm this is intended.
df_clean_base_test_rnn.drop_duplicates(inplace=True)
# Repair mis-encoded text (mojibake) in every tweet with ftfy's fix_encoding.
df_clean_base_test_rnn["tweet"] = df_clean_base_test_rnn['tweet'].apply(fix_encoding)

In [85]:
def clean_dataframe_test_rnn(base_df):
    """Run the reduced cleaning pipeline for the RNN test data.

    Lemmatisation, stop-word removal, most/least-frequent-word removal and
    ``remove_duplicates`` are only logged as skipped. Works on a copy of
    ``base_df``; the text being cleaned lives in a new ``tweet_cleaned``
    column that starts as a copy of ``tweet``. Every helper (imported from
    ``src.functions.clean_data_generic_functions``) receives the current frame
    and returns the frame handed to the next step. Progress is printed as
    ``Cleaning Step <i>/<total>: <label>``.

    Args:
        base_df: DataFrame with a ``tweet`` column; it is not modified.

    Returns:
        The DataFrame returned by the last executed step.
    """
    column = 'tweet_cleaned'

    # Ordered pipeline as (progress label, action). An action of ``None`` marks
    # a step that is only logged, so the numbering stays aligned with the other
    # pipeline variants. The total is derived from this table, not hard-coded.
    steps = [
        ("to_lowercase", lambda df: to_lowercase(df, column)),
        ("expand_shortcuts", lambda df: expand_shortcuts(df, column)),
        ("remove_negations - SKIP", None),  # no implementation available
        ("handle_userhandles", lambda df: handle_userhandles(df, column)),
        ("handle_hashtags", lambda df: handle_hashtags(df, column)),
        ("extract_emojis", lambda df: extract_emojis(df, column)),
        ("replace_emojis", lambda df: replace_emojis(df, column)),
        ("replace_smileys", lambda df: replace_text_smileys(df, column)),
        ("remove_emojis - SKIP", None),
        ("remove_url_from_tweet", lambda df: remove_url_from_tweet(df, column)),
        ("remove_punctuation", lambda df: remove_punctuation(df, column)),
        ("remove_special_characters", lambda df: remove_special_characters(df, column)),
        ("remove_digits", lambda df: remove_digits(df, column)),
        ("remove_word_from_column: amp",
         lambda df: remove_word_from_column(df=df, column_name=column, word="amp")),
        ("lemmatize - SKIP", None),
        ("remove_stop_words - SKIP", None),
        ("remove_most_frequent_words - SKIP", None),
        ("remove_least_frequent_words - SKIP", None),
        ("remove_duplicates  - SKIP", None),
        ("remove_nans", lambda df: remove_na_from_column(df=df, column_name=column)),
    ]
    total = len(steps)

    df_cleaned = base_df.copy()
    df_cleaned[column] = df_cleaned['tweet']

    print("Start Cleaning")
    for number, (label, action) in enumerate(steps, start=1):
        print(f"Cleaning Step {number}/{total}: {label}")
        if action is not None:
            df_cleaned = action(df_cleaned)
    print("All Cleaning done")

    return df_cleaned

In [86]:
# Run the reduced (RNN) cleaning pipeline on the prepared test frame.
df_cleaned_test_rnn = clean_dataframe_test_rnn(df_clean_base_test_rnn)

Start Cleaning
Cleaning Step 1/20: to_lowercase
Cleaning Step 2/20: expand_shortcuts
Cleaning Step 3/20: remove_negations - SKIP
Cleaning Step 4/20: handle_userhandles
Cleaning Step 5/20: handle_hashtags
Cleaning Step 6/20: extract_emojis
Cleaning Step 7/20: replace_emojis
Cleaning Step 8/20: replace_smileys
Cleaning Step 9/20: remove_emojis - SKIP
Cleaning Step 10/20: remove_url_from_tweet
Cleaning Step 11/20: remove_punctuation
Cleaning Step 12/20: remove_special_characters
Cleaning Step 13/20: remove_digis
Cleaning Step 14/20: remove_word_from_column: amp
Cleaning Step 15/20: lemmatize - SKIP
Cleaning Step 16/20: remove_stop_words - SKIP
Cleaning Step 17/20: remove_most_frequent_words - SKIP
Cleaning Step 18/20: remove_least_frequent_words - SKIP
Cleaning Step 19/20: remove_duplicates  - SKIP
Cleaning Step 20/20: remove_nans
All Cleaning done


In [87]:
# Preview the RNN-cleaned test data.
df_cleaned_test_rnn.head()

Unnamed: 0_level_0,label,tweet,tweet_cleaned,user_handle,hashtags,emojis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4,0,#model i love u take with u all the time in ur📱!!! 😙😎👄👅💦💦💦,model i love You take with You all the time in ur mobile phone kissing face with smiling eyes smiling face with sunglasses mouth tongue sweat droplets sweat droplets sweat droplets,0,[#model],":mobile_phone:,:kissing_face_with_smiling_eyes:,:smiling_face_with_sunglasses:,:mouth:,:tongue:,:sweat_droplets:,:sweat_droplets:,:sweat_droplets:"
5,0,factsguide: society now #motivation,factsguide society now motivation,0,[#motivation],
6,0,[2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. #allshowandnogo,huge fan fare and big talking before they leave chaos and pay disputes when they get there allshowandnogo,0,[#allshowandnogo],
16,0,ouch...junior is angry😐#got7 #junior #yugyoem #omg,ouchjunior is angry neutral face got junior yugyoem omg,0,"[#got7, #junior, #yugyoem, #omg]",:neutral_face:
18,1,retweet if you agree!,retweet if you agree,0,[],


In [88]:
# Persist the cleaned test data for the RNN models.
df_cleaned_test_rnn.to_csv('./twitter_hate-speech/test_cleaned_rnn.csv')

#### Bereinigung und Vorverarbeitung: PREDICT

##### Geeignet für: Maschinelles Lernen
Es werden alle implementierten Cleaning- und Preprocessing-Schritte ausgeführt, AUSSER denjenigen, bei denen Zeilen gelöscht werden (remove_duplicates wird übersprungen).

In [89]:
# Load the prediction data and work on a copy so the loaded frame stays untouched.
# predict.csv is not written by any cell visible in this part of the notebook,
# so it has to exist already.
df_origin_predict = pd.read_csv('./twitter_hate-speech/predict.csv', index_col=0)
df_clean_base_predict = df_origin_predict.copy()

In [90]:
# Remove rows that are exact duplicates in all columns (the index is not compared).
# NOTE(review): this deletes rows from the prediction set, although the section
# text says that row-deleting steps are left out — confirm this is intended.
df_clean_base_predict.drop_duplicates(inplace=True)
# Repair mis-encoded text (mojibake) in every tweet with ftfy's fix_encoding.
df_clean_base_predict["tweet"] = df_clean_base_predict['tweet'].apply(fix_encoding)

In [91]:
def clean_dataframe_predict(base_df):
    """Run the classical-ML cleaning pipeline for the prediction data.

    Same steps as the training pipeline, except that ``remove_duplicates`` is
    only logged as skipped. Works on a copy of ``base_df``; the text being
    cleaned lives in a new ``tweet_cleaned`` column that starts as a copy of
    ``tweet``. Every helper (imported from
    ``src.functions.clean_data_generic_functions``) receives the current frame
    and returns the frame handed to the next step. Progress is printed as
    ``Cleaning Step <i>/<total>: <label>``.

    Args:
        base_df: DataFrame with a ``tweet`` column; it is not modified.

    Returns:
        The DataFrame returned by the last executed step.
    """
    column = 'tweet_cleaned'

    # Ordered pipeline as (progress label, action). An action of ``None`` marks
    # a step that is only logged, so the numbering stays aligned with the other
    # pipeline variants. The total is derived from this table, not hard-coded.
    steps = [
        ("to_lowercase", lambda df: to_lowercase(df, column)),
        ("expand_shortcuts", lambda df: expand_shortcuts(df, column)),
        ("remove_negations - SKIP", None),  # no implementation available
        ("handle_userhandles", lambda df: handle_userhandles(df, column)),
        ("handle_hashtags", lambda df: handle_hashtags(df, column)),
        ("extract_emojis", lambda df: extract_emojis(df, column)),
        ("replace_emojis", lambda df: replace_emojis(df, column)),
        ("replace_smileys", lambda df: replace_text_smileys(df, column)),
        ("remove_emojis - SKIP", None),
        ("remove_url_from_tweet", lambda df: remove_url_from_tweet(df, column)),
        ("remove_punctuation", lambda df: remove_punctuation(df, column)),
        ("remove_special_characters", lambda df: remove_special_characters(df, column)),
        ("remove_digits", lambda df: remove_digits(df, column)),
        ("remove_word_from_column: amp",
         lambda df: remove_word_from_column(df=df, column_name=column, word="amp")),
        ("lemmatize", lambda df: lemmatize(df, column)),
        ("remove_stop_words", lambda df: remove_stop_words(df, column)),
        ("remove_most_frequent_words", lambda df: remove_most_frequent_words(df, column)),
        ("remove_least_frequent_words", lambda df: remove_least_frequent_words(df, column)),
        ("remove_duplicates - SKIP", None),
        ("remove_nans", lambda df: remove_na_from_column(df=df, column_name=column)),
    ]
    total = len(steps)

    df_cleaned = base_df.copy()
    df_cleaned[column] = df_cleaned['tweet']

    print("Start Cleaning")
    for number, (label, action) in enumerate(steps, start=1):
        print(f"Cleaning Step {number}/{total}: {label}")
        if action is not None:
            df_cleaned = action(df_cleaned)
    print("All Cleaning done")

    return df_cleaned

In [92]:
# Run the ML cleaning pipeline (without remove_duplicates) on the prepared prediction frame.
df_cleaned_predict = clean_dataframe_predict(df_clean_base_predict)

Start Cleaning
Cleaning Step 1/20: to_lowercase
Cleaning Step 2/20: expand_shortcuts
Cleaning Step 3/20: remove_negations - SKIP
Cleaning Step 4/20: handle_userhandles
Cleaning Step 5/20: handle_hashtags
Cleaning Step 6/20: extract_emojis
Cleaning Step 7/20: replace_emojis
Cleaning Step 8/20: replace_smileys
Cleaning Step 9/20: remove_emojis - SKIP
Cleaning Step 10/20: remove_url_from_tweet
Cleaning Step 11/20: remove_punctuation
Cleaning Step 12/20: remove_special_characters
Cleaning Step 13/20: remove_digis
Cleaning Step 14/20: remove_word_from_column: amp
Cleaning Step 15/20: lemmatize
Cleaning Step 16/20: remove_stop_words
Cleaning Step 17/20: remove_most_frequent_words
Cleaning Step 18/20: remove_least_frequent_words
Cleaning Step 19/20: remove_duplicates - SKIP
Cleaning Step 20/20: remove_nans
All Cleaning done


In [93]:
# Preview the cleaned prediction data (the recorded output shows no `label` column).
df_cleaned_predict.head()

Unnamed: 0_level_0,tweet,tweet_cleaned,user_handle,hashtags,emojis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
31963,#studiolife #aislife #requires #passion #dedication #willpower to find #newmaterials…,require passion find,0,"[#studiolife, #aislife, #requires, #passion, #dedication, #willpower, #newmaterials]",
31964,@user #white #supremacists want everyone to see the new ‘ #birds’ #movie — and here’s why,white want everyone see new ' bird ' movie — ’,1,"[#white, #supremacists, #birds, #movie]",
31965,safe ways to heal your #acne!! #altwaystoheal #healthy #healing!!,safe way heal altwaystoheal healthy healing,0,"[#acne, #altwaystoheal, #healthy, #healing]",
31966,"is the hp and the cursed child book up for reservations already? if yes, where? if no, when? 😍😍😍 #harrypotter #pottermore #favorite",child book already yes harrypotter favorite,0,"[#harrypotter, #pottermore, #favorite]",":smiling_face_with_heart-eyes:,:smiling_face_with_heart-eyes:,:smiling_face_with_heart-eyes:"
31967,"3rd #bihday to my amazing, hilarious #nephew eli ahmir! uncle dave loves you and misses…",rd bihday amazing hilarious nephew uncle dave miss,0,"[#bihday, #nephew]",


In [94]:
# Persist the cleaned prediction data for the classical-ML models.
df_cleaned_predict.to_csv('./twitter_hate-speech/predict_cleaned.csv')

##### Geeignet für: Deep Learning (RNN)
Es werden nicht alle implementierten Cleaning- und Preprocessing-Schritte ausgeführt.

NICHT: Duplikate entfernen, Auflösen Negationen (nicht implementiert), Emojis Entfernen, Lemmatisierung, Stopwords entfernen, Most/Least frequent words entfernen

In [95]:
# Load the prediction data for the RNN variant and work on a copy so the
# loaded frame stays untouched.
df_origin_predict_rnn = pd.read_csv('./twitter_hate-speech/predict.csv', index_col=0)
df_clean_base_predict_rnn = df_origin_predict_rnn.copy()

In [96]:
# Remove rows that are exact duplicates in all columns (the index is not compared).
# NOTE(review): the section text lists duplicate removal as NOT performed for
# this variant, yet rows are dropped here — confirm this is intended.
df_clean_base_predict_rnn.drop_duplicates(inplace=True)
# Repair mis-encoded text (mojibake) in every tweet with ftfy's fix_encoding.
df_clean_base_predict_rnn["tweet"] = df_clean_base_predict_rnn['tweet'].apply(fix_encoding)

In [97]:
def clean_dataframe_predict_rnn(base_df):
    """Run the reduced cleaning pipeline for the RNN prediction data.

    Works on a copy of ``base_df``. The copy gets a ``tweet_cleaned`` column
    that starts out as the raw ``tweet`` text and is then handed to the
    cleaning helpers (defined earlier in the notebook) one after another.
    Steps labelled "SKIP" are only announced, not executed, so the progress
    numbering still runs from 1 to 20.

    Args:
        base_df: DataFrame that contains a ``tweet`` column.

    Returns:
        The DataFrame returned by the final helper, ``remove_na_from_column``.
    """
    total_steps = 20
    column = 'tweet_cleaned'
    step_numbers = iter(range(1, total_steps + 1))

    def announce(label):
        # Emit the progress line for the next step number.
        print(f"Cleaning Step {next(step_numbers)}/{total_steps}: {label}")

    frame = base_df.copy()
    frame[column] = frame['tweet']

    print("Start Cleaning")

    announce("to_lowercase")
    frame = to_lowercase(frame, column)

    announce("expand_shortcuts")
    frame = expand_shortcuts(frame, column)

    # Announced only: no negation step is executed here.
    announce("remove_negations - SKIP")

    announce("handle_userhandles")
    frame = handle_userhandles(frame, column)

    announce("handle_hashtags")
    frame = handle_hashtags(frame, column)

    announce("extract_emojis")
    frame = extract_emojis(frame, column)

    announce("replace_emojis")
    frame = replace_emojis(frame, column)

    announce("replace_smileys")
    frame = replace_text_smileys(frame, column)

    # Announced only: emoji removal is not run in this variant.
    announce("remove_emojis - SKIP")

    announce("remove_url_from_tweet")
    frame = remove_url_from_tweet(frame, column)

    announce("remove_punctuation")
    frame = remove_punctuation(frame, column)

    announce("remove_special_characters")
    frame = remove_special_characters(frame, column)

    announce("remove_digis")
    frame = remove_digits(frame, column)

    announce("remove_word_from_column: amp")
    frame = remove_word_from_column(df=frame, column_name=column, word="amp")

    # Announced only: lemmatisation, stop-word removal, frequency-based word
    # removal and duplicate removal are not run in this variant.
    announce("lemmatize - SKIP")
    announce("remove_stop_words - SKIP")
    announce("remove_most_frequent_words - SKIP")
    announce("remove_least_frequent_words - SKIP")
    announce("remove_duplicates  - SKIP")

    announce("remove_nans")
    frame = remove_na_from_column(df=frame, column_name=column)

    print("All Cleaning done")

    return frame

In [98]:
# Run the RNN-specific cleaning pipeline on the prepared base frame.
df_cleaned_predict_rnn = clean_dataframe_predict_rnn(df_clean_base_predict_rnn)

Start Cleaning
Cleaning Step 1/20: to_lowercase
Cleaning Step 2/20: expand_shortcuts
Cleaning Step 3/20: remove_negations - SKIP
Cleaning Step 4/20: handle_userhandles
Cleaning Step 5/20: handle_hashtags
Cleaning Step 6/20: extract_emojis
Cleaning Step 7/20: replace_emojis
Cleaning Step 8/20: replace_smileys
Cleaning Step 9/20: remove_emojis - SKIP
Cleaning Step 10/20: remove_url_from_tweet
Cleaning Step 11/20: remove_punctuation
Cleaning Step 12/20: remove_special_characters
Cleaning Step 13/20: remove_digis
Cleaning Step 14/20: remove_word_from_column: amp
Cleaning Step 15/20: lemmatize - SKIP
Cleaning Step 16/20: remove_stop_words - SKIP
Cleaning Step 17/20: remove_most_frequent_words - SKIP
Cleaning Step 18/20: remove_least_frequent_words - SKIP
Cleaning Step 19/20: remove_duplicates  - SKIP
Cleaning Step 20/20: remove_nans
All Cleaning done


In [99]:
# Preview the first rows of the RNN-cleaned prediction frame.
df_cleaned_predict_rnn.head()

Unnamed: 0_level_0,tweet,tweet_cleaned,user_handle,hashtags,emojis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
31963,#studiolife #aislife #requires #passion #dedication #willpower to find #newmaterials…,studiolife aislife requires passion dedication willpower to find newmaterials…,0,"[#studiolife, #aislife, #requires, #passion, #dedication, #willpower, #newmaterials]",
31964,@user #white #supremacists want everyone to see the new ‘ #birds’ #movie — and here’s why,white supremacists want everyone to see the new ‘ birds’ movie — and here’s why,1,"[#white, #supremacists, #birds, #movie]",
31965,safe ways to heal your #acne!! #altwaystoheal #healthy #healing!!,safe ways to heal your acne altwaystoheal healthy healing,0,"[#acne, #altwaystoheal, #healthy, #healing]",
31966,"is the hp and the cursed child book up for reservations already? if yes, where? if no, when? 😍😍😍 #harrypotter #pottermore #favorite",is the hp and the cursed child book up for reservations already if yes where if no when smiling face with heart-eyes smiling face with heart-eyes smiling face with heart-eyes harrypotter pottermore favorite,0,"[#harrypotter, #pottermore, #favorite]",":smiling_face_with_heart-eyes:,:smiling_face_with_heart-eyes:,:smiling_face_with_heart-eyes:"
31967,"3rd #bihday to my amazing, hilarious #nephew eli ahmir! uncle dave loves you and misses…",rd bihday to my amazing hilarious nephew eli ahmir uncle dave loves you and misses…,0,"[#bihday, #nephew]",


In [100]:
# Write the RNN-cleaned prediction frame to CSV.
df_cleaned_predict_rnn.to_csv('./twitter_hate-speech/predict_cleaned_rnn.csv')