#### Imports & Downloads

In [None]:
!pip install emoji
!pip install pyspellchecker
!pip install textblob
!pip install ftfy
!python -m spacy download en_core_web_sm

In [1]:

import os
current_dir = os.getcwd()
src_path = os.path.abspath(os.path.join(current_dir, '../../../1. Explorative Analysis & Preprocessing/nasiba/'))
os.chdir(src_path) 

In [2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import emoji
from nltk.corpus import stopwords, wordnet
import time
from pathlib import Path

from collections import Counter
from spellchecker import SpellChecker
import os

from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from nltk.stem import WordNetLemmatizer
from ftfy import fix_encoding
import spacy


from src.preprocessing.cleaning.clean_data_generic_functions import clean_misspelled_words, correct_misspelled_words_in_sentence,create_word_counter,expand_slang,expand_shortcuts,extract_emojis, to_lowercase, to_lowercase_if_string, handle_hashtags, handle_userhandles, lemmatize, remove_word_from_column
from src.preprocessing.cleaning.clean_data_generic_functions import remove_special_characters, remove_digits, remove_duplicates, remove_emoji_in_sentence, remove_emojis, remove_freqwords,remove_hashtag_sign_from_tweet,remove_least_frequent_words, remove_most_frequent_words, remove_punctuation, remove_stop_words, remove_url_from_tweet, replace_emoji_in_sentence, replace_emojis


nlp = spacy.load("en_core_web_sm")

pd.set_option('display.max_colwidth', None)

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nasiba\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nasiba\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Nasiba\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Nasiba\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Nasiba\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

### Einladen der Daten

#### Trainingsdaten

In [3]:

source_filepath_name = os.path.abspath(os.path.join(src_path, 'data/train.csv'))
df_origin = pd.read_csv(source_filepath_name, encoding='utf-8', index_col=0)

#### Testdaten

In [5]:
source_filepath_name_test = os.path.abspath(os.path.join(src_path, 'data/test.csv'))
df_origin_test = pd.read_csv(source_filepath_name_test, encoding='utf-8', index_col=0)

### Datenvorbereitung

In [6]:
df_clean_base = df_origin.copy()
df_clean_base_test = df_origin_test.copy()

#### 1. Datenbereinigung allgemein

##### Duplikatenentfernung

In [7]:
df_clean_base.drop_duplicates(inplace=True)
df_clean_base_test.drop_duplicates(inplace=True)
df_clean_base.duplicated().sum()
df_clean_base_test.duplicated().sum()


0

#### Encoding

In [8]:
df_clean_base["tweet"] = df_clean_base['tweet'].apply(fix_encoding)
df_clean_base_test["tweet"] = df_clean_base_test['tweet'].apply(fix_encoding)

#### 2. Datenbereinigung speziell für Textverarbeitung

##### Reihenfolge Datenbereinigung

Sinnvolle Reihenfolge für die Ausführung der einzelnen Bereinigungsschritte
1. Groß-/Kleinschreibung normalisieren
2. Abkürzungen/Slang auflösen
3. Falsch geschriebene Wörter korrigieren
4. Negationen auflösen (falls möglich)
5. Umgang mit User-Handles
6. Umgang mit Hashtags (in Text belassen UND in gesonderte Spalte extrahieren)
7. Emojis konvertieren (in Text konvertieren UND in gesonderte Spalte extrahieren)
8. Unnötige Zeichen entfernen
    a. Links
    b. html tags
    d. Punktuation
    e. Sonderzeichen
    f. Zeilenumbrüche
    g. Zahlen
    h. das Wort amp
9. Stemming / Lemmatization
10. Stopwords entfernen
11. Most frequent words entfernen
12. Rare words entfernen
13. Nochmal Duplikate prüfen & entfernen

Lemmatization: https://www.geeksforgeeks.org/python-lemmatization-approaches-with-examples/

In [9]:
def clean_dataframe(base_df):
    df_cleaned = base_df.copy()
    df_cleaned['tweet_cleaned'] = df_cleaned['tweet']

    print("Start Cleaning")
    print("Cleaning Step 1/18: to_lowercase")
    df_cleaned = to_lowercase(df_cleaned,'tweet_cleaned')
    print("Cleaning Step 2/18: expand_shortcuts")
    df_cleaned = expand_shortcuts(df_cleaned,'tweet_cleaned')
    # print("Cleaning Step 3/18: remove_negations - SKIP")
    # # df_cleaned = remove_negations(df_cleaned)
    print("Cleaning Step 4/18: handle_userhandles")
    df_cleaned = handle_userhandles(df_cleaned,'tweet_cleaned')
    print("Cleaning Step 5/18: handle_hashtags")
    df_cleaned = handle_hashtags(df_cleaned,'tweet_cleaned')
    print("Cleaning Step 6/18: extract_emojis")
    df_cleaned = extract_emojis(df_cleaned,'tweet_cleaned')
    print("Cleaning Step 7/18: replace_emojis")
    df_cleaned = replace_emojis(df_cleaned,'tweet_cleaned')
    print("Cleaning Step 8/18: remove_emojis - SKIP")
    # df_cleaned = remove_emojis(df_cleaned)
    print("Cleaning Step 9/18: remove_url_from_tweet")
    df_cleaned = remove_url_from_tweet(df_cleaned,'tweet_cleaned')
    print("Cleaning Step 10/18: remove_punctuation")
    df_cleaned = remove_punctuation(df_cleaned,'tweet_cleaned')
    print("Cleaning Step 11/18: remove_special_characters")
    df_cleaned = remove_special_characters(df_cleaned, 'tweet_cleaned')
    print("Cleaning Step 12/18: remove_digis")
    df_cleaned = remove_digits(df_cleaned,'tweet_cleaned')
    print("Cleaning Step 13/18: remove_word_from_column: amp")
    df_cleaned = remove_word_from_column(df=df_cleaned, column_name="tweet_cleaned",word="amp")
    print("Cleaning Step 14/18: lemmatize")
    df_cleaned = lemmatize(df_cleaned,'tweet_cleaned')
    print("Cleaning Step 15/18: remove_stop_words")
    df_cleaned = remove_stop_words(df_cleaned,'tweet_cleaned')
    print("Cleaning Step 16/18: remove_most_frequent_words")
    df_cleaned = remove_most_frequent_words(df_cleaned,'tweet_cleaned')
    print("Cleaning Step 17/18: remove_least_frequent_words")
    df_cleaned = remove_least_frequent_words(df_cleaned,'tweet_cleaned')
    print("Cleaning Step 18/18: remove_duplicates")
    df_cleaned = remove_duplicates(df_cleaned,'tweet_cleaned')
    print("All Cleaning done")

    return df_cleaned

#### 2.1 Datenbereinigung Train Datensatz

In [10]:

df_cleaned = clean_dataframe(df_clean_base)

Start Cleaning
Cleaning Step 1/18: to_lowercase
Cleaning Step 2/18: expand_shortcuts
Cleaning Step 4/18: handle_userhandles
Cleaning Step 5/18: handle_hashtags
Cleaning Step 6/18: extract_emojis
Cleaning Step 7/18: replace_emojis
Cleaning Step 8/18: remove_emojis - SKIP
Cleaning Step 9/18: remove_url_from_tweet
Cleaning Step 10/18: remove_punctuation
Cleaning Step 11/18: remove_special_characters
Cleaning Step 12/18: remove_digis
Cleaning Step 13/18: remove_word_from_column: amp
Cleaning Step 14/18: lemmatize
Cleaning Step 15/18: remove_stop_words
Cleaning Step 16/18: remove_most_frequent_words
Cleaning Step 17/18: remove_least_frequent_words
Cleaning Step 18/18: remove_duplicates
All Cleaning done


#### 2.2 Datenbereinigung Test Datensatz

In [11]:
df_cleaned_test = clean_dataframe(df_clean_base_test)

Start Cleaning
Cleaning Step 1/18: to_lowercase
Cleaning Step 2/18: expand_shortcuts
Cleaning Step 4/18: handle_userhandles
Cleaning Step 5/18: handle_hashtags
Cleaning Step 6/18: extract_emojis
Cleaning Step 7/18: replace_emojis
Cleaning Step 8/18: remove_emojis - SKIP
Cleaning Step 9/18: remove_url_from_tweet
Cleaning Step 10/18: remove_punctuation
Cleaning Step 11/18: remove_special_characters
Cleaning Step 12/18: remove_digis
Cleaning Step 13/18: remove_word_from_column: amp
Cleaning Step 14/18: lemmatize
Cleaning Step 15/18: remove_stop_words
Cleaning Step 16/18: remove_most_frequent_words
Cleaning Step 17/18: remove_least_frequent_words
Cleaning Step 18/18: remove_duplicates
All Cleaning done


### Die Ergebnisse speichern

In [None]:

filepath_name = os.path.abspath(os.path.join(src_path, 'data/train_cleaned.csv'))
df_cleaned.to_csv(filepath_name)


In [None]:
filepath_name = os.path.abspath(os.path.join(src_path, 'data/test_cleaned.csv'))
df_cleaned_test.to_csv(filepath_name)