# Imports & Downloads

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import BernoulliNB, MultinomialNB, ComplementNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [None]:
# Use only the English stop-word list: stopwords.words() without a language
# returns the stop words of every language shipped with NLTK, which also strips
# ordinary English words that happen to be stop words in another language.
# NOTE(review): assumes the tweets are English (the cleaning step only expands
# English contractions) -- confirm.
# Stored as a set so the membership tests during filtering are O(1).
stop_words = set(stopwords.words("english"))

# Loading the data

In [None]:
# Raise the column display width so long tweet texts are not cut off by pandas' default limit
pd.set_option('display.max_colwidth', 400)

In [None]:
# Test split of the hate-speech data set; the first CSV column becomes the index.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
val_csv_path = Path("C:\\Users\\admin\\PycharmProjects\\WebScience24\\data\\twitter_hate-speech\\test.csv")
df_raw_val = pd.read_csv(val_csv_path, index_col=0)
df_raw_val.head()

In [None]:
# Training split of the hate-speech data set; the first CSV column becomes the index.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
train_csv_path = Path("C:\\Users\\admin\\PycharmProjects\\WebScience24\\data\\twitter_hate-speech\\train.csv")
df_raw_train = pd.read_csv(train_csv_path, index_col=0, encoding="utf-8")
df_raw_train.head()

In [None]:
df_raw_train[df_raw_train.label == 0]

## Save val data

In [None]:
# Draw a reproducible 80 % training sample. Without a fixed seed every kernel
# restart yields a different train/validation split and different results below.
train_df = df_raw_train.sample(frac=0.8, random_state=42)
train_df.shape

In [None]:
# The validation set is the part of the training file that was NOT sampled into
# train_df. Dropping the training indices from df_raw_val (the separate test file)
# would either raise a KeyError or remove unrelated rows.
val_df = df_raw_train.drop(train_df.index)
val_df.shape

In [None]:
# Define the target path here: `filepath` is otherwise first assigned much further
# down, so this cell fails with a NameError on a fresh kernel.
# NOTE(review): file name chosen to match the other output files -- adjust if needed.
filepath = Path('data/twitter_validation.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)
val_df.to_csv(filepath)

# Preparing the data

In [None]:
# Work on an independent copy so the cleaning steps below never touch train_df
# and no SettingWithCopyWarning is raised when columns are added later.
df = train_df.copy()

In [None]:
df.info()

## Duplicate-Check

In [None]:
df.duplicated().sum()

=> bei der Menge an Daten (ca. 75k) sollten die Duplikate entfernt werden

In [None]:
df = df.drop_duplicates()
df.duplicated().sum()

## Delete Noise // Irrelevant Data

In [None]:
# Keep only the rows whose 'sentiment' value is not 'Irrelevant'.
# NOTE(review): the raw training frame is filtered via a `label` column further up --
# confirm that a 'sentiment' column exists in this data set, otherwise this raises a KeyError.
df = df[df['sentiment'] != 'Irrelevant']

## Null-Check

In [None]:
df.isna().sum()

=> es fehlen nur Einträge bei Tweet, dabei handelt es sich aber um das entscheidende Feature; ohne Tweet ist der Eintrag wertlos, daher droppen

In [None]:
# Re-assign instead of mutating in place: df is a filtered slice at this point, so
# inplace=True triggers a SettingWithCopyWarning and makes the cell non-idempotent.
df = df.dropna(axis="rows")
df.isna().sum()

## Inconsistent text & typos

In [None]:
print('Column: Topic')
print(df['topic'].value_counts().sort_index())

=> keine Typos erkennbar
=> Doppelung von CallOfDuty / CallOfDutyBlackopsColdWar -> ggf. zusammenlegen

In [None]:
df[(df.topic == "CallOfDuty")]

In [None]:
blackops_keywords = "ops|Ops"
df[(df.topic == "CallOfDuty") & (df.tweet.str.contains(blackops_keywords))]

In [None]:
df[(df.topic == "CallOfDutyBlackopsColdWar")]

In [None]:
cod_keywords = "verdansk|Verdansk|warzone|Warzone|modern|Modern|warfare|Warfare"
df[(df.topic == "CallOfDutyBlackopsColdWar") & (df.tweet.str.contains(cod_keywords))]

In [None]:
print('Column: sentiment')
print(df['sentiment'].value_counts())

=> keine Typos erkennbar

## Outlier

In [None]:
# Spalte: Topic
print(df['topic'].value_counts())

In [None]:
# Column: tweet
# Approach: check for outliers in the length (characters) and the word count of each tweet.
# assign() returns a new frame, so no column is written onto a filtered slice
# (which would raise a SettingWithCopyWarning).
df = df.assign(
    char_count=df["tweet"].str.len(),
    word_count=df["tweet"].str.split().str.len(),
).reset_index(drop=True)

In [None]:
df["char_count"].describe()

In [None]:
# Histogram of the tweet length measured in characters
ax = sns.histplot(df["char_count"])
ax.set_title('Character Count of all Tweets')
ax.set_xlabel('Character Count')
ax.set_ylabel('Number of Tweets')

In [None]:
# Vertical box plot of the characters per tweet. The series is passed explicitly as
# `y`: a positional argument is interpreted as `x` (horizontal) or as `data`
# depending on the seaborn version, while the label below describes the y axis.
ax = sns.boxplot(y=df["char_count"])
ax.set_title('Distribution of Number of Characters per Tweet')
ax.set_ylabel('Character Count')

In [None]:
df["word_count"].describe()

In [None]:
# Histogram of the number of words per tweet
ax = sns.histplot(df["word_count"])
ax.set_title('Count of Words of all Tweets')
ax.set_xlabel('Word Count')
ax.set_ylabel('Number of Tweets')

In [None]:
# Box plot of the number of words per tweet
ax = sns.boxplot(data=df["word_count"])
ax.set_title('Distribution of Words per Tweet')
ax.set_ylabel('Number of Words')

In [None]:
# Determine outliers numerically: more than `n_std` standard deviations away from the mean = outlier
def find_outlier(data, n_std=3):
    """Print and return the outliers of every numeric column of ``data``.

    A value counts as an outlier when it lies more than ``n_std`` standard
    deviations above or below the mean of its column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect; non-numeric columns are ignored.
    n_std : int or float, default 3
        Width of the accepted band around the mean, in standard deviations.

    Returns
    -------
    dict
        Maps each numeric column name to the ascending list of its outlier values
        (an empty list when the column has none).
    """
    outliers = {}
    for num_col in data.select_dtypes(["number"]).columns:
        values = data[num_col]
        sd = values.std()
        mean = values.mean()
        # Vectorised comparison instead of a Python-level loop over every value;
        # NaN compares False on both sides and is therefore never reported.
        is_outlier = (values > mean + n_std * sd) | (values < mean - n_std * sd)
        result = sorted(values[is_outlier].tolist())
        outliers[num_col] = result
        print()
        print("Column: ", num_col)
        print("Mean: ", mean, "; Std: ", sd)
        print("Outlier: ", len(result), "; Values: ", result)
    return outliers

In [None]:
find_outlier(df[["char_count", "word_count"]])

In [None]:
df[df["char_count"] >= 349]

In [None]:
df[df["word_count"] >= 63]

=> Strategie: Einträge droppen, um ML Modelle nicht durch Ausreißer zu beeinflussen (bei DeepLearning-Modellen tendenziell irrelevant)

In [None]:
# Drop every tweet with 349 or more characters.
# NOTE(review): 349 is hard-coded -- presumably copied from the outlier report above;
# confirm that it still matches the current data after re-running the notebook.
df = df[df["char_count"] < 349]
# Sanity check: this selection should now be empty
df[df["char_count"] >= 349]

In [None]:
# Drop every tweet with 63 or more words.
# NOTE(review): 63 is hard-coded -- presumably copied from the outlier report above;
# confirm that it still matches the current data after re-running the notebook.
df = df[df["word_count"] < 63]
# Sanity check: this selection should now be empty
df[df["word_count"] >= 63]

In [None]:
# df.drop(['Char_Count', 'Word_Count'], axis=1, inplace=True)

## Datensatz nach Bereinigung

In [None]:
# Compare the cleaned frame with the frame the cleaning started from. The original
# cell referenced `df_raw`, which is never defined in this notebook (NameError);
# `train_df` is the frame `df` was derived from.
# Columns that only exist after cleaning (char_count, word_count) show NaN in the
# difference and ratio because they have no counterpart in train_df.
counts_before = train_df.count()
counts_after = df.count()
print(counts_after)
print(counts_before)
print(counts_after - counts_before)
print(counts_after / counts_before - 1)

## Check Val/Test-Data

In [None]:
# df_val_raw = pd.read_csv("C:\\Users\\admin\\PycharmProjects\\WebScience24\\data\\twitter_hate-speech\\twitter_validation.csv", index_col=0) # Fehlerhaft

=> Bewertung Validation-Daten: sehr viele Fehler, ohne händisches Säubern ggf. nicht verwendbar, von der Menge her gering (< 2000)



## Save cleaned Data

In [None]:
filepath = Path('data/twitter_training_cleaned.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)

In [None]:
df.to_csv(filepath)

# EDA

In [None]:
df.info()

In [None]:
df.describe()

## Korrelationsanalyse

In [None]:
# Boolean target: True for negative tweets, False otherwise. assign() returns a new
# frame, so nothing is written onto the filtered slice produced by the outlier
# removal (which would raise a SettingWithCopyWarning).
df = df.assign(sentiment_binary=df["sentiment"] == "Negative")

In [None]:
sns.heatmap(df.corr(numeric_only=True), annot=True, vmin=-1, vmax=1, cmap="coolwarm")

In [None]:
target_corr = df.corr(numeric_only=True)["sentiment_binary"]
target_corr.abs().sort_values(ascending=False)

Korrelationsanalyse feature<->target => keine originären numerischen Werte, bei den erzeugten numerischen Werten keine nennenswerte Korrelation erkennbar

## Targetanalyse

In [None]:
df["sentiment"].value_counts()

In [None]:
# Bar colour for every sentiment class
color_mapping = {
    'Negative': 'red',
    'Positive': 'green',
    'Neutral': 'orange',
    #'Irrelevant': 'grey'
}

# Tweets per sentiment as a two-column frame, plus the colour of each class
counts = df['sentiment'].value_counts().reset_index()
counts.columns = ['sentiment', 'count']
counts['color'] = counts['sentiment'].map(color_mapping)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=counts, y='sentiment', x='count', palette=counts['color'], ax=ax)

# Write the count just to the right of the end of every bar
for bar in ax.patches:
    bar_length = bar.get_width()
    bar_center = bar.get_y() + bar.get_height() / 2.
    ax.annotate(str(int(bar_length)), (bar_length, bar_center),
                ha='left', va='center', xytext=(5, 0), textcoords='offset points')

# 10 % head room so the labels are not clipped at the right edge
ax.set_xlim(0, counts['count'].max() * 1.1)

ax.set_xlabel('Number of Tweets')
ax.set_ylabel('Sentiment')
ax.set_title('Number of Tweets per Sentiment')

plt.tight_layout()
plt.show()

## Featureanalyse

In [None]:
# Feature: Topic
# Topics are (roughly) uniformly distributed; merging both COD topics could create an outlier
ax = sns.histplot(df["topic"])
ax.set_ylabel('Number of Tweets')
ax.set_xlabel('Topic')
ax.set_title('Number of Tweets per Topic')

In [None]:
# Feature: Tweet
sns.histplot(df["word_count"])

# Redundant to one of the charts above (section 1) -- is this intended, or shall we get rid of one?

In [None]:
sns.histplot(df["char_count"])

# Redundant to one of the charts above (section 1) -- is this intended, or shall we get rid of one?

## Feature x Target

In [None]:
# Character count per sentiment class

fig, axes = plt.subplots(1, 3, figsize=(15, 8))

# One histogram panel per sentiment, from left to right
for ax, sentiment in zip(axes, ('Positive', 'Negative', 'Neutral')):
    ax.hist(df.loc[df['sentiment'] == sentiment, 'char_count'])
    ax.set_title(f'{sentiment} Tweets')
    ax.set_xlabel('Character Count')
    ax.set_ylabel('Frequency')

plt.suptitle('Count of Characters per Sentiment Category', fontsize=16)

# Keep the top 5 % of the figure free for the overall title
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

In [None]:
# Word count per sentiment class

fig, axes = plt.subplots(1, 3, figsize=(15, 8))

# One histogram panel per sentiment, from left to right
for ax, sentiment in zip(axes, ('Positive', 'Negative', 'Neutral')):
    ax.hist(df.loc[df['sentiment'] == sentiment, 'word_count'])
    ax.set_title(f'{sentiment} Tweets')
    ax.set_xlabel('Word Count')
    ax.set_ylabel('Frequency')

plt.suptitle('Count of Words per Sentiment Category', fontsize=16)

# Keep the top 5 % of the figure free for the overall title
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

# Data Preprocessing for Naive-Bayes

## Text cleaning: emojis, toLower etc.

In [None]:
# Typographic apostrophe variants that are mapped to "'" so the contraction table matches them
_APOSTROPHE_VARIANTS = re.compile("[’´`]")

# Contraction -> expansion. Keys are lower-case because the text is lower-cased
# before the table is applied.
_CONTRACTIONS = {
    "isn't": "is not",
    "he's": "he is",
    "wasn't": "was not",
    "there's": "there is",
    "couldn't": "could not",
    "won't": "will not",
    "they're": "they are",
    "she's": "she is",
    "wouldn't": "would not",
    "haven't": "have not",
    "that's": "that is",
    "you've": "you have",
    "what's": "what is",
    "weren't": "were not",
    "we're": "we are",
    "hasn't": "has not",
    "you'd": "you would",
    "shouldn't": "should not",
    "let's": "let us",
    "they've": "they have",
    "you'll": "you will",
    "i'm": "i am",
    "we've": "we have",
    "it's": "it is",
    "don't": "do not",
    "i'd": "i would",  # same expansion as "you'd"
}

# Single alternation over all contractions; the word boundaries prevent matches
# inside longer words.
_CONTRACTION_PATTERN = re.compile(
    r"\b(?:" + "|".join(re.escape(short) for short in _CONTRACTIONS) + r")\b"
)

# Compiled once at definition time instead of on every call.
# NOTE(review): the last range (U+24C2..U+1F251) is very broad and also covers many
# non-emoji code points such as CJK ideographs -- confirm this is intended.
_EMOJI_PATTERN = re.compile("["
                            "\U0001F600-\U0001F64F"  # emoticons
                            "\U0001F300-\U0001F5FF"  # symbols & pictographs
                            "\U0001F680-\U0001F6FF"  # transport & map symbols
                            "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            "\U00002702-\U000027B0"
                            "\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)


def cleaning(text):
    """Normalise one tweet for the bag-of-words models.

    Steps, in order: lower-case, remove URLs, expand English contractions,
    remove stand-alone numbers, remove ``<...>`` tags, remove ASCII punctuation,
    turn line breaks into spaces, remove typographic quotes/ellipsis, and remove
    emoji.

    Contractions are expanded BEFORE punctuation is stripped. Doing it afterwards
    means the apostrophes are already gone and no contraction can match.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. Runs of whitespace left by removed tokens are not
        collapsed.
    """
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # URL links

    # Contractions: unify the apostrophe first, then expand via the lookup table
    text = _APOSTROPHE_VARIANTS.sub("'", text)
    text = _CONTRACTION_PATTERN.sub(lambda match: _CONTRACTIONS[match.group(0)], text)

    text = re.sub(r"\b\d+\b", "", text)  # stand-alone numbers
    text = re.sub('<.*?>+', '', text)  # <...> tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # ASCII punctuation
    text = re.sub('\n', ' ', text)  # a space keeps the words of adjacent lines apart
    text = re.sub('[’“”…]', '', text)  # typographic quotes and ellipsis

    return _EMOJI_PATTERN.sub('', text)

In [None]:
dt = df['tweet'].apply(cleaning)

In [None]:
dt = pd.DataFrame(dt)
dt['sentiment'] = df['sentiment']

In [None]:
dt.head()

## Stopwords

In [None]:
# Build the lookup set once: testing membership against a list rescans the whole
# list for every single token of every tweet.
stop_word_set = set(stop_words)
dt['no_sw'] = dt['tweet'].apply(
    lambda tweet: ' '.join(word for word in tweet.split() if word not in stop_word_set)
)

In [None]:
dt.head(20)

## Most frequent words

In [None]:
# Count every token of the stop-word-free tweets
cnt = Counter(word for text in dt["no_sw"].values for word in text.split())

# The 20 most frequent words as a small table
temp = pd.DataFrame(cnt.most_common(20), columns=['word', 'count'])

In [None]:
temp.head(20)

most frequent words erstmal nicht entfernen (love, shit etc. vmtl. ausschlaggebend für Klassifikation)

In [None]:
# FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
# def remove_freqwords(text):
#     return " ".join([word for word in str(text).split() if word not in FREQWORDS])
# dt["no_sw_no_freqwo"] = dt["no_sw"].apply(lambda text: remove_freqwords(text))

In [None]:
# dt.head()

In [None]:
# dt[dt.no_sw != dt.no_sw_no_freqwo].head()

## Lemmatization

In [None]:
wordnet_lem = WordNetLemmatizer()
# lemmatize() expects a single word. Passing the whole tweet treats the sentence
# as one token and returns it (almost always) unchanged, so lemmatise per token.
dt['no_sw_lem'] = dt['no_sw'].apply(
    lambda text: ' '.join(wordnet_lem.lemmatize(word) for word in text.split())
)

In [None]:
dt.head()

In [None]:
dt[dt.no_sw != dt.no_sw_lem].head(500)

## Check again: Duplicates / NaN

In [None]:
dt.duplicated().sum()

In [None]:
dt.drop_duplicates(inplace=True)
dt.duplicated().sum()

In [None]:
dt.isna().sum()

In [None]:
dt.info()

## Save preprocessed data

In [None]:
filepath = Path('data/twitter_training_cleaned_preprocessed.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)

In [None]:
dt.to_csv(filepath)