In [None]:
import csv

import gensim
import numpy as np
import pandas as pd

from helper.data_loading import *
from helper.preprocessing import *
from helper.cross_val_model import *
from helper.classifier_helper import *

# Never truncate cell contents when displaying frames. `None` means "no limit";
# the legacy value -1 is deprecated since pandas 1.0 and no longer accepted by
# newer pandas releases.
pd.set_option('display.max_colwidth', None)

In [None]:
# Load the pickled GermEval frame and expose its 'label_1' column as 'label'.
# NOTE(review): pd.read_pickle can execute arbitrary code - only load trusted files.
germEval = (
    pd.read_pickle('data/GERMEVAL_with_topic_distribution.pkl')
    .rename(columns={'label_1': 'label'})
)

# Topic distribution computed over the rows labelled "OTHER" only; the cells
# below pass it to create_sample_pool / create_data_from_pool.
germEval_topic_distribution = get_topic_distribution_over_dataset(
    germEval[germEval["label"] == "OTHER"]
)
print("germEval topic distribution:\n",germEval_topic_distribution, "\n")

In [None]:
def prepare_data(data):
    """Run the tweet preprocessing pipeline and store the result in ``data["token"]``.

    The ``text`` column is passed through the steps listed in ``steps`` in
    order; each step receives the previous step's output for that tweet.
    A progress line ("start <step>") is printed before every step.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``text`` column. It is modified in place: a ``token``
        column is added (or overwritten).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    # (progress label, per-tweet transformation), applied top to bottom.
    steps = [
        ("remove_hand_selected_words", remove_hand_selected_words),
        ("rermove_repeating_chars", rermove_repeating_chars),
        ("emoji_2_text", emoji_2_text),
        # pre_process_doc's result is re-joined into one space-separated string.
        ("ekphrasis", lambda text: " ".join(tw_process.pre_process_doc(text))),
        # NOTE(review): an earlier inline comment hinted that characters outside
        # [A-Za-z0-9\säüßöÖÄÜ<>_:!?.,\-] are stripped here - confirm in the helper.
        ("remove_special_chars", remove_special_chars),
        ("lower", lambda text: text.lower()),
        ("sentence_to_token", sentence_to_token),
    ]

    tokens = data["text"]
    for label, transform in steps:
        print("start " + label)
        # Series.map touches only the one column we need. Row-wise
        # DataFrame.apply(axis=1) builds a Series per row and returns a
        # DataFrame (not a Series) when the frame is empty.
        tokens = tokens.map(transform)

    data["token"] = tokens
    return data


In [None]:
# Experiment: GermEval combined with data built from the LEFT corpus.
# NOTE(review): pd.read_pickle can execute arbitrary code - only load trusted files.
left_pool = create_sample_pool(
    pd.read_pickle('data/LEFT_with_topic_distribution.pkl'),
    germEval_topic_distribution,
    sample_factor=5,
)

# create_data_from_pool -> concat_germEval_and_Other -> prepare_data -> replace_label_to_binary
left = concat_germEval_and_Other(
    germEval,
    create_data_from_pool(left_pool, germEval_topic_distribution),
)
left = left.pipe(prepare_data).pipe(replace_label_to_binary)

del left_pool  # free the pool before cross-validation

tweets = np.array(left["token"].tolist())
labels = np.array(left["label"].tolist())

scores_model = perform_cross_validation(
    folds=3,
    tweets=tweets,
    labels=labels,
    print_fold_eval=True,
)
del left

In [None]:
# Experiment: GermEval combined with data built from the RIGHT corpus.
# NOTE(review): pd.read_pickle can execute arbitrary code - only load trusted files.
right_pool = create_sample_pool(
    pd.read_pickle('data/RIGHT_with_topic_distribution.pkl'),
    germEval_topic_distribution,
    sample_factor=5,
)

# create_data_from_pool -> concat_germEval_and_Other -> prepare_data -> replace_label_to_binary
right = concat_germEval_and_Other(
    germEval,
    create_data_from_pool(right_pool, germEval_topic_distribution),
)
right = right.pipe(prepare_data).pipe(replace_label_to_binary)

del right_pool  # free the pool before cross-validation

tweets = np.array(right["token"].tolist())
labels = np.array(right["label"].tolist())

scores_model = perform_cross_validation(
    folds=3,
    tweets=tweets,
    labels=labels,
    print_fold_eval=True,
)

del right

In [None]:
# Experiment: GermEval combined with data built from the NEUTRAL corpus.
# NOTE(review): pd.read_pickle can execute arbitrary code - only load trusted files.
neutral_pool = create_sample_pool(
    pd.read_pickle('data/NEUTRAL_with_topic_distribution.pkl'),
    germEval_topic_distribution,
    sample_factor=5,
)

# create_data_from_pool -> concat_germEval_and_Other -> prepare_data -> replace_label_to_binary
neutral = concat_germEval_and_Other(
    germEval,
    create_data_from_pool(neutral_pool, germEval_topic_distribution),
)
neutral = neutral.pipe(prepare_data).pipe(replace_label_to_binary)

del neutral_pool  # free the pool before cross-validation

tweets = np.array(neutral["token"].tolist())
labels = np.array(neutral["label"].tolist())

scores_model = perform_cross_validation(
    folds=3,
    tweets=tweets,
    labels=labels,
    print_fold_eval=True,
)

del neutral

In [None]:
# Baseline: cross-validate on GermEval alone, without an additional corpus.
# This rebinds `germEval` to the output of prepare_data / replace_label_to_binary,
# so the cell is not idempotent: re-running it processes the already processed frame.
germEval = germEval.pipe(prepare_data).pipe(replace_label_to_binary)

tweets = np.array(germEval["token"].tolist())
labels = np.array(germEval["label"].tolist())

scores_model = perform_cross_validation(
    folds=3,
    tweets=tweets,
    labels=labels,
    print_fold_eval=True,
)


In [None]:
# Preview the first rows of `germEval` as left by the preceding cell
# (after prepare_data and replace_label_to_binary).
germEval.head()

In [None]:
# Reload the pickled GermEval frame from disk. This rebinds `germEval` WITHOUT
# the 'label_1' -> 'label' rename applied when it was first loaded, so earlier
# cells that read germEval["label"] / germEval.label should not be re-run
# after this one without re-running the first load cell.
# NOTE(review): pd.read_pickle can execute arbitrary code - only load trusted files.
germEval = pd.read_pickle('data/GERMEVAL_with_topic_distribution.pkl')

In [None]:
# Preview the first rows of the freshly reloaded GermEval frame.
germEval.head()

In [None]:
# Non-null counts of every other column, grouped by the 'label_1' value.
germEval.groupby("label_1").count()

In [None]:
# (rows, columns) of the reloaded GermEval frame.
germEval.shape

In [None]:
# Column names assigned to the header-less GermEval TSV files.
GERMEVAL_COLUMNS = ['text', 'label_1', 'label_2']


def _read_germeval_split(path, origin, year):
    """Read one GermEval TSV file and tag every row with its provenance.

    Parameters
    ----------
    path : str
        Tab-separated file without a header row; its columns are read as
        ``text``, ``label_1`` and ``label_2``.
    origin : str
        Value written to the ``origin`` column (here "train" or "test").
    year : str
        Value written to the ``year`` column (here "18" or "19").

    Returns
    -------
    pandas.DataFrame
        The parsed rows plus the constant ``origin`` and ``year`` columns.
    """
    split = pd.read_csv(
        path,
        sep="\t",
        encoding="utf-8",
        # QUOTE_NONE: quote characters in a tweet are kept as ordinary text.
        quoting=csv.QUOTE_NONE,
        names=GERMEVAL_COLUMNS,
    )
    split["origin"] = origin
    split["year"] = year
    return split


# load dataset germEval 2019
# NOTE: emojis are not decoded properly in the 2019 files
germeval2019_subtask1_2_train = _read_germeval_split(
    'data/germEval2019/germeval2019.training_subtask1_2_korrigiert.txt',
    origin="train",
    year="19",
)
germeval2019_subtask1_2_test = _read_germeval_split(
    'data/germEval2019/germeval2019GoldLabelsSubtask1_2.txt',
    origin="test",
    year="19",
)

# load dataset germEval 2018
germEval2018_train = _read_germeval_split(
    'data/germEval2018/germeval2018.training.txt',
    origin="train",
    year="18",
)
germEval2018_test = _read_germeval_split(
    'data/germEval2018/germeval2018.test.txt',
    origin="test",
    year="18",
)

df = pd.concat([germeval2019_subtask1_2_train,
                germeval2019_subtask1_2_test,
                germEval2018_train,
                germEval2018_test])

# Remove duplicate tweets introduced by concatenating the different datasets.
# The comparison is restricted to the content columns: 'origin' and 'year'
# differ per source file, so comparing all columns could never match the same
# tweet across two files. The first occurrence (in concat order) is kept.
df = df.drop_duplicates(subset=GERMEVAL_COLUMNS)

# Shuffle with a fixed seed so the row order is reproducible.
df = df.sample(frac=1, random_state=1993).reset_index(drop=True)


In [None]:
# Preview the first rows of the combined, shuffled GermEval 2018/2019 frame.
df.head()

In [None]:
# Non-null counts per (label_1, origin, year) combination.
df.groupby(["label_1","origin","year"]).count()

In [None]:
# Non-null counts per year.
df.groupby(["year"]).count()

In [None]:
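# Scratch notes: counts copied by hand (presumably from the groupby outputs
# above - TODO confirm which row/column is which). Kept as comments because
# bare tab-separated numbers are not valid Python and abort "Run All".
# 1688	1287
# 1202	970
# 3321	2708
# 2330	2061
#
#
# 8541	7026   <- column totals of the four rows above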

In [None]:
# Total of rows 1-2 (both columns) of the hand-copied count table two cells up.
1688+1287+1202+970

In [None]:
# Total of rows 3-4 (both columns) of the same hand-copied count table.
3321+2708+2330+2061