In [1]:
import pandas as pd

In [2]:
de_train = pd.read_csv("./train_dev_test_splits/de.train.csv", sep="\t")
de_val = pd.read_csv("./train_dev_test_splits/de.valid.csv", sep="\t")
de_test = pd.read_csv("./train_dev_test_splits/de.test.csv", sep="\t")

fr_train = pd.read_csv("./train_dev_test_splits/fr.train.csv", sep="\t")
fr_val = pd.read_csv("./train_dev_test_splits/fr.valid.csv", sep="\t")
fr_test = pd.read_csv("./train_dev_test_splits/fr.test.csv", sep="\t")


## Spacy Pipeline - Tokenization

In [3]:
import spacy

nlp_de = spacy.load("de_core_news_sm")
nlp_fr = spacy.load("fr_core_news_sm")

In [5]:
import regex
import emoji
from nltk.corpus import stopwords

def processor(data, lang, nlp):
    reg = '[^a-zA-Z0-9 àâäèéêëîïôœùûüÿçÀÂÄÈÉÊËÎÏÔŒÙÛÜŸÇ]' if lang == 'fr' else '[^a-zA-Z0-9 äöüßÄÖÜẞ]'
    sw = set(stopwords.words('french' if lang=='fr' else 'german'))
    corpus = []
    for i in range(0, data['content'].size):
        text = regex.sub(r'<U\+([0-9a-fA-F]+)>', lambda m: chr(int(m.group(1),16)), data['content'][i])
        text = emoji.demojize(text, language=lang)
        # get review and remove non alpha chars
        text = regex.sub(reg, ' ', text)
        text = text.lower()
        # split into tokens, apply stemming and remove stop words
        text = ' '.join([t.text for t in nlp(text)])
        corpus.append(text)

    return corpus




## TF-IDF Vectorizer

In [6]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = processor(fr_train, 'fr', nlp_fr) + processor(fr_val, 'fr', nlp_fr) + processor(fr_test, 'fr', nlp_fr)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(X.shape)

y = np.concatenate((fr_train["e1"].values, fr_val["e1"].values, fr_test["e1"].values))
print(y.shape)


(3678, 16343)
(3678,)


## French DataSet

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

In [16]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score

clf = SVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(y_pred)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

## German Dataset

In [17]:
corpus = processor(de_train, 'de', nlp_de) + processor(de_val, 'de', nlp_de) + processor(de_test, 'de', nlp_de)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(X.shape)

y = np.concatenate((de_train["e1"].values, de_val["e1"].values, de_test["e1"].values))
print(y.shape)

(4306, 22102)
(4306,)


In [18]:
clf = SVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(y_pred)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [7]:
import numpy as np

def text_to_mean_vector(embeddings, text):
    tokens = text.split()
    
    # convert tokens to embedding vectors, up to sequence_len tokens
    vec = []
    i = 0
    while i < len(tokens):   # while there are tokens and did not reach desired sequence length
        try:
            vec.append(embeddings.get_vector(tokens[i]))
        except KeyError:
            True   # simply ignore out-of-vocabulary tokens
        finally:
            i += 1
    
    # add blanks up to sequence_len, if needed
    vec = np.mean(vec, axis=0)
    return vec

def text_to_vector(embeddings, text, sequence_len):
    
    # split text into tokens
    tokens = text.split()
    
    # convert tokens to embedding vectors, up to sequence_len tokens
    vec = []
    n = 0
    i = 0
    while i < len(tokens) and n < sequence_len:   # while there are tokens and did not reach desired sequence length
        try:
            vec.extend(embeddings.get_vector(tokens[i]))
            n += 1
        except KeyError:
            True   # simply ignore out-of-vocabulary tokens
        finally:
            i += 1
    
    # add blanks up to sequence_len, if needed
    for j in range(sequence_len - n):
        vec.extend(np.zeros(embeddings.vector_size,))
    
    return vec

## Embeddings pfv

In [8]:
from scipy import stats

corpus = processor(de_train, 'de', nlp_de) + processor(de_val, 'de', nlp_de) + processor(de_test, 'de', nlp_de)
lens = [len(c.split()) for c in corpus]
print(np.min(lens), np.max(lens), np.mean(lens), np.std(lens), stats.mode(lens))

1 296 58.20274036228518 43.038069559552135 ModeResult(mode=array([17]), count=array([71]))


  print(np.min(lens), np.max(lens), np.mean(lens), np.std(lens), stats.mode(lens))


In [10]:
print(corpus[0])

und schreiben ihren      deutschland wird islamischer   was verstehen sie darunter   ja   eine multikulturelle gesellschaft beherbergt menschen   unterschiedlicher religionen und farben und unter ihnen wird es konservative und liberale gläubige geben   was hat das mit dem eigentlichen thema zu tun  


In [12]:
corpus_fr = processor(fr_train, 'fr', nlp_fr) + processor(fr_val, 'fr', nlp_fr) + processor(fr_test, 'fr', nlp_fr)
lens = [len(c.split()) for c in corpus_fr]
print(np.min(lens), np.max(lens), np.mean(lens), np.std(lens), stats.mode(lens))

1 642 41.43637846655791 45.65519818612259 ModeResult(mode=array([16]), count=array([98]))


  print(np.min(lens), np.max(lens), np.mean(lens), np.std(lens), stats.mode(lens))
