# NLP Project - Leichte Sprache

## Pickle Save/Load

In [None]:
# Helpers for saving and loading Python objects with pickle
import pickle

def save_object(object_to_save, file_name_to_save='pickled_data.pkl'):
    """Serialize ``object_to_save`` to disk with pickle.

    Args:
        object_to_save: Any picklable object.
        file_name_to_save: Destination path. Defaults to 'pickled_data.pkl',
            the path that was previously hard-coded, so existing calls with
            a single argument behave as before.
    """
    # 'wb' truncates: an existing file at this path is overwritten.
    with open(file_name_to_save, 'wb') as file:
        pickle.dump(object_to_save, file)

def load_object(file_name_to_load):
    """Read the file ``file_name_to_load`` and return the object pickled in it."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(file_name_to_load, 'rb') as pickle_file:
        return pickle.load(pickle_file)

# Saving is disabled; un-comment the next line to write the current `df` to disk.
#save_object(df)
# Restore `df` from the pickle file; the file must already exist, otherwise open() raises.
# NOTE(review): `df` is assigned again further down when the CSV files are read —
# confirm which of the two sources is the intended one on a fresh run.
df = load_object('pickled_data.pkl')

## Install spacy dependencies

In [42]:
!pip install --upgrade spacy
!pip install spacy-transformers
!python -m spacy download de_dep_news_trf

Requirement already up-to-date: spacy in /usr/local/lib/python3.7/dist-packages (3.0.6)
2021-06-15 17:29:36.671260: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_dep_news_trf')


## Main Code

### Prepare Processing

In [None]:
import spacy
import pandas as pd

In [None]:
import io

def _load_normal_articles(csv_path, category):
    """Read one standard-language CSV export and tag every row with ``category``.

    Args:
        csv_path: Path of the CSV file to read.
        category: Value written to the 'category' column of every row.

    Returns:
        A new DataFrame without the 'Line_ID', 'year', 'month' and 'day'
        columns and with 'category' set to ``category``.
    """
    return (
        pd.read_csv(csv_path)
        .drop(columns=['Line_ID', 'year', 'month', 'day'])
        .assign(category=category)
    )


# Standard-language articles, one CSV per section.
df_culture = _load_normal_articles('./data/Kultur_normal.csv', 'Kultur')
df_sport = _load_normal_articles('./data/Sport_normal.csv', 'Sport')
# NOTE(review): the politics export is labelled 'Nachrichten', presumably to
# match the category names of the Leichte-Sprache data — confirm.
df_politic = _load_normal_articles('./data/Politik_normal.csv', 'Nachrichten')

# Label 0 = not Leichte Sprache.
df_not_leichte_sprache = pd.concat([df_culture, df_sport, df_politic]).assign(is_leichte_sprache=0)

# Leichte-Sprache articles (label 1); rows of category 'Vermischtes' are excluded.
# .assign() builds a new frame, so no column is written onto a filtered slice
# (which is what triggers pandas' SettingWithCopyWarning).
df_leichte_sprache = (
    pd.read_csv('./data/leicht_nachricht.csv')
    .drop(columns=['audio_link', 'Line_ID', 'year', 'month', 'day'])
    .loc[lambda frame: frame['category'] != 'Vermischtes']
    .assign(is_leichte_sprache=1)
)

# Stack both corpora and renumber the rows 0..n-1.
df = pd.concat([df_not_leichte_sprache, df_leichte_sprache], ignore_index=True)

In [None]:
import re
import string

def data_cleansing(text):
    """Collapse every run of whitespace into one space and trim both ends.

    Line feeds and carriage returns are whitespace, so the single ``\\s+``
    substitution also replaces them; separate passes for them are not needed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    return re.sub(r'\s+', ' ', text).strip()

# Clean the three text columns in turn.
# NOTE(review): str(x) turns a missing value (NaN) into the literal text 'nan' —
# confirm this is acceptable if any of these columns can be empty.
for text_column in ('article', 'kurz_text', 'haupt_text'):
    df[text_column] = df[text_column].apply(lambda x: data_cleansing(str(x)))
df.head()

Unnamed: 0,category,article,kurz_text,haupt_text,is_leichte_sprache
0,Kultur,Gerhard Richter erklärt Kirchenfenster zu sein...,Im Kloster Tholey werden diese Woche neue Fen...,Seine abstrakten Bilder werden in den wichtig...,0
1,Kultur,Das sind unsere Buchempfehlungen für die Ferien,"Alberne Eltern, fliegende Brötchen, Trolle un...","Egal wie heiß es ist, auf Bücher ist Verlass....",0
2,Kultur,Grauenhafte Leerstelle,"War der Autor des Welterfolgs ""Alice im Wunde...","Unstrittig ist, dass ""Alice im Wunderland"" ni...",0
3,Kultur,HBO Max erteilt Deutschland für 2021 eine Absage,WarnerMedia kommt mit seiner Streamingplattfo...,Deutsche Film- und Serienfans werden auch kün...,0
4,Kultur,Javicia Leslie ist die neue Batwoman,Vor zwei Monaten stieg Ruby Rose überraschend...,Sie wurde unter anderem als erste homosexuell...,0


In [None]:
def preprocess(text):
    """Run the preprocessing pipeline on ``text`` and return the result.

    At present the pipeline consists of ``general_preprocessing`` only.
    """
    return general_preprocessing(text)

def general_preprocessing(text):
    """Return ``text`` unchanged; every normalisation step is currently disabled.

    Steps that are sketched below but switched off: lower-casing, removing
    numbers, removing punctuation and collapsing whitespace.
    """
    # Disabled steps, kept for reference:
    #   lower-case:          text = text.lower()
    #   remove numbers:      text = re.sub(r'\d+', '', text)
    #   remove punctuation:  text = text.translate(str.maketrans('', '', string.punctuation))
    #   collapse whitespace: text = re.sub(r'\s+', ' ', text)
    return text

# Run the preprocessing pipeline over the three text columns in turn.
for text_column in ('article', 'kurz_text', 'haupt_text'):
    df[text_column] = df[text_column].apply(lambda x: preprocess(str(x)))

### Feature Extraction

In [None]:
def feature_extraction(row, nlp):
    """Return the last transformer tensor computed for the row's 'haupt_text'.

    Known problem (author's note): the batch size of the doc vectors is not
    constant — sometimes 1x768, sometimes 2x768, depending on the number of
    batches.

    Args:
        row: Mapping-like row that provides the key 'haupt_text'.
        nlp: Callable pipeline whose result exposes ``_.trf_data.tensors``.
    """
    doc = nlp(row['haupt_text'])
    transformer_output = doc._.trf_data
    return transformer_output.tensors[-1]

# Load the German transformer pipeline once and reuse it for every row.
nlp = spacy.load('de_dep_news_trf')
# feature_extraction is called once per row; `args` forwards the pipeline as
# its second positional argument.
df['feature'] = df.apply(feature_extraction, axis=1, args=(nlp,))
df.head()

In [64]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize(train_data, test_data, vectorizer=None):
    """Fit a text vectorizer on the training texts and transform both splits.

    Args:
        train_data: Iterable of training documents; the vectorizer is fitted
            on these only.
        test_data: Iterable of test documents; transformed with the fitted
            vectorizer, never used for fitting.
        vectorizer: Optional object offering ``fit_transform`` and
            ``transform`` whose results provide ``toarray()``. Defaults to a
            fresh ``TfidfVectorizer()``.

    Returns:
        Tuple ``(train_matrix, test_matrix)`` of dense arrays.
    """
    if vectorizer is None:
        vectorizer = TfidfVectorizer()
    # .toarray() densifies the result; memory grows with documents x vocabulary.
    train_matrix = vectorizer.fit_transform(train_data).toarray()
    test_matrix = vectorizer.transform(test_data).toarray()
    return train_matrix, test_matrix

# 70 % of the rows go to training; the seed keeps the split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    df['haupt_text'],
    df['is_leichte_sprache'],
    train_size=0.7,
    random_state=0,
)
# The vectorizer is fitted on the training texts only.
train_vec, test_vec = vectorize(train_x, test_x)

In [66]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

def vectorize(train_data, test_data, vectorizer=None):
    """Fit a text vectorizer on the training texts and transform both splits.

    NOTE: this definition replaces the TF-IDF based ``vectorize`` from the
    earlier cell once this cell has been run.

    Args:
        train_data: Iterable of training documents; the vectorizer is fitted
            on these only.
        test_data: Iterable of test documents; transformed with the fitted
            vectorizer, never used for fitting.
        vectorizer: Optional object offering ``fit_transform`` and
            ``transform`` whose results provide ``toarray()``. Defaults to a
            fresh ``CountVectorizer()``.

    Returns:
        Tuple ``(train_counts, test_counts)`` of dense arrays.
    """
    if vectorizer is None:
        vectorizer = CountVectorizer()
    # .toarray() densifies the result; memory grows with documents x vocabulary.
    train_counts = vectorizer.fit_transform(train_data).toarray()
    test_counts = vectorizer.transform(test_data).toarray()
    return train_counts, test_counts

# Same 70/30 split with the same seed as in the TF-IDF cell.
train_x, test_x, train_y, test_y = train_test_split(
    df['haupt_text'],
    df['is_leichte_sprache'],
    train_size=0.7,
    random_state=0,
)
# Overwrites train_vec / test_vec produced by the TF-IDF cell.
train_vec, test_vec = vectorize(train_x, test_x)

In [None]:
# bad code

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

#train_x, test_x, train_y, test_y = train_test_split(df['feature'].apply(lambda x: x.squeeze()), df['category'], train_size=0.7, random_state=0)

### Classification

In [67]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# Seeded so that repeated runs report the same metrics; the value matches the
# seed used for the train/test split.
clf = MLPClassifier(random_state=0)
clf.fit(train_vec, train_y)

predictions = clf.predict(test_vec)

# classification_report returns a multi-line string, hence print().
print(classification_report(test_y, predictions))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2160
           1       1.00      0.98      0.99      1381

    accuracy                           0.99      3541
   macro avg       0.99      0.99      0.99      3541
weighted avg       0.99      0.99      0.99      3541



In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Seeded so that repeated runs build the same tree; the value matches the seed
# used for the train/test split.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(train_vec, train_y)

predictions = clf.predict(test_vec)

# classification_report returns a multi-line string, hence print().
print(classification_report(test_y, predictions))