# Modeling

### Poging 2 
K-fold - random state 259

In [None]:
import numpy as np
import pandas as pd
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn import tree
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression, RidgeCV
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    mean_absolute_error,
    r2_score,
    root_mean_squared_error,
)
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold, 
    cross_val_score,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.svm import SVC

In [None]:
# Load the training features and targets from CSV files in the working directory.
# NOTE(review): pd.read_csv returns DataFrames, not 1-D Series. The column
# layout of these files is not visible here; if they were written with
# to_csv(index=True) they presumably carry an extra "Unnamed: 0" column —
# confirm before passing them to estimators that expect 1-D text/labels.
x_train = pd.read_csv("xtrain.csv")
y_train = pd.read_csv("ytrain.csv")

## Preprocessor

### Stopwords / data cleaning


In [None]:
# Base vocabulary: NLTK's English stop-word list.
stop_words = set(stopwords.words("english"))

# Additional filler words to drop on top of the NLTK list.
# Each entry appears exactly once (the earlier literal repeated "got" and
# "still", which a set silently collapses anyway).
# NOTE(review): data_preprocessor strips punctuation before tokenizing, so
# entries containing an apostrophe (e.g. "n't", and NLTK's "don't") presumably
# never match a token — confirm whether apostrophe-free variants are wanted.
# data_tokenizer also discards tokens of length <= 2, which already covers
# "n", "s" and "u" there.
extra_stopwords = {
    "call", "either", "fact", "gave", "going", "got",
    "help", "n", "n't", "need", "said", "say",
    "see", "s", "still", "tell", "told", "trying",
    "u", "use", "way", "well",
}

# Merge in place so `stop_words` is the single set used for filtering.
stop_words.update(extra_stopwords)

In [None]:
# Single WordNet lemmatizer instance, reused by data_tokenizer for every token.
lemmatizer = WordNetLemmatizer()

# Matches any single character that is neither a word character nor whitespace.
_PUNCTUATION_RE = re.compile(r"[^\w\s]")


def remove_punctuation(text: str) -> str:
    """Return *text* with every non-word, non-whitespace character removed.

    Word characters (letters, digits, underscore) and whitespace are kept
    untouched; nothing is inserted in place of the removed characters.
    """
    stripped = _PUNCTUATION_RE.sub("", text)
    return stripped

def has_multiple_x(word: str) -> bool:
    """Return True when *word* contains more than one 'x' (case-insensitive)."""
    lowered = word.lower()
    x_count = lowered.count("x")
    return x_count >= 2

def data_preprocessor(text: str) -> str:
    """Normalize a raw document: lowercase it, then strip punctuation.

    Used as the ``preprocessor`` callable of the TF-IDF vectorizer, so it
    receives one document at a time and returns the cleaned string.
    """
    return remove_punctuation(text.lower())

def data_tokenizer(text: str):
    """Split *text* into tokens, drop unwanted ones, and lemmatize the rest.

    A token is discarded when it is a stop word, contains more than one 'x',
    is two characters or shorter, or consists only of digits. The surviving
    tokens are lemmatized and returned as a list, in their original order.
    """
    kept = []
    for token in word_tokenize(text):
        # Stop words and tokens with repeated 'x' are skipped first.
        if token in stop_words or has_multiple_x(token):
            continue
        # Drop very short junk tokens and purely numeric tokens.
        if len(token) <= 2 or token.isdigit():
            continue
        kept.append(lemmatizer.lemmatize(token))
    return kept

### Pipeline

In [None]:
# Text-classification pipeline: TF-IDF features feeding a linear-kernel SVM.
#
# The vectorizer delegates all text handling to the custom callables:
#   * preprocessor=data_preprocessor lowercases and strips punctuation, so the
#     built-in lowercasing is switched off;
#   * tokenizer=data_tokenizer does its own stop-word filtering, so the
#     built-in stop_words option stays None;
#   * token_pattern=None because the regex pattern is unused once a custom
#     tokenizer is supplied.
#
# NOTE(review): probability=True makes SVC run an additional internal
# cross-validation to calibrate probabilities. Accuracy scoring does not need
# it — consider dropping it if predict_proba is not used elsewhere.
pipeline = Pipeline(
    steps=[
        (
            "tfidf",
            TfidfVectorizer(
                analyzer="word",
                preprocessor=data_preprocessor,
                tokenizer=data_tokenizer,
                token_pattern=None,
                stop_words=None,
                lowercase=False,
            ),
        ),
        ("clf", SVC(probability=True, kernel="linear")),
    ]
)

### K-Fold

In [None]:
def _single_column(frame, role):
    """Return the one data column of *frame* as a 1-D Series.

    Columns whose name starts with "Unnamed:" (what pandas assigns to an index
    column that was written to the CSV) are ignored. Anything that is not a
    DataFrame is returned unchanged.

    Raises:
        ValueError: if zero or several data columns remain, because picking
            one silently could train on the wrong column.
    """
    if not isinstance(frame, pd.DataFrame):
        return frame
    data_columns = [
        col for col in frame.columns if not str(col).startswith("Unnamed:")
    ]
    if len(data_columns) != 1:
        raise ValueError(
            f"Expected exactly one {role} column, found {list(frame.columns)}; "
            "select the intended column explicitly."
        )
    return frame[data_columns[0]]


# x_train / y_train come straight from pd.read_csv and are DataFrames.
# TfidfVectorizer needs an iterable of strings (iterating a DataFrame yields
# its column labels, not its rows) and StratifiedKFold needs 1-D labels.
# NOTE(review): the "TypeError: '<' not supported between instances of 'str'
# and 'int'" seen with the 2-D inputs presumably came from an integer index
# column sitting next to string labels — confirm against the CSV files. If a
# single label column itself mixes ints and strings, cast it with .astype(str).
texts = _single_column(x_train, "text").fillna("").astype(str)
labels = _single_column(y_train, "label")

# Stratified 5-fold split with a fixed seed so the folds are reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=259)

scores = cross_val_score(
    pipeline,
    texts,
    labels,
    cv=cv,
    scoring="accuracy",
)

print("CV accuracy per fold:", scores)
print("Gemiddelde accuracy:", scores.mean())

TypeError: '<' not supported between instances of 'str' and 'int'