In [1]:
from utils.data_preparation import prepare_data
from sklearn.pipeline import Pipeline
from sklearn.linear_model import (
    LogisticRegression,
    SGDClassifier,
    RidgeClassifier,
    PassiveAggressiveClassifier,
)
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import pandas as pd

In [2]:
df = pd.read_csv("../data/feature_processed_data.csv")

df.head(5)

Unnamed: 0,blurb,category_name,category_parent_id,category_parent_name,country,creator_id,currency,deadline,goal,id,...,blurb_is_english,name_missing,name_len,name_lang,name_is_english,creator_prev_projects_successful,creator_prev_projects,project_duration_days,usd_goal_fx_log,category_name_reduced
0,A Year of Sanderson: Enjoy books and swag boxe...,Fiction,18,Publishing,US,74501917,USD,2022-03-31,1000000.0,1497949659,...,1,0,7,en,1,1,1,30,13.815512,Fiction
1,Color e-paper smartwatch with up to 7 days of ...,Product Design,7,Design,US,597507018,USD,2015-03-28,500000.0,1799979574,...,1,0,7,en,1,1,1,32,13.122365,Product Design
2,Beginning with The Stormlight Archive and expa...,Tabletop Games,12,Games,US,237961243,USD,2024-08-30,250000.0,7816448,...,1,0,4,en,1,11,11,24,12.42922,Tabletop Games
3,The COOLEST is a portable party disguised as a...,Product Design,7,Design,US,203090294,USD,2014-08-30,50000.0,342886736,...,1,0,8,en,1,0,1,53,10.819798,Product Design
4,Euro-inspired dungeon crawling sequel to the 2...,Tabletop Games,12,Games,US,1350948450,USD,2020-05-01,500000.0,374744378,...,1,0,1,en,1,4,4,31,13.122365,Tabletop Games


In [3]:
numeric_features = [
    "usd_goal_fx_log",
    "creator_prev_projects_successful",
    "creator_prev_projects",
    "project_duration_days",
    "blurb_len",
    "name_len",
]

categorical_features = [
    "category_name_reduced",
    "category_parent_name",
    "country",
    "currency",
    "launched_date_month",
    "deadline_month",
    "blurb_missing",
    "blurb_is_english",
    "name_missing",
    "name_is_english",
]

text_blurb = "blurb"
text_name = "name"

X_train, X_test, y_train, y_test = prepare_data(df, numeric_features, categorical_features, text_blurb, text_name, rand=None)

def fetch_preprocessor(numeric_features, categorical_features, text_blurb, text_name):
    numerical_pipeline = Pipeline([("scaler", StandardScaler())])

    categorical_pipeline = Pipeline(
        [("onehot", OneHotEncoder(handle_unknown="ignore"))]
    )

    text_pipeline = Pipeline(
        [("tfidf", TfidfVectorizer(max_features=5000, stop_words="english"))]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numerical_pipeline, numeric_features),
            ("cat", categorical_pipeline, categorical_features),
            ("blurb", text_pipeline, text_blurb),
            ("name", text_pipeline, text_name),
        ],
        remainder="drop",
    )
    return preprocessor

preprocessor = fetch_preprocessor(
    numeric_features, categorical_features, text_blurb, text_name
)

In [None]:
logreg_param_grid = {
    "clf__C": [0.01, 0.1, 1.0, 3.0, 10.0],
    "clf__penalty": ["l2"],   # saga voisi käyttää myös 'l1' ja 'elasticnet'
    "clf__fit_intercept": [True],
    "clf__max_iter": [200, 500, 1000, 2000],
    "clf__solver": ["lbfgs", "saga"],
    "clf__class_weight": [None, "balanced"]
}

logreg_param_grid_saga_extra = {
    "solver": ["saga"],
    "penalty": ["elasticnet"],
    "C": [0.1, 1.0, 3.0],
    "l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9],
    "max_iter": [500, 1000],
}

sgd_param_grid = {
    "clf__penalty": ["l2", "elasticnet"],
    "clf__alpha": [1e-4, 3e-4, 1e-3, 3e-3],
    "clf__learning_rate": ["optimal", "adaptive"],
    "clf__eta0": [0.001, 0.01, 0.1],
    "clf__l1_ratio": [0.1, 0.3, 0.5],
}

In [9]:
logreg_pipe = Pipeline([
    ("preprocess", preprocessor),
    ("clf", LogisticRegression())
])

grid_logreg = GridSearchCV(
    estimator=logreg_pipe,
    param_grid=logreg_param_grid,
    scoring="f1",
    n_jobs=-1,
    cv=3,
    verbose=2
)

grid_logreg.fit(X_train, y_train)

Fitting 3 folds for each of 80 candidates, totalling 240 fits




0,1,2
,estimator,Pipeline(step...egression())])
,param_grid,"{'clf__C': [0.01, 0.1, ...], 'clf__class_weight': [None, 'balanced'], 'clf__fit_intercept': [True], 'clf__max_iter': [200, 500, ...], ...}"
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'saga'
,max_iter,200


In [15]:
sgd_pipe = Pipeline([
    ("preprocess", preprocessor),
    ("clf", SGDClassifier(loss="hinge"))
])

grid_sgd = GridSearchCV(
    estimator=sgd_pipe,
    param_grid=sgd_param_grid,
    scoring="f1",
    n_jobs=-1,
    cv=3,
    verbose=2
)

grid_sgd.fit(X_train, y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


0,1,2
,estimator,Pipeline(step...lassifier())])
,param_grid,"{'clf__alpha': [0.0001, 0.0003, ...], 'clf__eta0': [0.001, 0.01, ...], 'clf__l1_ratio': [0.1, 0.3, ...], 'clf__learning_rate': ['optimal', 'adaptive'], ...}"
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,loss,'hinge'
,penalty,'l2'
,alpha,0.0001
,l1_ratio,0.3
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [18]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

# LogReg
print("=== Logistic Regression ===")
print("Best params:", grid_logreg.best_params_)
print("Best CV f1:", grid_logreg.best_score_)

best_logreg = grid_logreg.best_estimator_

y_pred = best_logreg.predict(X_test)

# Perusmetriikat
acc = accuracy_score(y_test, y_pred)
f1  = f1_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec  = recall_score(y_test, y_pred)

print("\n=== Testisetin metriikat parhaalla LogReg-mallilla ===")
print(f"Accuracy : {acc:.4f}")
print(f"F1-score : {f1:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")

# Tarkempi raportti
print("\nClassification report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix:")
print(cm)
from sklearn.metrics import roc_auc_score

y_proba = best_logreg.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_proba)
print("LogReg ROC-AUC:", roc_auc)

=== Logistic Regression ===
Best params: {'clf__C': 1.0, 'clf__class_weight': 'balanced', 'clf__fit_intercept': True, 'clf__max_iter': 200, 'clf__penalty': 'l2', 'clf__solver': 'saga'}
Best CV f1: 0.7421510328347812

=== Testisetin metriikat parhaalla LogReg-mallilla ===
Accuracy : 0.7640
F1-score : 0.7425
Precision: 0.7111
Recall   : 0.7769

Classification report:
              precision    recall  f1-score   support

           0       0.81      0.75      0.78     56935
           1       0.71      0.78      0.74     44364

    accuracy                           0.76    101299
   macro avg       0.76      0.77      0.76    101299
weighted avg       0.77      0.76      0.76    101299

Confusion matrix:
[[42931 14004]
 [ 9898 34466]]
LogReg ROC-AUC: 0.8556666360791173


In [17]:
# SGD hinge
print("\n=== SGD (hinge) ===")
print("Best params:", grid_sgd.best_params_)
print("Best CV f1:", grid_sgd.best_score_)

best_sgd = grid_sgd.best_estimator_


y_pred = best_sgd.predict(X_test)

# Perusmetriikat
acc = accuracy_score(y_test, y_pred)
f1  = f1_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec  = recall_score(y_test, y_pred)

print("\n=== Testisetin metriikat parhaalla SGD-mallilla ===")
print(f"Accuracy : {acc:.4f}")
print(f"F1-score : {f1:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")

# Tarkempi raportti
print("\nClassification report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix:")
print(cm)


=== SGD (hinge) ===
Best params: {'clf__alpha': 0.0001, 'clf__eta0': 0.1, 'clf__l1_ratio': 0.3, 'clf__learning_rate': 'optimal', 'clf__penalty': 'l2'}
Best CV f1: 0.7295412191102422

=== Testisetin metriikat parhaalla SGD-mallilla ===
Accuracy : 0.7613
F1-score : 0.7113
Precision: 0.7562
Recall   : 0.6715

Classification report:
              precision    recall  f1-score   support

           0       0.76      0.83      0.80     56935
           1       0.76      0.67      0.71     44364

    accuracy                           0.76    101299
   macro avg       0.76      0.75      0.75    101299
weighted avg       0.76      0.76      0.76    101299

Confusion matrix:
[[47330  9605]
 [14574 29790]]
