In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the training set; later cells read its "title" and "target" columns.
train_df = pd.read_csv("train.csv")

In [3]:
# Load the test set; later cells read its "title" and "id" columns and write
# the model's predictions into a "target" column.
test_df = pd.read_csv("test.csv")

In [4]:
# The "title" column is the only model input; "target" is cast to int labels.
# NOTE(review): CountVectorizer is expected to reject missing (NaN) titles —
# confirm neither CSV contains empty title cells.
X_train = train_df["title"].values
X_test = test_df["title"].values
y_train = train_df["target"].astype(int).values

In [5]:
# Token-count (bag-of-words) encoder, constructed with no arguments overridden.
vectorizer = CountVectorizer()

In [6]:
# Learn the vocabulary from every training title and encode them in one pass.
# The fit happens before the K-fold split below, so titles that later land in
# a validation fold also contribute to the vocabulary.
X_train_vectorized = vectorizer.fit_transform(X_train)

In [7]:
# Materialise the five cross-validation folds once so the hyper-parameter
# search can reuse them. With shuffle=False the rows are split in their
# existing order, with no randomisation.
kf = KFold(n_splits=5, shuffle=False)

# Parallel lists with one entry per fold: the *_train_* lists hold the rows
# used for fitting, the *_test_* lists hold that fold's validation rows.
X_train_base = []
X_test_base = []
Y_train_base = []
Y_test_base = []

# kf.split yields (train_index, test_index) positional index arrays; they are
# used to slice both the vectorized feature matrix and the label array.
for train_index, test_index in kf.split(X_train_vectorized):
    X_train_f, X_test_f = X_train_vectorized[train_index], X_train_vectorized[test_index]
    Y_train_f, Y_test_f = y_train[train_index], y_train[test_index]
    X_train_base.append(X_train_f)
    X_test_base.append(X_test_f)
    Y_train_base.append(Y_train_f)
    Y_test_base.append(Y_test_f)

In [8]:
# Hyper-parameter grid for the logistic-regression search.
penalties = ["l1", "l2"]  # penalty types to compare

# Seven values of C spaced one decade apart: 1e-3, 1e-2, ..., 1e3.
C_s = np.logspace(start=-3, stop=3, num=7)

# Collects the per-configuration accuracy lists produced by the search cell.
accuracy_models = list()

In [9]:
%%time

max_score = 0

for c in C_s:
    for penalty in penalties:
        accuracy_model = []
        logreg = LogisticRegression(C=c, penalty=penalty, solver='liblinear')      
        # Train the model
        model = logreg.fit(X_train_base[0], Y_train_base[0])
        # Append to accuracy_model the accuracy of the model
        accuracy_model.append(accuracy_score(Y_test_base[0], model.predict(X_test_base[0]), normalize=True)*100)
        
        print("Score: ", accuracy_score(Y_test_base[0], model.predict(X_test_base[0]), normalize=True)*100, "C = ", c, " penalty = ", penalty)
        
        if(accuracy_score(Y_test_base[0], model.predict(X_test_base[0]), normalize=True)*100 > max_score):
            max_score = accuracy_score(Y_test_base[0], model.predict(X_test_base[0]), normalize=True)*100
            best_score_string = "Best score: ", accuracy_score(Y_test_base[0], model.predict(X_test_base[0]), normalize=True)*100, "C = ", c, " penalty = ", penalty
        
    accuracy_models.append(accuracy_model)

best_score_string

Score:  93.07146552361245 C =  0.001  penalty =  l1
Score:  91.41600768605424 C =  0.001  penalty =  l2
Score:  95.96482152095189 C =  0.01  penalty =  l1
Score:  95.6950705786712 C =  0.01  penalty =  l2
Score:  97.7311359101323 C =  0.1  penalty =  l1
Score:  97.7902594043308 C =  0.1  penalty =  l2
Score:  98.63646441504693 C =  1.0  penalty =  l1
Score:  98.6179883231099 C =  1.0  penalty =  l2
Score:  98.66972138053359 C =  10.0  penalty =  l1
Score:  98.80274924248023 C =  10.0  penalty =  l2
Score:  98.54038873697436 C =  100.0  penalty =  l1
Score:  98.75471140344393 C =  100.0  penalty =  l2
Score:  98.46278915083882 C =  1000.0  penalty =  l1
Score:  98.68819747247062 C =  1000.0  penalty =  l2
Wall time: 11.8 s


('Best score: ', 98.80274924248023, 'C = ', 10.0, ' penalty = ', 'l2')

In [10]:
# Refit on the complete training matrix with the configuration the grid
# search reported as best (C=10, L2 penalty), then predict those same rows.
logreg_tuned = LogisticRegression(
    penalty='l2',
    C=10,
    solver='liblinear',
).fit(X_train_vectorized, y_train)
y_pred = logreg_tuned.predict(X_train_vectorized)

In [11]:
# F1 on the same rows the model was fitted on, so this is an in-sample score
# rather than a held-out estimate.
f1_score(y_train, y_pred)

0.9931042789605228

In [12]:
# Encode the test titles with the vocabulary already fitted on the training set.
X_test_vectorized = vectorizer.transform(X_test)

# Store the predicted labels on the test frame as booleans.
test_df["target"] = logreg_tuned.predict(X_test_vectorized).astype(bool)

# Export only the id/target pair, without the DataFrame index.
test_df.loc[:, ["id", "target"]].to_csv(path_or_buf="ml_baseline.csv", index=False)