In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import re
import string

%matplotlib inline

In [2]:
# Load the train and test tables (parquet) and the sample submission (CSV) from ./data.
train_df = pd.read_parquet('./data/train.parquet')
test_df = pd.read_parquet('./data/test.parquet')
# NOTE(review): `submission` is rebound to a newly built DataFrame near the end of
# the notebook before this frame is ever read — presumably loaded only for
# reference; confirm whether it is still needed.
submission = pd.read_csv('./data/sample_submission.csv')

In [3]:
# Model input is the `Title` column; the supervised target is the `target` column.
data = train_df['Title']
labels = train_df['target']

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation. The split is stratified on the labels
# and uses a fixed seed so it is repeatable across runs.
x_train, x_val, y_train, y_val = train_test_split(
    data,
    labels,
    test_size=0.1,
    stratify=labels,
    random_state=0,
)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training split only (the validation split is
# not shown to it) and vectorize that same split in one step.
tfidf_transformer = TfidfVectorizer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train)

In [7]:
# Vectorize the validation titles with the already-fitted vectorizer
# (transform only — no re-fit on validation data).
x_val_tfidf = tfidf_transformer.transform(x_val)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# Sweep the regularization parameter C: fit a one-vs-rest logistic regression on
# the training split for each value and report validation accuracy and ROC-AUC.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm against the version this notebook is pinned to.
for C in (1e-2, 1e-1, 1, 10, 100):
    lr = LogisticRegression(C=C, max_iter=1000, multi_class='ovr', random_state=0)
    lr.fit(x_train_tfidf, y_train)

    # Class probabilities feed ROC-AUC; hard label predictions feed accuracy.
    prob = lr.predict_proba(x_val_tfidf)
    prediction = lr.predict(x_val_tfidf)

    print(f'C = {C}')
    print(f'\taccuracy: {accuracy_score(y_val, prediction)}')
    print(f'\troc-auc: {roc_auc_score(y_val, prob, multi_class="ovr")}')

C = 0.01
	accuracy: 0.5627083333333334
	roc-auc: 0.7560393880208333
C = 0.1
	accuracy: 0.5972916666666667
	roc-auc: 0.7846731770833334
C = 1
	accuracy: 0.6154166666666666
	roc-auc: 0.8010085937500001
C = 10
	accuracy: 0.6104166666666667
	roc-auc: 0.7933137369791666
C = 100
	accuracy: 0.5925
	roc-auc: 0.7728712239583334


In [10]:
# Re-fit the vectorizer on every training title (train + validation rows) for the
# final model. This rebinds `tfidf_transformer`, so the vectorizer fitted on the
# training split alone is no longer reachable under that name.
tfidf_transformer = TfidfVectorizer()
x_tfidf = tfidf_transformer.fit_transform(data)

In [11]:
# Vectorize the test titles with whichever vectorizer `tfidf_transformer`
# currently refers to (transform only — nothing is fitted on the test set).
x_test_tfidf = tfidf_transformer.transform(test_df['Title'])

In [12]:
# Final model, trained on the full training set.
# NOTE(review): C=5 is not one of the values scored in the validation sweep
# (0.01, 0.1, 1, 10, 100), where C=1 gave the best recorded accuracy and
# ROC-AUC — confirm this choice is intentional.
lr = LogisticRegression(C=5, max_iter=1000, multi_class='ovr')
lr.fit(x_tfidf, labels)

In [13]:
# Predicted class label for every vectorized test title.
prediction = lr.predict(x_test_tfidf)

In [14]:
# Assemble the submission table: one row per test row, with the test frame's
# index as `Id` and the model's label as `Predicted`.
submission = pd.DataFrame(
    {
        'Id': test_df.index,
        'Predicted': prediction,
    }
)

In [15]:
from pathlib import Path

# Write the submission CSV without the DataFrame index column.
# The output directory is created first: `to_csv` does not create missing parent
# directories, so on a fresh checkout this final cell would otherwise fail after
# all the training above has already run.
submission_path = Path('submissions/baseline1.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)