# Semisupervised Hackathon
## Simple ML Model

## 1. Load the libraries & dataset

In [None]:
from src.data.preprocessing import DataProcessor
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.linear_model import LogisticRegressionCV,SGDClassifier
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.metrics import roc_auc_score 
import pandas as pd 


In [None]:
# 1. Load the train and test splits from CSV.
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# The same two text columns serve as model inputs for both splits.
feature_cols = ['title', 'text']

X_train = train_data[feature_cols]
# Double brackets keep y_train a one-column DataFrame rather than a Series.
y_train = train_data[['label']]

X_test = test_data[feature_cols]

## 2. Create a simple model.

In [None]:
# Run both splits through the same DataProcessor configuration so that
# train and test text is prepared identically.
dp = DataProcessor(replace_digits=True, max_news_size=500)
xtrain = dp.transform(X_train, data_cols=['title', 'text'])
xtest = dp.transform(X_test, data_cols=['title', 'text'])

# Baseline model: TF-IDF features feeding a cross-validated logistic
# regression, chained into a single pipeline.
vec = TfidfVectorizer()
clf = LogisticRegressionCV(max_iter=1000)
pipe = make_pipeline(vec, clf)
pipe.fit(xtrain.news, y_train['label'])

# AUC on the training split. The pipeline was fit on these same rows, so
# this is a sanity check rather than an estimate of held-out performance.
train_probas = pipe.predict_proba(xtrain.news)[:, 1]
train_auc = roc_auc_score(y_train, train_probas)
print(f"Train AUC : {train_auc}")

# Predicted probabilities (second predict_proba column) for the test split.
probas = pipe.predict_proba(xtest.news)[:, 1]

## 3. Create a submission

In [None]:
# Build the submission table: the test ids paired with the predicted
# probabilities, written to CSV without the DataFrame index.
submission_columns = {'id': test_data['id'], 'label': probas}
sample_df = pd.DataFrame(submission_columns)
sample_df.to_csv('data/sample_submission.csv', index=False)

In [None]:
# Notebook shell escape: submit data/sample_submission.csv to the
# "semi-supervised-2" competition using the kaggle command-line tool.
!kaggle competitions submit -c semi-supervised-2 -f data/sample_submission.csv -m "Latest submission"