# Semisupervised Hackathon
## Simple ML Model

## 1. Load the libraries & dataset

In [9]:
from src.data.preprocessing import DataProcessor
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.linear_model import LogisticRegressionCV,SGDClassifier
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.metrics import roc_auc_score 
import pandas as pd 

In [10]:
# Load the labelled training split and the test split that will be scored.
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# Both splits feed the same two input columns to the model; only the
# training split carries the `target` label.
feature_cols = ['keyword', 'text']

X_train = train_data[feature_cols]
X_test = test_data[feature_cols]

# Kept as a single-column DataFrame: later cells read it as `y_train.target`.
y_train = train_data[['target']]

## 2. Create a simple model.

In [11]:
# Preprocess the dataset.
# DataProcessor is project code (src.data.preprocessing). The cells below read
# the `.news` attribute of what it returns -- presumably the cleaned text built
# from the keyword/text columns; confirm against the DataProcessor source.
dp = DataProcessor()
xtrain = dp.transform(X_train, data_cols=['keyword', 'text'])
xtest = dp.transform(X_test, data_cols=['keyword', 'text'])

# 1-D label vector, used for BOTH fitting and scoring so the two calls cannot
# drift apart (previously the scorer received the whole single-column frame).
labels = y_train.target

# Create a simple model: bag-of-words counts over a vocabulary capped at 20
# terms, fed into a logistic regression that tunes its regularisation strength
# with its own internal cross-validation.
vec = CountVectorizer(max_features=20)
clf = LogisticRegressionCV()
pipe = make_pipeline(vec, clf)
pipe.fit(xtrain.news, labels)

# Create predictions (column 1 of predict_proba is the positive class).
train_probas = pipe.predict_proba(xtrain.news)[:, 1]
# NOTE(review): this AUC is measured on the same rows the pipeline was fit on,
# so it reflects fit quality, not generalisation. Validate on held-out data
# before trusting it.
print(f"Train AUC : {roc_auc_score(labels, train_probas)}")
probas = pipe.predict_proba(xtest.news)[:, 1]


Train AUC : 1.0


## 3. Create a submission

In [12]:
# Build the submission table: one row per test id, with the predicted
# positive-class probability as `target`.
sample_df = test_data[['id']].assign(target=probas)

# Persist without the index so the file contains only `id` and `target`.
sample_df.to_csv('data/sample_submission.csv', index=False)

In [14]:
# Upload data/sample_submission.csv to the `semi-supervised-3` Kaggle
# competition through the Kaggle CLI. Requires the `kaggle` command to be
# installed and authenticated; every execution of this cell sends a new
# submission with the message given by -m.
!kaggle competitions submit -c semi-supervised-3 -f data/sample_submission.csv -m "Latest submission"