# Ideas

- Try `LabelSpreading` instead of `LabelPropagation`.
- Add an active-learning layer after label propagation (open question: how to score without using Kaggle?).
- Tree model after label propagation?

# Semisupervised Hackathon
## Simple ML Model

## 1. Load the libraries & dataset

In [None]:
from src.data.preprocessing import DataProcessor, DenseTransformer
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.linear_model import LogisticRegressionCV,SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.metrics import roc_auc_score 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.semi_supervised import LabelPropagation
import numpy as np
import pandas as pd 

In [None]:
# Load the three CSV splits (labelled train, unlabelled pool, test) in one pass.
train_data, unlabelled_data, test_data = map(
    pd.read_csv,
    ('data/train.csv', 'data/unlabelled_data.csv', 'data/test.csv'),
)

# The same two input columns are selected from every split.
feature_cols = ['keyword', 'text']
X_train = train_data[feature_cols]
X_unlabelled = unlabelled_data[feature_cols]
X_test = test_data[feature_cols]

# Kept as a one-column DataFrame; later cells read it via `y_train.target`.
y_train = train_data[['target']]

## 2. Create a simple model.

In [None]:
%%time
# Preprocess the dataset.
# Each split goes through DataProcessor.transform; only the `.news` attribute of
# the result is used below, as the text documents fed to the vectorizer.
dp = DataProcessor()
xtrain = dp.transform(X_train,data_cols=['keyword','text']).news
xunlabelled = dp.transform(X_unlabelled,data_cols=['keyword','text']).news
xtest = dp.transform(X_test,data_cols=['keyword','text']).news
ytrain = y_train.target.copy()
# scikit-learn's semi-supervised estimators use the label -1 to mark
# samples whose class is unknown.
yunlabelled = np.full(len(xunlabelled), -1)

# Candidate grid for a future GridSearchCV over the random forest (not used yet).
# param_grid = [
#         {'bootstrap': [False, True],
#          'n_estimators': [10, 50],
#          'max_depth': [2, 10],
#          'min_samples_leaf': [2, 10],
#          'min_samples_split': [2, 10]
#         },
#     ]

# Create a simple model.
vec = CountVectorizer(max_features=20)
lp = LabelPropagation()
clf = RandomForestClassifier()
# grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='roc_auc', refit=True)

# Fit the vocabulary exactly once and reuse it for every split. Re-fitting on
# the unlabelled text would give that split (and the test split) a different
# vocabulary, so column j would mean a different token in xtrain than in
# xunlabelled/xtest. Because max_features caps both at the same width, the
# concatenation below would still succeed and hide the mismatch.
xtrain = vec.fit_transform(xtrain).toarray()
xunlabelled = vec.transform(xunlabelled).toarray()
xtest = vec.transform(xtest).toarray()

# Stack labelled and unlabelled rows; the label vector follows the same order.
xmixed = np.concatenate((xtrain, xunlabelled))
ymixed = np.concatenate((ytrain, yunlabelled))

# Propagate the known labels to the unlabelled rows, then replace ymixed with
# the label assigned to every row during fitting.
lp.fit(xmixed, ymixed)
ymixed = lp.transduction_

# NOTE(review): clf is trained on the propagated labels, but the predictions
# below are still taken from lp, so clf currently has no effect on the
# submission. Confirm which model is meant to produce `probas`.
clf.fit(xmixed, ymixed)

# Create predictions
# Column 1 of predict_proba is the second entry of lp.classes_ -- presumably
# the positive class for a 0/1 target; verify against the training labels.
train_probas = lp.predict_proba(xtrain)[:,1]
print(f"Train AUC : {roc_auc_score(ytrain,train_probas)}")
probas = lp.predict_proba(xtest)[:,1]

## 3. Create a submission

In [None]:
# Build the submission table (test-set id alongside its predicted score)
# and write it to disk without the DataFrame index.
submission_columns = {'id': test_data['id'], 'target': probas}
sample_df = pd.DataFrame(submission_columns)
sample_df.to_csv('data/sample_submission.csv', index=False)

In [None]:
!kaggle competitions submit -c semi-supervised-3 -f data/sample_submission.csv -m "Latest submission"