# Ideas

- LabelSpreading instead of LabelPropagation
- active learning layer after label propagation (but how do we score it without using Kaggle?)
- tree model after label propagation? 

# Semisupervised Hackathon
## Simple ML Model

## 1. Load the libraries & dataset

In [None]:
from src.data.preprocessing import DataProcessor, DenseTransformer
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.linear_model import LogisticRegressionCV,SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.metrics import roc_auc_score 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.semi_supervised import LabelPropagation

import numpy as np
import pandas as pd 

from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling

from IPython import display
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Load the three CSV splits: labelled training data, the unlabelled pool
# and the test set.
train_data, unlabelled_data, test_data = (
    pd.read_csv(path)
    for path in ('data/train.csv', 'data/unlabelled_data.csv', 'data/test.csv')
)

# Select the two input columns from each split.
X_train = train_data.loc[:, ['keyword', 'text']]
X_unlabelled = unlabelled_data.loc[:, ['keyword', 'text']]
X_test = test_data.loc[:, ['keyword', 'text']]

# The list selector keeps the target as a one-column DataFrame.
y_train = train_data.loc[:, ['target']]

In [None]:
# Draw 20 random texts from the unlabelled pool and label them by hand:
# each text is printed and the label is read as an integer from stdin.
sample = X_unlabelled.loc[:, 'text'].sample(20)
idx_list = sample.index  # index labels of the drawn rows
label_list = []

for entry in sample.tolist():
    print(entry)
    label_list.append(int(input()))

In [None]:
# Display the index of the rows drawn into `sample`.
idx_list

In [None]:
# Display the labels that were typed in for the sampled rows.
label_list

In [None]:
# Hand-recorded (row index, label) pairs, transposed into the two parallel
# lists `x` (indices) and `y` (labels).
x, y = (
    list(column)
    for column in zip(
        (46, 0), (47, 0), (51, 0), (58, 0),
        (189, 0), (192, 0), (200, 1), (202, 1), (214, 1),
        (850, 1), (855, 1), (861, 1), (862, 1),
        (1529, 0), (1541, 0), (1542, 0),
        (2449, 0), (2477, 1), (2484, 0),
    )
)

In [None]:
# Read the dataset
train_data = pd.read_csv('data/train.csv')
unlabelled_data = pd.read_csv('data/unlabelled_data.csv')
test_data = pd.read_csv('data/test.csv')

X_train = train_data[['keyword', 'text']]
y_train = train_data[['target']]

X_unlabelled = unlabelled_data[['keyword', 'text']]

X_test = test_data[['keyword', 'text']]

# Row positions (used with .iloc below) of the unlabelled samples that were
# labelled by hand, and the labels typed in for them, in the same order.
manually_labelled_idx = [
    206, 1659, 1234, 1793, 557, 1720, 391, 852, 680, 637, 2122,
    1173, 1478, 739, 633, 2147, 18, 1107, 1262, 765,
    45, 1700, 697, 2115, 953, 1865, 823, 605, 1561, 1224, 1294,
    1149, 1211, 2128, 927, 1452, 291, 1185, 842, 423,
]
manually_labelled_y = pd.DataFrame(
    [
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
        1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
    ],
    columns=['target'],
)

# Both lists are typed by hand; fail loudly if they ever get out of step.
if len(manually_labelled_idx) != len(manually_labelled_y):
    raise ValueError(
        f"{len(manually_labelled_idx)} manually labelled rows but "
        f"{len(manually_labelled_y)} labels"
    )

# ignore_index=True gives both frames the same fresh 0..n-1 index. Without it
# the appended feature rows keep their pool index (206, 1659, ...) while the
# appended labels are indexed 0..39, so X and y carry duplicate labels and do
# not align with each other in any index-based pandas operation.
X_train_new = pd.concat(
    (X_train, X_unlabelled.iloc[manually_labelled_idx]), ignore_index=True
)
y_train_new = pd.concat((y_train, manually_labelled_y), ignore_index=True)

In [None]:
# Number of labels after appending the manually labelled rows.
len(y_train_new)

In [None]:
# Shape of the feature frame after appending the manually labelled rows.
X_train_new.shape

## 2. Create a simple model.

In [None]:
%%time
# Preprocess the dataset. 
dp = DataProcessor()
xtrain = dp.transform(X_train,data_cols=['keyword','text']).news
xunlabelled = dp.transform(X_unlabelled,data_cols=['keyword','text']).news
xtest = dp.transform(X_test,data_cols=['keyword','text']).news
ytrain = y_train.target.copy()
yunlabelled = [-1 for _ in range(len(xunlabelled))]

# Create a simple model.
vec = CountVectorizer(max_features=20)

xtrain = vec.fit_transform(xtrain).toarray()
xunlabelled = vec.fit_transform(xunlabelled).toarray()
xtest = vec.transform(xtest).toarray()

In [None]:
# Seed the active learner with the labelled training data.
learner = ActiveLearner(
    estimator=RandomForestClassifier(),
    query_strategy=uncertainty_sampling,
    X_training=xtrain, y_training=ytrain.to_numpy()
)

n_queries = 20
# The pool and its raw texts are kept row-aligned: row k of X_pool is the
# vector for X_pool_text.iloc[k]. This assumes preprocessing preserved the
# row order of the unlabelled data -- TODO confirm.
X_pool_text = unlabelled_data['text'].copy()
X_pool = xunlabelled.copy()

for i in range(n_queries):
    # Ask the learner which pool row(s) to label next.
    query_idx, query_inst = learner.query(X_pool)

    print(X_pool_text.iloc[query_idx].values)

    print("Which target does this have?")
    y_new = np.array([int(input())], dtype=int)
    learner.teach(X_pool[query_idx], y_new)

    # Remove the answered row from the pool *and* from the text series, so it
    # cannot be queried again and the two stay aligned for the next round.
    X_pool = np.delete(X_pool, query_idx, axis=0)
    X_pool_text = X_pool_text.drop(X_pool_text.index[query_idx])

In [None]:
# Keep column 1 of the learner's class-probability output for every test row.
# Presumably this is the probability of target == 1 -- TODO confirm the class
# order via learner.estimator.classes_.
probas = learner.predict_proba(xtest)[:,1]

In [None]:
# Display the predicted probabilities for the test set.
probas

## 3. Create a submission

In [None]:
#Create a sample submission
sample_df = pd.DataFrame({
    'id':test_data.id,
    'target':probas
})
sample_df.to_csv('data/sample_submission.csv',index=False)

In [None]:
# Shell escape: submit data/sample_submission.csv to the "semi-supervised-3"
# competition through the Kaggle command-line tool.
!kaggle competitions submit -c semi-supervised-3 -f data/sample_submission.csv -m "Latest submission"