# Semisupervised Hackathon
## Simple ML Model

## 1. Load the libraries & dataset

In [None]:
from src.data.preprocessing import DataProcessor
from src.utils.util_funcs import get_tokens, load_nlp_model
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.linear_model import LogisticRegressionCV,SGDClassifier
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.metrics import roc_auc_score 
import pandas as pd 
import spacy
import numpy as np

In [None]:
# Load the NLP model once via the project helper; it is handed to get_tokens
# as `nlp=` for every row processed below.
# NOTE(review): presumably a spaCy pipeline (spacy is imported above) — confirm
# in src.utils.util_funcs.
nlp = load_nlp_model()

In [None]:
# Read the dataset
def _add_text_features(frame):
    """Add the token-based feature columns to *frame* in place and return it.

    Columns added:
      - ``processed_text``: result of ``get_tokens(text, nlp=nlp)``.
      - ``processed_text_vectors``: result of
        ``get_tokens(text, nlp=nlp, vectors=True)``.
      - ``avg_vector``: mean over axis 0 of ``processed_text_vectors``, or NaN
        when that value is empty/falsy.

    NOTE(review): assumes ``get_tokens(..., vectors=True)`` returns a list of
    per-token vectors — confirm in src.utils.util_funcs.
    """
    frame['processed_text'] = frame['text'].apply(lambda x: get_tokens(x, nlp=nlp))
    frame['processed_text_vectors'] = frame['text'].apply(
        lambda x: get_tokens(x, nlp=nlp, vectors=True)
    )
    # In some cases the processed text is empty (tweets are not always
    # well-formed), hence the fallback to NaN. `np.nan` is used instead of the
    # `np.NaN` alias, which was removed in NumPy 2.0.
    frame['avg_vector'] = frame['processed_text_vectors'].apply(
        lambda x: np.mean(x, axis=0) if x else np.nan
    )
    return frame


train_data = _add_text_features(pd.read_csv('data/train.csv'))
test_data = _add_text_features(pd.read_csv('data/test.csv'))

# Single-column frames (double brackets keep them as DataFrames, not Series).
X_train = train_data[['avg_vector']]
y_train = train_data[['target']]

X_test = test_data[['avg_vector']]

## 2. Create a simple model.

In [None]:
# Preprocess the dataset.
# DataProcessor.transform is asked for the 'keyword' and 'text' columns, so it
# is given those columns from the raw frames; X_train / X_test only carry
# 'avg_vector' and do not contain them.
# NOTE(review): assumes transform() reads `data_cols` from the frame it is
# given and returns a frame exposing a `news` text column — confirm in
# src.data.preprocessing.
dp = DataProcessor()
xtrain = dp.transform(train_data[['keyword', 'text']], data_cols=['keyword', 'text'])
xtest = dp.transform(test_data[['keyword', 'text']], data_cols=['keyword', 'text'])

# Create a simple model: token counts restricted to a 20-term vocabulary,
# feeding a logistic regression with built-in cross-validation.
vec = CountVectorizer(max_features=20)
clf = LogisticRegressionCV()
pipe = make_pipeline(vec, clf)
pipe.fit(xtrain.news, y_train.target)

# Create predictions. Column 1 of predict_proba is the probability of the
# second entry in the classifier's classes_ (the positive class for 0/1 targets).
train_probas = pipe.predict_proba(xtrain.news)[:, 1]
print(f"Train AUC : {roc_auc_score(y_train,train_probas)}")
probas = pipe.predict_proba(xtest.news)[:, 1]


## 3. Create a submission

In [None]:
# Build the submission frame: one row per test example, pairing its id with
# the predicted probability, then write it to disk without the index column.
submission_columns = {'id': test_data['id'], 'target': probas}
sample_df = pd.DataFrame(submission_columns)
sample_df.to_csv('data/sample_submission.csv', index=False)

In [None]:
# Shell out to the Kaggle CLI to submit the CSV written above to the
# "semi-supervised-3" competition, with "Latest submission" as the message.
!kaggle competitions submit -c semi-supervised-3 -f data/sample_submission.csv -m "Latest submission"