# Semisupervised Hackathon
## Simple ML Model

## 1. Load the libraries & dataset

In [37]:
from src.data.preprocessing import DataProcessor
from src.utils.util_funcs import get_tokens, load_nlp_model
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.linear_model import LogisticRegressionCV,SGDClassifier
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.metrics import roc_auc_score 
from sklearn.semi_supervised import LabelPropagation
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
import pandas as pd 
import spacy
import numpy as np

In [2]:
# Load the NLP model once through the project helper; `process_with_spacy`
# (defined in a later cell) reads it as the module-level name `nlp`.
nlp = load_nlp_model()

In [6]:
def process_with_spacy(df):
    """Add spaCy-derived feature columns to ``df`` and return it.

    Three columns are written onto the frame that is passed in (the frame is
    modified in place and the same object is returned):

    * ``processed_text``         -- ``get_tokens(text, nlp=nlp, vectors=False)``
    * ``processed_text_vectors`` -- ``get_tokens(text, nlp=nlp, vectors=True)``
    * ``avg_vector``             -- element-wise mean (``axis=0``) of
      ``processed_text_vectors``, or NaN when that value is empty.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the three columns added.

    Notes
    -----
    Relies on the module-level ``nlp`` model and the project helper
    ``get_tokens``. The exact return type of ``get_tokens`` is not visible
    here; presumably a list of tokens / a list of per-token vectors --
    confirm against ``src.utils.util_funcs``.
    """
    df['processed_text'] = df['text'].apply(
        lambda text: get_tokens(text, nlp=nlp, vectors=False)
    )
    df['processed_text_vectors'] = df['text'].apply(
        lambda text: get_tokens(text, nlp=nlp, vectors=True)
    )
    # Some tweets yield no usable tokens, so the vector collection is empty and
    # has no mean; those rows get a NaN placeholder that is imputed later.
    # `np.nan` is used instead of the `np.NaN` alias, which NumPy 2.0 removed.
    df['avg_vector'] = df['processed_text_vectors'].apply(
        lambda vectors: np.mean(vectors, axis=0) if vectors else np.nan
    )
    return df

# Load each CSV split and attach the spaCy feature columns in a single step.
train_data = process_with_spacy(pd.read_csv('data/train.csv'))

unlabelled_data = process_with_spacy(pd.read_csv('data/unlabelled_data.csv'))

test_data = process_with_spacy(pd.read_csv('data/test.csv'))

In [51]:
def series_of_list_to_df(series):
    """Stack a Series of equal-length vectors into a 2-D NumPy array.

    Despite the name, the result is a ``numpy.ndarray`` (not a DataFrame) with
    one row per element of ``series``, in the Series' positional order.

    Scalar entries (e.g. a NaN placeholder for a missing vector) are broadcast
    by pandas to a full row of that scalar, so they line up with the real
    vectors and can be imputed afterwards.

    Parameters
    ----------
    series : pandas.Series
        Each element is a 1-D sequence/array of the same length, or a scalar.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(series), vector_length)``.

    Raises
    ------
    ValueError
        Raised by pandas when every element is a scalar, because the vector
        length cannot be inferred.
    """
    # Key the columns by position rather than by index label: with labels as
    # keys, rows sharing an index label overwrote each other in the dict and
    # silently vanished from the output.
    columns_by_position = dict(enumerate(series.values))
    # Each vector becomes one DataFrame column; transposing turns it into a row.
    return pd.DataFrame.from_dict(columns_by_position).to_numpy().T

# Labelled split: averaged-vector features plus the known targets.
X_train = series_of_list_to_df(train_data['avg_vector'])
y_train = train_data['target']

# Unlabelled split: same features, with -1 as the placeholder label on every row.
X_unlabelled = series_of_list_to_df(unlabelled_data['avg_vector'])
y_unlabelled = [-1] * X_unlabelled.shape[0]

# Labelled rows first, then unlabelled rows; the labels follow the same order.
X_mixed = np.concatenate([X_train, X_unlabelled], axis=0)
y_mixed = np.concatenate([y_train, y_unlabelled])

X_test = series_of_list_to_df(test_data['avg_vector'])

## 2. Create a simple model.

In [52]:
# Sanity check: length of the combined label vector (y_train followed by the
# -1 placeholders for the unlabelled rows).
y_mixed.shape

(2178,)

In [60]:
%%time

# Steps in this cell:
#   1. impute the NaN rows left by texts that produced no vectors,
#   2. let LabelPropagation assign a label to every unlabelled (-1) row,
#   3. train the random forest on the full set using those propagated labels,
#   4. score the test set.
imp = SimpleImputer()
lp = LabelPropagation()
# Fixed seed so repeated runs of the notebook give the same forest.
clf = RandomForestClassifier(random_state=42)

X_mixed = imp.fit_transform(X_mixed)
lp.fit(X_mixed, y_mixed)
# `transduction_` holds one label per row of X_mixed, with the -1 placeholders
# replaced by the propagated class.
y_propagated = lp.transduction_

# Train on the propagated labels, NOT on `y_mixed`: `y_mixed` still contains
# the -1 placeholders, so the forest would learn -1 as a class of its own and
# column 1 of `predict_proba` would no longer be the intended class.
clf.fit(X_mixed, y_propagated)

# Reuse the imputer fitted on the training features for the test features.
X_test = imp.transform(X_test)
# Column 1 is the second entry of `clf.classes_`; presumably the positive
# class of a 0/1 target -- confirm against the `target` column.
probas = clf.predict_proba(X_test)[:, 1]

  self.label_distributions_ /= normalizer
CPU times: user 9.38 s, sys: 221 ms, total: 9.6 s
Wall time: 3.43 s


## 3. Create a submission

In [61]:
# Build the submission table (test ids next to their predicted scores) and
# write it to disk without the DataFrame index.
submission_columns = {'id': test_data['id'], 'target': probas}
sample_df = pd.DataFrame(submission_columns)
sample_df.to_csv('data/sample_submission.csv', index=False)

In [None]:
# Upload data/sample_submission.csv to the `semi-supervised-3` competition
# using the Kaggle command-line client.
!kaggle competitions submit -c semi-supervised-3 -f data/sample_submission.csv -m "Latest submission"