<a href="https://colab.research.google.com/github/mkrsteska/BSA2020_Team_Tissot_Project_2/blob/master/code/Universal%20Sentence%20Encoder%20and%20SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**This notebook contains the model that achieved our best accuracy on Kaggle.**

**The other models we tried are in separate notebooks, which can be found in the GitHub repository.**

### GitHub repository ###
[GitHub](https://github.com/mkrsteska/BSA2020_Team_Tissot_Project_2)

### Link to our video ###
[Video](https://www.youtube.com/watch?v=WDVuDgq4BTc)

### Kaggle Ranking ###
![Kaggle ranking](https://raw.githubusercontent.com/mkrsteska/BSA2020_Team_Tissot_Project_2/master/data/Kaggle_ranking.png)


In [0]:
import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as hub
from tqdm import tqdm
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from preprocess_tweets import preprocess_tweet_use
from sklearn.model_selection import GridSearchCV

## Load Universal Sentence Encoder 


In [0]:
# Fetch the Universal Sentence Encoder (large, version 5) from TensorFlow Hub
USE_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
use = hub.load(USE_MODEL_URL)

In [0]:
# Read the training and test sets directly from the project's GitHub repository
TRAIN_CSV_URL = "https://raw.githubusercontent.com/mkrsteska/BSA2020_Team_Tissot_Project_2/master/data/train.csv"
TEST_CSV_URL = "https://raw.githubusercontent.com/mkrsteska/BSA2020_Team_Tissot_Project_2/master/data/test.csv"

df_train = pd.read_csv(TRAIN_CSV_URL)
df_test = pd.read_csv(TEST_CSV_URL)

## Model #1

In [0]:
# Embed every training tweet with the Universal Sentence Encoder.
# Each tweet is encoded on its own and the encoder output is flattened to a 1-D vector.
X_train = np.array([
  tf.reshape(use([tweet]), [-1]).numpy()
  for tweet in tqdm(df_train.text.values)
])

# Labels, in the same row order as the embeddings
y_train = df_train.target.values

100%|██████████| 7613/7613 [03:00<00:00, 42.28it/s]


In [0]:
# Embed every test tweet with the Universal Sentence Encoder.
# Each tweet is encoded on its own and the encoder output is flattened to a 1-D vector.
X_test = np.array([
  tf.reshape(use([tweet]), [-1]).numpy()
  for tweet in tqdm(df_test.text.values)
])

100%|██████████| 3263/3263 [01:14<00:00, 43.89it/s]


In [0]:
# Hyper-parameter selection for the RBF SVM: exhaustive grid search with 5-fold cross-validation

param_grid = {
  # Alternative (finer) grid kept for reference: [1.070, 1.074, 1.075, 1.1, 1.125]
  'C': [1, 1.01, 1.03, 1.05, 1.07, 1.09, 1.1, 1.3, 1.5, 1.7, 1.9, 2],
  # Alternative (finer) grid kept for reference: [2.065, 2.075, 2.08]
  'gamma': [2.01, 2.03, 2.05, 2.07, 2.09],
}

rbf_svm = SVC(kernel='rbf')
grid_search = GridSearchCV(rbf_svm, param_grid=param_grid, cv=5, n_jobs=8)
grid_search.fit(X_train, y_train)

# Display the best parameter combination found
grid_search.best_params_

{'C': 1, 'gamma': 2.09}

In [0]:
# Hold out 5% of the training data for validation (fixed seed for a reproducible split).
# NOTE: X_train / y_train are overwritten here, so re-running this cell without
# re-running the encoding cell would split the already-reduced training set again.
X_train, X_val, y_train, y_val = train_test_split(
  X_train,
  y_train,
  test_size=0.05,
  random_state=37,
)

In [0]:
# RBF-kernel SVM using the hyper-parameters reported by the grid search (C=1, gamma=2.09).
# Probability estimates are left disabled (the SVC default): this notebook only calls
# `predict` and `score`, and enabling them makes `fit` run an additional internal
# 5-fold cross-validation without changing the predicted labels.
model_3 = SVC(kernel='rbf', C=1, gamma=2.09)

model_3.fit(X_train, y_train)

# Mean accuracy on the held-out validation split
model_3.score(X_val, y_val)

0.8530183727034121

**Create a submission file**

In [0]:
# Test-set ids as a NumPy array; reused by every submission file below
ids = df_test.loc[:, 'id'].to_numpy()

In [0]:
# Predict the test-set labels and write them out as a submission CSV
predictions_3 = model_3.predict(X_test)
submission_3 = pd.DataFrame({'id': ids, 'target': predictions_3})
submission_3.to_csv('14. Submission_SVC.csv', index=False)

**Score** 0.82719

## Model #2

#### Correct the mislabeled samples

In [0]:
# Force the target of the listed training samples to 0.
# This modifies df_train in place, so every later cell sees the corrected labels.
ids_with_target_error = [
  328, 443, 513, 2619, 3640, 3900, 4342, 5781,
  6552, 6554, 6570, 6701, 6702, 6729, 6861, 7226,
]
mislabeled_mask = df_train['id'].isin(ids_with_target_error)
df_train.loc[mislabeled_mask, 'target'] = 0

In [0]:
# Re-embed the training tweets with the Universal Sentence Encoder and
# re-read the labels from df_train.
# Each tweet is encoded on its own and the encoder output is flattened to a 1-D vector.
X_train = np.array([
  tf.reshape(use([tweet]), [-1]).numpy()
  for tweet in tqdm(df_train.text.values)
])

y_train = df_train.target.values

100%|██████████| 7613/7613 [03:01<00:00, 41.89it/s]


In [0]:
# Embed every test tweet with the Universal Sentence Encoder.
# NOTE(review): df_test is not modified anywhere before this cell, so this appears to
# recompute the same embeddings as the Model #1 test-encoding cell - confirm before removing.
X_test = np.array([
  tf.reshape(use([tweet]), [-1]).numpy()
  for tweet in tqdm(df_test.text.values)
])

100%|██████████| 3263/3263 [01:14<00:00, 43.61it/s]


In [0]:
# Hold out 5% of the training data for validation (fixed seed for a reproducible split).
# NOTE: X_train / y_train are overwritten here, so re-running this cell without
# re-running the encoding cell would split the already-reduced training set again.
X_train, X_val, y_train, y_val = train_test_split(
  X_train,
  y_train,
  test_size=0.05,
  random_state=37,
)

In [0]:
# RBF-kernel SVM (C=1, gamma=2.09) fitted on the labels after the mislabeled-sample correction.
# Probability estimates are left disabled (the SVC default): this notebook only calls
# `predict` and `score`, and enabling them makes `fit` run an additional internal
# 5-fold cross-validation without changing the predicted labels.
model_4 = SVC(kernel='rbf', C=1, gamma=2.09)

model_4.fit(X_train, y_train)

# Mean accuracy on the held-out validation split
model_4.score(X_val, y_val)

0.8503937007874016

**Create a submission file**

In [0]:
# Predict the test-set labels and write them out as a submission CSV
predictions_4 = model_4.predict(X_test)
submission_4 = pd.DataFrame({'id': ids, 'target': predictions_4})
submission_4.to_csv('15. Submission_SVC.csv', index=False)

**Score** 0.82719

## Model #3

#### Universal Sentence Encoder with preprocessed tweets

In [0]:
# Run `preprocess_tweet_use` (from the `preprocess_tweets` module) over every tweet
train_text = df_train['text'].apply(preprocess_tweet_use)
test_text = df_test['text'].apply(preprocess_tweet_use)

# Labels as a pandas Series; the encoding cell below reassigns y_train as a NumPy array
y_train = df_train['target']

In [22]:
# Embed the preprocessed training tweets with the Universal Sentence Encoder.
# Each tweet is encoded on its own and the encoder output is flattened to a 1-D vector.
X_train = np.array([
  tf.reshape(use([tweet]), [-1]).numpy()
  for tweet in tqdm(train_text.values)
])

# Labels, in the same row order as the embeddings
y_train = df_train.target.values

100%|██████████| 7613/7613 [02:53<00:00, 43.94it/s]


In [23]:
# Embed the preprocessed test tweets with the Universal Sentence Encoder.
# Each tweet is encoded on its own and the encoder output is flattened to a 1-D vector.
X_test = np.array([
  tf.reshape(use([tweet]), [-1]).numpy()
  for tweet in tqdm(test_text.values)
])

100%|██████████| 3263/3263 [01:13<00:00, 44.48it/s]


In [0]:
# Hold out 5% of the training data for validation (fixed seed for a reproducible split).
# NOTE: X_train / y_train are overwritten here, so re-running this cell without
# re-running the encoding cell would split the already-reduced training set again.
X_train, X_val, y_train, y_val = train_test_split(
  X_train,
  y_train,
  test_size=0.05,
  random_state=37,
)

In [0]:
# RBF-kernel SVM (C=1, gamma=2.09) fitted on embeddings of the preprocessed tweets.
# Probability estimates are left disabled (the SVC default): this notebook only calls
# `predict` and `score`, and enabling them makes `fit` run an additional internal
# 5-fold cross-validation without changing the predicted labels.
model = SVC(kernel='rbf', C=1, gamma=2.09)

model.fit(X_train, y_train)

# Mean accuracy on the held-out validation split
model.score(X_val, y_val)

0.8535911602209945

**Create a submission file**

In [0]:
# Predict the test-set labels and write them out as a submission CSV
predictions = model.predict(X_test)
submission = pd.DataFrame({'id': ids, 'target': predictions})
submission.to_csv('13. Submission_SVC.csv', index=False)

**Score** 0.82617

## Model #4

#### Universal Sentence Encoder with preprocessed tweets and different SVM hyperparameters (C=1.07, gamma=2.075)

In [0]:
# RBF-kernel SVM with C=1.07 and gamma=2.075, fitted on the current X_train / y_train split.
# Probability estimates are left disabled (the SVC default): this notebook only calls
# `predict` and `score`, and enabling them makes `fit` run an additional internal
# 5-fold cross-validation without changing the predicted labels.
model_2 = SVC(kernel='rbf', C=1.07, gamma=2.075)

model_2.fit(X_train, y_train)

# Mean accuracy on the held-out validation split
model_2.score(X_val, y_val)

0.850828729281768

**Create a submission file**

In [0]:
# Predict the test-set labels and write them out as a submission CSV
predictions_2 = model_2.predict(X_test)
submission_2 = pd.DataFrame({'id': ids, 'target': predictions_2})
submission_2.to_csv('8. Submission_SVC.csv', index=False)

**Score** 0.82719