<a href="https://colab.research.google.com/github/mkrsteska/BSA2020_Team_Tissot_Project_2/blob/master/code/Universal%20Sentence%20Encoder%20and%20SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**In this notebook we have the model that achieved the best accuracy on Kaggle.**

**The other models we have tried are in separate notebooks, that can be found in the GitHub repository.**

### GitHub repository ###
[GitHub](https://github.com/mkrsteska/BSA2020_Team_Tissot_Project_2)

### Link to our video ###
[Video](https://www.youtube.com/watch?v=WDVuDgq4BTc)

### Kaggle Ranking ###
![Kaggle ranking](https://raw.githubusercontent.com/mkrsteska/BSA2020_Team_Tissot_Project_2/master/data/Kaggle_ranking.png)


In [2]:
import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as hub
from tqdm import tqdm
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from preprocess_tweets import preprocess_tweet_use
from sklearn.model_selection import GridSearchCV

Using TensorFlow backend.


In [0]:
# Read the data 
df_train = pd.read_csv("https://raw.githubusercontent.com/mkrsteska/BSA2020_Team_Tissot_Project_2/master/data/train.csv")
df_train = df_train[["text", "target"]]
df_test = pd.read_csv("https://raw.githubusercontent.com/mkrsteska/BSA2020_Team_Tissot_Project_2/master/data/test.csv")

### Universal Sentence Encoder 


In [0]:
# Load Universal Sentence Encoder
use = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")

In [0]:
# Encoding tweets into embedding vectors using universal sentence encoder

X_train = []
for r in tqdm(df_train.text.values):
  emb = use([r])
  text_emb = tf.reshape(emb, [-1]).numpy()
  X_train.append(text_emb)
  
X_train = np.array(X_train)
y_train = df_train.target.values

100%|██████████| 7613/7613 [03:00<00:00, 42.28it/s]


In [0]:
# Encoding tweets into embedding vectors using universal sentence encoder

X_test = []
for r in tqdm(df_test.text.values):
  emb = use([r])
  text_emb = tf.reshape(emb, [-1]).numpy()
  X_test.append(text_emb)

X_test = np.array(X_test)

100%|██████████| 3263/3263 [01:14<00:00, 43.89it/s]


In [0]:
# Parameter estimation using grid search with cross-validation

Cs = [1, 1.01, 1.03, 1.05, 1.07, 1.09, 1.1, 1.3, 1.5, 1.7, 1.9, 2]
#Cs = [1.070, 1.074, 1.075, 1.1, 1.125]
gammas = [2.01, 2.03, 2.05, 2.07, 2.09]
#gammas = [2.065, 2.075, 2.08]
param_grid = {'C': Cs, 'gamma' : gammas}
grid_search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv = 5, n_jobs=8)
grid_search.fit(X_train, y_train)
grid_search.best_params_

{'C': 1, 'gamma': 2.09}

In [0]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.05, random_state=37)

In [0]:
model_3 = SVC(kernel='rbf', C=1, gamma=2.09, probability=True)

model_3.fit(X_train, y_train)

model_3.score(X_val, y_val)

0.8530183727034121

**Create submission files**

In [0]:
ids = df_test['id'].to_numpy()

In [0]:
predictions_3 = model_3.predict(X_test)
pd.DataFrame({'id': ids, 'target': predictions_3}).to_csv('14. Submission_SVC.csv', index=False)

### Correct Mislabeled Samples

In [0]:
df_mislabeled = df_train.groupby(['text']).nunique().sort_values(by='target', ascending=False)
df_mislabeled = df_mislabeled[df_mislabeled['target'] > 1]['target']

In [0]:
df_train.loc[df_train['text'] == 'like for the music video I want some real action shit like burning buildings and police chases not some weak ben winston shit', 'target_relabeled'] = 0
df_train.loc[df_train['text'] == 'Hellfire is surrounded by desires so be careful and donÛªt let your desires control you! #Afterlife', 'target_relabeled'] = 0
df_train.loc[df_train['text'] == 'To fight bioterrorism sir.', 'target_relabeled'] = 0
df_train.loc[df_train['text'] == '.POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4', 'target_relabeled'] = 1
df_train.loc[df_train['text'] == 'CLEARED:incident with injury:I-495  inner loop Exit 31 - MD 97/Georgia Ave Silver Spring', 'target_relabeled'] = 1
df_train.loc[df_train['text'] == '#foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe and hazardous for #humanconsumption', 'target_relabeled'] = 0
df_train.loc[df_train['text'] == 'In #islam saving a person is equal in reward to saving all humans! Islam is the opposite of terrorism!', 'target_relabeled'] = 0
df_train.loc[df_train['text'] == 'Who is bringing the tornadoes and floods. Who is bringing the climate change. God is after America He is plaguing her\n \n#FARRAKHAN #QUOTE', 'target_relabeled'] = 1
df_train.loc[df_train['text'] == 'RT NotExplained: The only known image of infamous hijacker D.B. Cooper. http://t.co/JlzK2HdeTG', 'target_relabeled'] = 1
df_train.loc[df_train['text'] == "Mmmmmm I'm burning.... I'm burning buildings I'm building.... Oooooohhhh oooh ooh...", 'target_relabeled'] = 0
df_train.loc[df_train['text'] == "wowo--=== 12000 Nigerian refugees repatriated from Cameroon", 'target_relabeled'] = 0
df_train.loc[df_train['text'] == "He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam", 'target_relabeled'] = 0
df_train.loc[df_train['text'] == "Hellfire! We donÛªt even want to think about it or mention it so letÛªs not do anything that leads to it #islam!", 'target_relabeled'] = 0
df_train.loc[df_train['text'] == "The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is by giving half a date in charity.'", 'target_relabeled'] = 0
df_train.loc[df_train['text'] == "Caution: breathing may be hazardous to your health.", 'target_relabeled'] = 1
df_train.loc[df_train['text'] == "I Pledge Allegiance To The P.O.P.E. And The Burning Buildings of Epic City. ??????", 'target_relabeled'] = 0
df_train.loc[df_train['text'] == "#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect", 'target_relabeled'] = 0
df_train.loc[df_train['text'] == "that horrible sinking feeling when youÛªve been at home on your phone for a while and you realise its been on 3G this whole time", 'target_relabeled'] = 0

In [9]:
# Encoding tweets into embedding vectors using universal sentence encoder

X_train = []
for r in tqdm(df_train.text.values):
  emb = use([r])
  text_emb = tf.reshape(emb, [-1]).numpy()
  X_train.append(text_emb)
  
X_train = np.array(X_train)
y_train = df_train.target.values

100%|██████████| 7613/7613 [03:00<00:00, 42.28it/s]


In [10]:
# Encoding tweets into embedding vectors using universal sentence encoder

X_test = []
for r in tqdm(df_test.text.values):
  emb = use([r])
  text_emb = tf.reshape(emb, [-1]).numpy()
  X_test.append(text_emb)

X_test = np.array(X_test)

100%|██████████| 3263/3263 [01:14<00:00, 43.89it/s]


In [0]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.05, random_state=37)

In [12]:
# After correcting the mislabeled samples
model_4 = SVC(kernel='rbf', C=1, gamma=2.09, probability=True)

model_4.fit(X_train, y_train)

model_4.score(X_val, y_val)

0.8530183727034121

**Create a submission file**

In [0]:
predictions_4 = model_4.predict(X_test)
pd.DataFrame({'id': ids, 'target': predictions_4}).to_csv('15. Submission_SVC.csv', index=False)

### Universal Sentence Encoder 
with preprocessed tweets

In [0]:
train_text = df_train.text.apply(preprocess_tweet_use)
test_text = df_test.text.apply(preprocess_tweet_use)
y_train = df_train.target

In [0]:
# Encoding tweets into embedding vectors using universal sentence encoder

X_train = []
for r in tqdm(df.text.values):
  emb = use([r])
  text_emb = tf.reshape(emb, [-1]).numpy()
  X_train.append(text_emb)
X_train = np.array(X_train)
y_train = df.target.values

100%|██████████| 7613/7613 [02:13<00:00, 57.17it/s]


In [0]:
# Encoding tweets into embedding vectors using universal sentence encoder

X_test = []
for r in tqdm(df_test.text.values):
  emb = use([r])
  text_emb = tf.reshape(emb, [-1]).numpy()
  X_test.append(text_emb)

X_test = np.array(X_test)

100%|██████████| 3263/3263 [00:55<00:00, 58.81it/s]


In [0]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.05, random_state=37)

In [0]:
model = SVC(kernel='rbf', C=1, gamma=2.09, probability=True)

model.fit(X_train, y_train)

model.score(X_val, y_val)

0.8535911602209945

In [0]:
model_2 = SVC(kernel='rbf', C=1.07, gamma=2.075, probability=True)

model_2.fit(X_train, y_train)

model_2.score(X_val, y_val)

0.850828729281768

**Create a submission file**

In [0]:
predictions = model.predict(X_test)
pd.DataFrame({'id': ids, 'target': predictions}).to_csv('13. Submission_SVC.csv', index=False)

In [0]:
predictions_2 = model_2.predict(X_test)
pd.DataFrame({'id': ids, 'target': predictions_2}).to_csv('8. Submission_SVC.csv', index=False)