In [1]:
!pip install --user tensorflow_text

Collecting tensorflow_text
[?25l  Downloading https://files.pythonhosted.org/packages/78/e7/d260e51d44bea241e8eee39d0266df9b33a3d6219ded118a1c81a872e848/tensorflow_text-2.2.0-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 3.4MB/s eta 0:00:01
[?25hCollecting tensorflow<2.3,>=2.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/3d/be/679ce5254a8c8d07470efb4a4c00345fae91f766e64f1c2aece8796d7218/tensorflow-2.2.0-cp36-cp36m-manylinux2010_x86_64.whl (516.2MB)
[K     |████████████████████████████████| 516.2MB 20kB/s s eta 0:00:01     |██████████████████████████▍     | 425.2MB 46.7MB/s eta 0:00:02
Collecting tensorflow-estimator<2.3.0,>=2.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/a4/f5/926ae53d6a226ec0fda5208e0e581cffed895ccc89e36ba76a8e60895b78/tensorflow_estimator-2.2.0-py2.py3-none-any.whl (454kB)
[K     |████████████████████████████████| 460kB 40.8MB/s eta 0:00:01
Collecting h5py<2.11.0,>=2.10.0
[?25l  Downl

Successfully installed astunparse-1.6.3 gast-0.3.3 h5py-2.10.0 scipy-1.4.1 tensorboard-2.2.1 tensorboard-plugin-wit-1.6.0.post3 tensorflow-2.2.0 tensorflow-estimator-2.2.0 tensorflow-text-2.2.0


In [2]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text

from tqdm import tqdm
import numpy as np
import pandas as pd
import re

## Loadind datasets.

In [3]:
train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

## Data cleaning.

In [4]:
def clean(text):
    text = re.sub(r"http\S+", " ", text) # remove urls
    text = re.sub(r"RT ", " ", text) # remove rt
    text = re.sub(r"[^a-zA-Z\'\.\,\d\s]", " ", text) # remove special character except # @ . ,
    text = re.sub(r"[0-9]", " ", text) # remove number
    text = re.sub(r'\t', ' ', text) # remove tabs
    text = re.sub(r'\n', ' ', text) # remove line jump
    text = re.sub(r"\s+", " ", text) # remove extra white space
    text = text.strip()
    return text

In [5]:
train.text = train.text.apply(clean)
test.text = test.text.apply(clean)

## Loading universal sentences encoder.

In [6]:
use = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

## Sentences embedding.

In [7]:
X_train = []
for r in tqdm(train.text.values):
  emb = use(r)
  review_emb = tf.reshape(emb, [-1]).numpy()
  X_train.append(review_emb)

X_train = np.array(X_train)
y_train = train.target.values

X_test = []
for r in tqdm(test.text.values):
  emb = use(r)
  review_emb = tf.reshape(emb, [-1]).numpy()
  X_test.append(review_emb)

X_test = np.array(X_test)

100%|██████████| 7613/7613 [04:55<00:00, 25.79it/s]
100%|██████████| 3263/3263 [02:06<00:00, 25.82it/s]


## Sampling over data train.

In [8]:
train_arrays, test_arrays, train_labels, test_labels = train_test_split(X_train,y_train,test_size=0.05)

## Training svm model.

In [9]:
def svc_param_selection(X, y, nfolds):
    #Cs = [0.001, 0.01, 0.1, 1, 10]
    Cs = [1.070, 1.074, 1.075, 1.1, 1.125]
    #gammas = [0.001, 0.01, 0.1, 1]
    gammas = [2.065,2.075, 2.08]
    param_grid = {'C': Cs, 'gamma' : gammas}
    grid_search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=nfolds, n_jobs=8)
    grid_search.fit(X, y)
    grid_search.best_params_
    return grid_search

model = svc_param_selection(train_arrays,train_labels, 5)

## Best parameters:

In [10]:
model.best_params_

{'C': 1.07, 'gamma': 2.075}

## Predictions over valuation data.

In [11]:
pred = model.predict(test_arrays)

## Accuracy

In [12]:
cm = confusion_matrix(test_labels,pred)
cm

array([[208,  12],
       [ 39, 122]])

In [13]:
accuracy = accuracy_score(test_labels,pred)
accuracy

0.8661417322834646

## Making submission

In [17]:
test_pred = model.predict(X_test)
submission['target'] = test_pred.round().astype(int)
submission.to_csv('submission.csv', index=False)