In [1]:
import numpy as np
import pandas as pd

In [2]:
train_df = pd.read_csv('nlp-getting-started/train.csv')

In [3]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
test_df = pd.read_csv('nlp-getting-started/test.csv')

In [5]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [6]:
train_df.set_index('id', inplace=True)

In [7]:
test_df.set_index('id', inplace=True)

In [8]:
all_data = pd.concat([train_df, test_df])

In [9]:
all_data

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1.0
4,,,Forest fire near La Ronge Sask. Canada,1.0
5,,,All residents asked to 'shelter in place' are ...,1.0
6,,,"13,000 people receive #wildfires evacuation or...",1.0
7,,,Just got sent this photo from Ruby #Alaska as ...,1.0
...,...,...,...,...
10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,
10865,,,Storm in RI worse than last hurricane. My city...,
10868,,,Green Line derailment in Chicago http://t.co/U...,
10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...,


In [10]:
train_df.drop(['keyword', 'location'], axis=1, inplace=True)
test_df.drop(['keyword', 'location'], axis=1, inplace=True)

In [11]:
train_df

Unnamed: 0_level_0,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Our Deeds are the Reason of this #earthquake M...,1
4,Forest fire near La Ronge Sask. Canada,1
5,All residents asked to 'shelter in place' are ...,1
6,"13,000 people receive #wildfires evacuation or...",1
7,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...
10869,Two giant cranes holding a bridge collapse int...,1
10870,@aria_ahrary @TheTawniest The out of control w...,1
10871,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
10872,Police investigating after an e-bike collided ...,1


In [12]:
X_train = train_df['text']
X_test = test_df['text']
y_train = train_df['target']

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
vectorizer = TfidfVectorizer()

In [15]:
X_train_vecterized = vectorizer.fit_transform(X_train)

In [16]:
X_test_vecterized = vectorizer.transform(X_test)

In [17]:
X_train_vecterized.shape

(7613, 21637)

In [18]:
X_test_vecterized.shape

(3263, 21637)

In [19]:
from sklearn.svm import SVC

In [20]:
svc = SVC()

In [21]:
svc.fit(X_train_vecterized, y_train)

In [22]:
svc_pred = svc.predict(X_test_vecterized)

In [23]:
svc_pred

array([1, 0, 1, ..., 1, 1, 0])

In [24]:
answers = {'id': test_df.index, 'target': svc_pred}

In [25]:
answers_df = pd.DataFrame(answers)

In [26]:
answers_df

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [27]:
# answers_df.to_csv('answers.csv', sep=',', index=False)

In [28]:
from sklearn.ensemble import GradientBoostingClassifier

In [29]:
gbc = GradientBoostingClassifier()

In [30]:
gbc.fit(X_train_vecterized, y_train)

In [31]:
gbc_pred = gbc.predict(X_test_vecterized)

In [32]:
answers['target'] = gbc_pred

In [33]:
answers_df = pd.DataFrame(answers)

In [34]:
answers_df.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,1
3,9,0
4,11,1


In [35]:
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [36]:
from sklearn.model_selection import GridSearchCV

In [37]:
param_grid = {
    'C': [1.27, 1.3, 1.32],
    'degree':[1]
}

In [38]:
svc_new = SVC()

In [39]:
grid_search = GridSearchCV(svc_new, param_grid=param_grid, n_jobs=5, cv=5)

In [40]:
grid_search.fit(X_train_vecterized, y_train)

In [41]:
grid_search.best_params_

{'C': 1.3, 'degree': 1}

In [42]:
grid_svc = grid_search.best_estimator_

In [43]:
grid_svc_pred = grid_svc.predict(X_test_vecterized)

In [44]:
answers = {'id': test_df.index, 'target': grid_svc_pred}

In [45]:
answers_df = pd.DataFrame(answers)

In [46]:
answers_df.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,0
4,11,1


In [47]:
# answers_df.to_csv('answers_tuned_svm.csv', sep=',', index=False)

Tuned SVM Score == 81.171%

In [48]:
import tensorflow_hub as hub
from tensorflow_text import BertTokenizer

2023-08-24 22:52:14.522627: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-24 22:52:14.523979: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-24 22:52:14.551041: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-24 22:52:14.551642: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [49]:
bert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/4', trainable=True)

2023-08-24 22:52:15.700615: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-24 22:52:15.700929: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [52]:
preprocessor = BertTokenizer(vocab_lookup_table=)

TypeError: BertTokenizer.__init__() missing 1 required positional argument: 'vocab_lookup_table'

In [51]:
del answers, all_data, answers_df, gbc, gbc_pred, svc, svc_new, svc_pred