In [5]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report

In [7]:
tweets = pd.read_csv('processed_train.csv')
test = pd.read_csv('processed_test.csv')
train_features = pd.read_csv('train_complete.csv')
test_features = pd.read_csv('test_complete.csv')

In [8]:
x_train, x_test, y_train, y_test = \
train_test_split(tweets[['text', 'keyword']], tweets['target'], test_size = 0.25, random_state = 123)

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1,2), lowercase=True, stop_words='english', max_features=5000)
train_vectors = vectorizer.fit_transform(x_train['text'])
test_vectors = vectorizer.transform(x_test['text'])

### LIGTH GBM

In [38]:
import lightgbm as lgb
matrix_final = train_vectors.astype('float32')
d_train = lgb.Dataset(matrix_final, label=y_train)

params = {
    'learning_rate' : 0.02,
    'boosting_type' : 'gbdt',
    'objective' : 'binary',
    'metric' : 'binary_logloss',
    'num_leaves' : 50,
    'max_depth' : 3
}

gbm = lgb.train(params, d_train, 5000)

In [14]:
test_final = test_vectors.astype('float32')
y_pred = gbm.predict(test_final)

for i in range (0, len(y_pred)):
    if y_pred[i] >= 0.5:       
        y_pred[i] = 1
    else:  
        y_pred[i] = 0

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_pred, y_test)
print(accuracy)

0.7880580957504034


In [15]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.87      0.82      1055
           1       0.80      0.69      0.74       804

    accuracy                           0.79      1859
   macro avg       0.79      0.78      0.78      1859
weighted avg       0.79      0.79      0.79      1859



### Random Search

In [20]:
from sklearn.model_selection import RandomizedSearchCV

def algorithm_pipeline(X_train_data, X_test_data, y_train_data, y_test_data, 
                       model, param_grid, cv=10, scoring_fit='neg_mean_squared_error',
                       do_probabilities = False):
    
    gs = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid, 
        cv=cv, 
        n_jobs=-1, 
        scoring=scoring_fit,
        verbose=2
    )
    fitted_model = gs.fit(X_train_data, y_train_data)
    
    if do_probabilities:
      pred = fitted_model.predict_proba(X_test_data)
    else:
      pred = fitted_model.predict(X_test_data)
    
    return fitted_model, pred

In [21]:
model = lgb.LGBMClassifier()

param_grid = {
    'n_estimators': [400, 700, 1000],
    'colsample_bytree': [0.7, 0.8],
    'max_depth': [15,20,25],
    'num_leaves': [50, 100, 200],
    'reg_alpha': [1.1, 1.2, 1.3],
    'reg_lambda': [1.1, 1.2, 1.3],
    'min_split_gain': [0.3, 0.4],
    'subsample': [0.7, 0.8, 0.9],
    'subsample_freq': [20]
}

model, pred = algorithm_pipeline(matrix_final, test_final, y_train, y_test, model, param_grid, cv=5, scoring_fit='accuracy')

print(model.best_score_)
print(model.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.1s finished


0.7522869955156951
{'subsample_freq': 20, 'subsample': 0.9, 'reg_lambda': 1.3, 'reg_alpha': 1.2, 'num_leaves': 200, 'n_estimators': 1000, 'min_split_gain': 0.4, 'max_depth': 15, 'colsample_bytree': 0.7}


### With features

In [30]:
all_data = tweets.merge(train_features)
all_data.drop(columns=['keyword', 'location'], inplace=True)
all_data.head(1)

Unnamed: 0,id,text,target,length,avg_word_length,amount_of_words,amount_of_unique_words,sentiment,stopwords_count,punctuation_count,...,l90,l91,l92,l93,l94,l95,l96,l97,l98,l99
0,1,our deeds are the reason of this earthquake m...,1,69,4.384615,13,13,0.2732,6,1,...,-1.2123,0.51573,0.16573,0.67943,0.35327,0.17672,0.25803,0.068445,-1.2016,-0.20168


In [31]:
x_train, x_test, y_train, y_test = \
train_test_split(all_data, tweets['target'], test_size = 0.25, random_state = 123)

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1,2), lowercase=True, stop_words='english', max_features=5000)
train_vectors = vectorizer.fit_transform(x_train['text'])
test_vectors = vectorizer.transform(x_test['text'])

In [35]:
train_array = train_vectors.todense()
train_matrix = pd.DataFrame(train_array)
x_train.reset_index(inplace=True, drop=True)
train_matrix['id'] = x_train['id']
X_train = x_train.merge(train_matrix).drop(columns=['id', 'text', 'target'])

test_array = test_vectors.todense()
test_matrix = pd.DataFrame(test_array)
x_test.reset_index(inplace=True, drop=True)
test_matrix['id'] = x_test['id']
X_test = x_test.merge(test_matrix).drop(columns=['id', 'text', 'target'])

In [36]:
d_train = lgb.Dataset(X_train, y_train)

params : {
    'learning_rate' : 0.02,
    'boosting_type' : 'gbdt',
    'objective' : 'binary',
    'metric' : 'binary_logloss',
    'num_leaves' : 500,
    'max_depth' : 2,
    'max_bin': 1000
}

gbm = lgb.train(params, d_train, 10000)

In [37]:
y_pred = gbm.predict(X_test)

for i in range (0, len(y_pred)):
    if y_pred[i] > 0.5:       
        y_pred[i] = 1
    else:  
        y_pred[i] = 0

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_pred, y_test)
print(accuracy)

0.7945131791285638


### Random search

In [40]:
model = lgb.LGBMClassifier()

param_grid = {
    'n_estimators': [400, 700, 1000],
    'colsample_bytree': [0.7, 0.8],
    'max_depth': [2, 5, 7],
    'num_leaves': [50, 100, 200],
    'reg_alpha': [1.1, 1.2, 1.3],
    'reg_lambda': [1.1, 1.2, 1.3],
    'min_split_gain': [0.3, 0.4],
    'subsample': [0.7, 0.8, 0.9],
    'subsample_freq': [20]
}

model, pred = algorithm_pipeline(X_train, X_test, y_train, y_test, model, param_grid, cv=5, scoring_fit='accuracy')

print(model.best_score_)
print(model.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   50.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.3min finished


0.7800896860986548
{'subsample_freq': 20, 'subsample': 0.9, 'reg_lambda': 1.1, 'reg_alpha': 1.3, 'num_leaves': 100, 'n_estimators': 1000, 'min_split_gain': 0.4, 'max_depth': 2, 'colsample_bytree': 0.8}
