In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Base location of the competition CSV files in the project repository.
DATA_URL = "https://raw.githubusercontent.com/DLaux/BSA2020_Team_Tissot_Project_2/master/data/"


def _load_tweets(url):
    """Read one tweet CSV from ``url`` with every column loaded as text.

    All columns are read with ``dtype='unicode'``, so numeric-looking columns
    (for example the label) come back as strings.

    The former ``infer_datetime_format=True`` argument is intentionally not
    passed: it is deprecated since pandas 2.0 (where it has no effect) and only
    influenced date-parsing speed before that.
    """
    return pd.read_csv(
        url,
        encoding='utf_8',
        dtype='unicode',
        parse_dates=True,
        low_memory=False,
    )


df_train = _load_tweets(DATA_URL + "train.csv")
df_test = _load_tweets(DATA_URL + "test.csv")

## Meta features 

In [3]:
import nltk
from nltk.corpus import stopwords

STOPWORDS = stopwords.words('english')
# Frozen copy used for O(1) membership tests; STOPWORDS itself stays a list.
_STOPWORD_SET = frozenset(STOPWORDS)


def _mean_word_length(words):
    """Return the average token length, or 0.0 when there are no tokens.

    Guarding the empty case avoids ``np.mean([])``, which yields NaN and a
    RuntimeWarning for whitespace-only texts.
    """
    return np.mean([len(w) for w in words]) if words else 0.0


def add_meta_features(frame):
    """Add five text statistics computed from ``frame['text']``, in place.

    Columns added (in this order): ``word_count``, ``mean_word_length``,
    ``unique_word_count``, ``char_count``, ``stop_word_count``.
    Every value is passed through ``str`` first, so a missing text is measured
    as the literal string ``'nan'``.
    """
    texts = frame['text'].map(str)
    tokens = texts.map(str.split)

    frame['word_count'] = tokens.map(len)
    frame['mean_word_length'] = tokens.map(_mean_word_length)
    frame['unique_word_count'] = tokens.map(lambda words: len(set(words)))
    frame['char_count'] = texts.map(len)
    # Stop words are matched case-insensitively on whitespace-separated tokens.
    frame['stop_word_count'] = texts.map(
        lambda text: sum(w in _STOPWORD_SET for w in text.lower().split())
    )


add_meta_features(df_train)
add_meta_features(df_test)

In [4]:
# Preview the first three training rows, including the new meta-feature columns.
df_train.head(3)

Unnamed: 0,id,keyword,location,text,target,word_count,mean_word_length,unique_word_count,char_count,stop_word_count
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,13,4.384615,13,69,6
1,4,,,Forest fire near La Ronge Sask. Canada,1,7,4.571429,7,38,0
2,5,,,All residents asked to 'shelter in place' are ...,1,22,5.090909,20,133,11


## Prepare data

In [5]:
from sklearn.preprocessing import LabelEncoder, normalize
from sklearn import utils
from sklearn.model_selection import train_test_split

from preprocess_tweets import preprocess_tweet, remove_stopwords

Using TensorFlow backend.


In [6]:
# Clean every tweet with the project helpers from preprocess_tweets, then drop
# stop words. The text column is overwritten in place, so re-running this cell
# applies the helpers to already-processed text.
# NOTE(review): the exact normalisation done by preprocess_tweet is defined in
# preprocess_tweets.py and is not visible here.
df_train.text = df_train.text.apply(preprocess_tweet).apply(remove_stopwords)
df_test.text  = df_test.text.apply(preprocess_tweet).apply(remove_stopwords)

In [7]:
# Feature frame: everything except the label and the row identifier.
X = df_train.drop(columns=["target", "id"])
# Label column, still holding the raw strings read from the CSV.
y = df_train["target"]

In [8]:
# Copy of the five numeric meta-feature columns, used as input for scaling.
meta = X.loc[:, [
    "word_count",
    "mean_word_length",
    "unique_word_count",
    "char_count",
    "stop_word_count",
]]

In [9]:
# Replace the raw meta features with their normalised values.
# NOTE(review): sklearn.preprocessing.normalize defaults to norm='l2', axis=1,
# i.e. each ROW is scaled to unit length rather than each column. That is why
# char_count (the largest raw value) ends up close to 1 in every row. Confirm
# that per-sample scaling, not per-feature scaling, is what is intended.
X[["word_count", "mean_word_length", "unique_word_count", "char_count", "stop_word_count"]] = normalize(meta)

In [10]:
# Force the three textual columns to plain strings; missing values become
# the literal 'nan', which the vectorizers can then tokenize.
for text_column in ("keyword", "location", "text"):
    X[text_column] = X[text_column].astype("str")

In [11]:
# Sanity check: first rows, a separator line, then the column dtypes.
print(X.head(4), "-"*70, X.dtypes, sep="\n")

  keyword location                                               text  \
0     nan      nan       deeds reason earthquake may allah forgive us   
1     nan      nan              forest fire near la ronge sask canada   
2     nan      nan  residents asked shelter place notified officer...   
3     nan      nan  <number> people receive wildfires evacuation o...   

   word_count  mean_word_length  unique_word_count  char_count  \
0    0.181076          0.061073           0.181076    0.961097   
1    0.177065          0.115634           0.177065    0.961210   
2    0.160794          0.037209           0.146177    0.972075   
3    0.120539          0.107355           0.120539    0.979382   

   stop_word_count  
0         0.083574  
1         0.000000  
2         0.080397  
3         0.015067  
----------------------------------------------------------------------
keyword               object
location              object
text                  object
word_count           float64
mean_word_l

In [12]:
# Encode the string labels as integers; the fitted encoder is kept so the
# original class names can be looked up through lab_enc.classes_.
lab_enc = LabelEncoder().fit(y)
encoded_y = lab_enc.transform(y)

In [13]:
# Show the original label values, in the order of their integer codes.
list(lab_enc.classes_)

['0', '1']

In [14]:
# Hold out 20% of the labelled data for evaluation. The split is stratified
# on the label and seeded so it can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    encoded_y,
    test_size=0.2,
    random_state=72,
    stratify=encoded_y,
)

Tokenization

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
# Tweet text: ignore tokens found in fewer than 2 documents or in more than
# 75% of them.
count_vect_text = CountVectorizer(encoding='utf-8', min_df=2, max_df=0.75)
X_train_text_counts = count_vect_text.fit_transform(X_train["text"])

# No min_df / max_df for the remaining features: they are single words.
count_vect_location = CountVectorizer(encoding='utf-8')
X_train_location_counts = count_vect_location.fit_transform(X_train["location"].astype(str))

count_vect_keyword = CountVectorizer(encoding='utf-8')
X_train_keyword_counts = count_vect_keyword.fit_transform(X_train["keyword"].astype(str))

TF IDF

In [17]:
from sklearn.feature_extraction.text import TfidfTransformer

# One transformer per feature. Re-fitting a single shared instance would
# overwrite the IDF weights learned for the previous feature, leaving no
# fitted transformer that could be applied to new text or location counts.
tfidf_text = TfidfTransformer()
tfidf_location = TfidfTransformer()
tfidf_keyword = TfidfTransformer()

X_train_text_tfidf = tfidf_text.fit_transform(X_train_text_counts)
X_train_location_tfidf = tfidf_location.fit_transform(X_train_location_counts)
X_train_keyword_tfidf = tfidf_keyword.fit_transform(X_train_keyword_counts)

# Backward-compatible alias: the shared transformer used to end up fitted on
# the keyword counts, so the old name keeps pointing at that state.
tfidf_transformer = tfidf_keyword


## Keyword 

### Model selection

In [18]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier

# Candidate models compared on the keyword feature.
# Disabled candidates: KNeighborsClassifier, SVC, NuSVC, RandomForestClassifier,
# AdaBoostClassifier, GradientBoostingClassifier, MultinomialNB.
classifiers = [
    DecisionTreeClassifier(random_state=42),
    SGDClassifier(random_state=42),
    MLPClassifier(random_state=42),
]

for classifier in classifiers:
    # Bag of words -> TF-IDF -> SMOTE resampling -> candidate classifier.
    pipe = Pipeline(steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', classifier),
    ])
    pipe.fit(X_train["keyword"], y_train)

    print(classifier)
    print("model score: %.3f" % pipe.score(X_test["keyword"], y_test))

    predictions = pipe.predict(X_test["keyword"])
    print(classification_report(y_test, predictions))
    print(confusion_matrix(y_test, predictions))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')
model score: 0.733
              precision    recall  f1-score   support

           0       0.77      0.75      0.76       869
           1       0.68      0.71      0.70       654

    accuracy                           0.73      1523
   macro avg       0.73      0.73      0.73      1523
weighted avg       0.73      0.73      0.73      1523

[[652 217]
 [190 464]]
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_i

### hyperparameter optimization

In [19]:
from sklearn.model_selection import GridSearchCV

In [20]:
# Keyword model: decision tree on TF-IDF keyword tokens, with SMOTE resampling.
keyword_dtc = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=42)),
    ('dtc', DecisionTreeClassifier(random_state=42)),
])
keyword_dtc.fit(X_train["keyword"], y_train)

# Accuracy on the held-out split.
print("model score: %.3f" % keyword_dtc.score(X_test["keyword"], y_test))

model score: 0.733


In [21]:
# Per-class metrics followed by the confusion matrix for the keyword model.
predictions = keyword_dtc.predict(X_test["keyword"])
print(
    classification_report(y_test, predictions),
    confusion_matrix(y_test, predictions),
    sep="\n",
)

              precision    recall  f1-score   support

           0       0.77      0.75      0.76       869
           1       0.68      0.71      0.70       654

    accuracy                           0.73      1523
   macro avg       0.73      0.73      0.73      1523
weighted avg       0.73      0.73      0.73      1523

[[652 217]
 [190 464]]


In [22]:
# Search grid for the decision tree inside keyword_dtc.
# Float values of min_samples_leaf are fractions of the training samples.
max_depth = [None, 85, 86, 87, 88, 89, 90]
min_samples_leaf = [0.001, 0.01, 0.1]
min_samples_split = [2, 3, 4]

parameters = {
    "dtc__max_depth": max_depth,
    "dtc__min_samples_split": min_samples_split,
    "dtc__min_samples_leaf": min_samples_leaf,
}

# cv is pinned to 3 folds to match every other grid search in this notebook.
# Leaving it unset makes the result depend on the installed scikit-learn
# version, because the default changed from 3-fold to 5-fold in 0.22.
CV = GridSearchCV(keyword_dtc, parameters, cv=3, n_jobs=2)

CV.fit(X_train.keyword, y_train)
print(CV.best_params_)
print(CV.best_score_)

{'dtc__max_depth': 85, 'dtc__min_samples_leaf': 0.001, 'dtc__min_samples_split': 2}
0.7254515599343185


## location

### Model selection

In [23]:
# Candidate models compared on the location feature.
# Disabled candidates: KNeighborsClassifier, NuSVC, DecisionTreeClassifier,
# RandomForestClassifier, MultinomialNB, SGDClassifier, MLPClassifier.
classifiers = [
    SVC(random_state=42),
    AdaBoostClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
]

for classifier in classifiers:
    # Bag of words -> TF-IDF -> SMOTE resampling -> candidate classifier.
    pipe = Pipeline(steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', classifier),
    ])
    pipe.fit(X_train["location"], y_train)

    print(classifier)
    print("model score: %.3f" % pipe.score(X_test["location"], y_test))

    predictions = pipe.predict(X_test["location"])
    print(classification_report(y_test, predictions))
    print(confusion_matrix(y_test, predictions))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=42,
    shrinking=True, tol=0.001, verbose=False)
model score: 0.579
              precision    recall  f1-score   support

           0       0.58      0.96      0.72       869
           1       0.58      0.07      0.13       654

    accuracy                           0.58      1523
   macro avg       0.58      0.52      0.42      1523
weighted avg       0.58      0.58      0.47      1523

[[836  33]
 [608  46]]
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=42)
model score: 0.581
              precision    recall  f1-score   support

           0       0.58      0.94      0.72       869
           1       0.56      0.11      0.18       654

    accuracy                           0.58      1523
   macro avg       0.

Note that the f1-score for target 1 is very low across all models.

TOP 3:

AdaBoostClassifier | SVC | GradientBoostingClassifier |
--- |--- |--- |
0.581 | 0.579 |0.576 |

### Hyperparameter optimization

In [24]:
# Location model: AdaBoost on TF-IDF location tokens, with SMOTE resampling.
location_abc = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=42)),
    ('abc', AdaBoostClassifier(random_state=42)),
])
location_abc.fit(X_train["location"], y_train)

# Accuracy on the held-out split.
print("model score: %.3f" % location_abc.score(X_test["location"], y_test))

model score: 0.581


In [25]:
# Candidate ensemble sizes for the AdaBoost step of location_abc.
n_estimators = [279, 290, 291]
parameters = {"abc__n_estimators": n_estimators}

# 3-fold cross-validated search over the grid, using all available cores.
CV = GridSearchCV(estimator=location_abc, param_grid=parameters, cv=3, n_jobs=-1)
CV.fit(X_train["location"], y_train)

print(CV.best_params_)
print(CV.best_score_)

{'abc__n_estimators': 290}
0.5830870279146141


In [26]:
# Score the grid search's selected location model on the held-out split,
# then print its per-class metrics.
print("model score: %.3f" % CV.score(X_test["location"], y_test))

predictions = CV.predict(X_test["location"])
print(classification_report(y_test, predictions))

model score: 0.529
              precision    recall  f1-score   support

           0       0.61      0.49      0.54       869
           1       0.46      0.58      0.51       654

    accuracy                           0.53      1523
   macro avg       0.53      0.53      0.53      1523
weighted avg       0.54      0.53      0.53      1523



## Content of the tweet 

### Model selection

In [27]:
# Candidate models compared on the tweet text.
# Disabled candidates: KNeighborsClassifier, SVC, DecisionTreeClassifier,
# RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier,
# MLPClassifier.
classifiers = [
    NuSVC(probability=True, random_state=42),
    MultinomialNB(),
    SGDClassifier(random_state=42),
]

for classifier in classifiers:
    # Bag of words -> TF-IDF -> SMOTE resampling -> candidate classifier.
    pipe = Pipeline(steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', classifier),
    ])
    pipe.fit(X_train["text"], y_train)

    print(classifier)
    print("model score: %.3f" % pipe.score(X_test["text"], y_test))

    predictions = pipe.predict(X_test["text"])
    print(classification_report(y_test, predictions))
    print(confusion_matrix(y_test, predictions))

NuSVC(cache_size=200, class_weight=None, coef0=0.0,
      decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
      kernel='rbf', max_iter=-1, nu=0.5, probability=True, random_state=42,
      shrinking=True, tol=0.001, verbose=False)
model score: 0.777
              precision    recall  f1-score   support

           0       0.75      0.91      0.82       869
           1       0.84      0.60      0.70       654

    accuracy                           0.78      1523
   macro avg       0.79      0.75      0.76      1523
weighted avg       0.79      0.78      0.77      1523

[[792  77]
 [263 391]]
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
model score: 0.798
              precision    recall  f1-score   support

           0       0.81      0.84      0.83       869
           1       0.78      0.74      0.76       654

    accuracy                           0.80      1523
   macro avg       0.79      0.79      0.79      1523
weighted avg       0.80      0.80

### Hyperparameter optimization 

####  MultinomialNB

In [28]:
# Text model: multinomial naive Bayes on TF-IDF tokens, with SMOTE resampling.
text_mnb = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=42)),
    ('mnb', MultinomialNB()),
])
text_mnb.fit(X_train["text"], y_train)

# Accuracy on the held-out split.
print("model score: %.3f" % text_mnb.score(X_test["text"], y_test))

model score: 0.798


In [29]:
# Smoothing strengths to try for the naive Bayes step of text_mnb.
parameters = {'mnb__alpha': [1, 1e-1, 1e-2]}

# 3-fold cross-validated search over the grid, using all available cores.
CV = GridSearchCV(estimator=text_mnb, param_grid=parameters, cv=3, n_jobs=-1)
CV.fit(X_train["text"], y_train)

print(CV.best_params_)
print(CV.best_score_)
print("model score: %.3f" % CV.score(X_test["text"], y_test))

{'mnb__alpha': 1}
0.7798029556650247
model score: 0.798


#### SGDClassifier

In [30]:
# Alternative text model: SGD-trained linear classifier on TF-IDF tokens,
# with SMOTE resampling.
text_sgdc = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=42)),
    ('sgdc', SGDClassifier(random_state=42)),
])
text_sgdc.fit(X_train["text"], y_train)

# Accuracy on the held-out split.
print("model score: %.3f" % text_sgdc.score(X_test["text"], y_test))

model score: 0.793


In [31]:
parameters = {
    'sgdc__alpha': [1e-5, 1e-4, 1e-3], # learning rate
    'sgdc__max_iter': [19, 20, 21], # number of epochs
}

CV = GridSearchCV(text_sgdc, parameters,cv = 3, n_jobs= -1)
                  
CV.fit(X_train.text, y_train)  
print(CV.best_params_) 
print(CV.best_score_)

print("model score: %.3f" % CV.score(X_test.text, y_test))

predictions = CV.predict(X_test.text)
print(classification_report(y_test, predictions))

{'sgdc__alpha': 0.001, 'sgdc__max_iter': 19}
0.7908045977011494
model score: 0.776
              precision    recall  f1-score   support

           0       0.76      0.88      0.82       869
           1       0.80      0.64      0.71       654

    accuracy                           0.78      1523
   macro avg       0.78      0.76      0.76      1523
weighted avg       0.78      0.78      0.77      1523



## Meta features

In [32]:
# Reminder of the feature frame layout before modelling the meta features.
X.head()

Unnamed: 0,keyword,location,text,word_count,mean_word_length,unique_word_count,char_count,stop_word_count
0,,,deeds reason earthquake may allah forgive us,0.181076,0.061073,0.181076,0.961097,0.083574
1,,,forest fire near la ronge sask canada,0.177065,0.115634,0.177065,0.96121,0.0
2,,,residents asked shelter place notified officer...,0.160794,0.037209,0.146177,0.972075,0.080397
3,,,<number> people receive wildfires evacuation o...,0.120539,0.107355,0.120539,0.979382,0.015067
4,,,got sent photo ruby alaska smoke wildfires pou...,0.175684,0.049411,0.164703,0.96626,0.076862


In [33]:
# Meta-feature columns fed to the meta-feature model.
# NOTE(review): stop_word_count is computed earlier but not selected here;
# confirm the omission is deliberate.
meta_feature_columns = ["word_count", "unique_word_count", "mean_word_length", "char_count"]

meta_features_train = X_train[meta_feature_columns]
meta_features_test = X_test[meta_feature_columns]

In [34]:
# Candidate models compared on the numeric meta features.
# Disabled candidates: KNeighborsClassifier, SVC, NuSVC, DecisionTreeClassifier,
# RandomForestClassifier, MultinomialNB, SGDClassifier.
classifiers = [
    AdaBoostClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    MLPClassifier(random_state=42),
]

for classifier in classifiers:
    # Inputs are already numeric, so only SMOTE resampling precedes the model.
    pipe = Pipeline(steps=[
        ('smote', SMOTE(random_state=42)),
        ('classifier', classifier),
    ])
    pipe.fit(meta_features_train, y_train)

    print(classifier)
    print("model score: %.3f" % pipe.score(meta_features_test, y_test))

    predictions = pipe.predict(meta_features_test)
    print(classification_report(y_test, predictions))
    print(confusion_matrix(y_test, predictions))

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=42)
model score: 0.624
              precision    recall  f1-score   support

           0       0.72      0.55      0.63       869
           1       0.55      0.72      0.62       654

    accuracy                           0.62      1523
   macro avg       0.64      0.64      0.62      1523
weighted avg       0.65      0.62      0.62      1523

[[480 389]
 [183 471]]
GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
   

In [35]:
# Meta-feature model: AdaBoost on the numeric features, with SMOTE resampling.
meta_abc = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('abc', AdaBoostClassifier(random_state=42)),
])
meta_abc.fit(meta_features_train, y_train)

# Accuracy on the held-out split.
print("model score: %.3f" % meta_abc.score(meta_features_test, y_test))

model score: 0.624


In [36]:
# Ensemble sizes around the AdaBoost default of 50.
parameters = {"abc__n_estimators": [45, 50, 55]}

# 3-fold cross-validated search over the grid, using all available cores.
CV = GridSearchCV(estimator=meta_abc, param_grid=parameters, cv=3, n_jobs=-1)
CV.fit(meta_features_train, y_train)

print(CV.best_params_)
print(CV.best_score_)

# Held-out accuracy and per-class metrics of the selected model.
print("model score: %.3f" % CV.score(meta_features_test, y_test))

predictions = CV.predict(meta_features_test)
print(classification_report(y_test, predictions))

{'abc__n_estimators': 45}
0.6274220032840723
model score: 0.622
              precision    recall  f1-score   support

           0       0.72      0.56      0.63       869
           1       0.55      0.71      0.62       654

    accuracy                           0.62      1523
   macro avg       0.63      0.63      0.62      1523
weighted avg       0.64      0.62      0.62      1523



Grid search selects n_estimators = 45, but its test score (0.622) is no better than that of the default n_estimators = 50 (0.624), so we keep the default model.

## Combine all models into a meta model

In [37]:
def add_predictions(X_input):
    """Append each base model's first-class probability to ``X_input`` in place.

    Four columns are added, in this order: ``text_pred``, ``location_pred``,
    ``keyword_pred`` and ``meta_features_pred``. Each one holds column 0 of the
    corresponding fitted pipeline's ``predict_proba`` output, i.e. the
    probability of the first entry in that model's ``classes_``.

    Parameters
    ----------
    X_input : pandas.DataFrame
        Must contain ``text``, ``location``, ``keyword`` and the four numeric
        meta-feature columns. It is modified in place.

    Returns
    -------
    bool
        Always ``True``.

    Notes
    -----
    NOTE(review): when this is called on the rows the base models were fitted
    on, the added columns are in-sample predictions, so a model stacked on
    them sees optimistic inputs. Out-of-fold predictions would avoid that.
    """
    meta_columns = ["word_count", "unique_word_count", "mean_word_length", "char_count"]

    # (new column name, fitted pipeline, input that pipeline expects)
    sources = [
        ('text_pred', text_mnb, X_input.text),
        ('location_pred', location_abc, X_input.location),
        ('keyword_pred', keyword_dtc, X_input.keyword),
        ('meta_features_pred', meta_abc, X_input[meta_columns]),
    ]

    # Compute every prediction before adding any column to X_input.
    probabilities = {
        column: model.predict_proba(data)[:, 0]
        for column, model, data in sources
    }

    # X_input is typically a slice produced by train_test_split, so adding
    # columns triggers SettingWithCopyWarning. Silence it only for these
    # assignments instead of switching the pandas option off globally.
    with pd.option_context('mode.chained_assignment', None):
        for column, values in probabilities.items():
            X_input[column] = values

    return True

In [38]:
# Add the four base-model probability columns to both splits (in place).
# The cell displays the return value of the last call.
add_predictions(X_train)
add_predictions(X_test)

True

In [39]:
# Check that the four *_pred columns were appended to the test split.
X_test.head(5)

Unnamed: 0,keyword,location,text,word_count,mean_word_length,unique_word_count,char_count,stop_word_count,text_pred,location_pred,keyword_pred,meta_features_pred
1457,casualty,"Massachusetts, USA",japan nuke program albeit unsuccessful casualt...,0.160885,0.037229,0.146259,0.97262,0.073129,0.212221,0.500286,0.6,0.501551
2746,devastation,,utter shock devastation not go work left feeli...,0.205398,0.030235,0.197182,0.953046,0.098591,0.796248,0.500286,0.171429,0.508251
4727,lava,,liked video <url> minecraft postscript <number...,0.183202,0.031657,0.168546,0.967308,0.03664,0.879254,0.500286,0.826087,0.503712
847,blizzard,Ideally under a big tree,horrible moment yoyou open yoyoup dryer looks ...,0.203032,0.027192,0.188529,0.957149,0.079762,0.689005,0.500286,0.862069,0.507862
2896,drown,new york,drown cannot swim <url>,0.153079,0.092603,0.153079,0.9695,0.068035,0.763054,0.503644,0.892857,0.502759


### model selection

In [40]:
# Candidate second-level models trained on the base-model probabilities.
# Disabled candidates: KNeighborsClassifier, DecisionTreeClassifier,
# RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier.
classifiers = [
    SVC(kernel="rbf", C=0.025, probability=True, random_state=42),
    NuSVC(probability=True, random_state=42),
    MultinomialNB(),
    SGDClassifier(random_state=42),
    MLPClassifier(random_state=42),
]

# Inputs of the second-level model: one probability column per base model.
stacked_columns = ["text_pred", "location_pred", "keyword_pred", "meta_features_pred"]

for classifier in classifiers:
    pipe = Pipeline(steps=[
        ('smote', SMOTE(random_state=42)),
        ('classifier', classifier),
    ])
    pipe.fit(X_train[stacked_columns], y_train)

    print(classifier)
    print("model score: %.3f" % pipe.score(X_test[stacked_columns], y_test))

    predictions = pipe.predict(X_test[stacked_columns])
    print(classification_report(y_test, predictions))
    print(confusion_matrix(y_test, predictions))

SVC(C=0.025, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=True, random_state=42,
    shrinking=True, tol=0.001, verbose=False)
model score: 0.800
              precision    recall  f1-score   support

           0       0.80      0.86      0.83       869
           1       0.80      0.72      0.76       654

    accuracy                           0.80      1523
   macro avg       0.80      0.79      0.79      1523
weighted avg       0.80      0.80      0.80      1523

[[749 120]
 [184 470]]
NuSVC(cache_size=200, class_weight=None, coef0=0.0,
      decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
      kernel='rbf', max_iter=-1, nu=0.5, probability=True, random_state=42,
      shrinking=True, tol=0.001, verbose=False)
model score: 0.794
              precision    recall  f1-score   support

           0       0.79      0.86      0.83       869
           1  

NuSVC : 0.806
SVC : 0.800
MultinomialNB : 0.798

### Hyperparameter optimization

In [47]:
# Second-level model: multinomial naive Bayes on the four base-model
# probability columns, with SMOTE resampling.
stacked_columns = ["text_pred", "location_pred", "keyword_pred", "meta_features_pred"]

meta_mnb = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('mnb', MultinomialNB()),
])
meta_mnb.fit(X_train[stacked_columns], y_train)

# Held-out accuracy, then per-class metrics.
print("model score: %.3f" % meta_mnb.score(X_test[stacked_columns], y_test))

predictions = meta_mnb.predict(X_test[stacked_columns])
print(classification_report(y_test, predictions))

model score: 0.805
              precision    recall  f1-score   support

           0       0.79      0.90      0.84       869
           1       0.84      0.68      0.75       654

    accuracy                           0.80      1523
   macro avg       0.81      0.79      0.79      1523
weighted avg       0.81      0.80      0.80      1523



In [48]:
# Smoothing strengths to try for the second-level naive Bayes model.
parameters = {"mnb__alpha": [1, 0.9, 0.5]}

# 3-fold cross-validated search on the training split.
# NOTE(review): the *_pred inputs for X_train were produced by base models
# fitted on those same rows, so this cross-validation score is likely
# optimistic compared with the held-out score.
CV = GridSearchCV(estimator=meta_mnb, param_grid=parameters, cv=3, n_jobs=-1)
CV.fit(
    X_train[["text_pred", "location_pred", "keyword_pred", "meta_features_pred"]],
    y_train,
)

print(CV.best_params_)
print(CV.best_score_)

{'mnb__alpha': 1}
0.8875205254515599


In [55]:
# Final second-level model, rebuilt with the alpha chosen by the grid search
# spelled out explicitly.
stacked_columns = ["text_pred", "location_pred", "keyword_pred", "meta_features_pred"]

meta_mnb = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('mnb', MultinomialNB(alpha=1)),
])
meta_mnb.fit(X_train[stacked_columns], y_train)

# Held-out accuracy, then per-class metrics.
print("model score: %.3f" % meta_mnb.score(X_test[stacked_columns], y_test))

predictions = meta_mnb.predict(X_test[stacked_columns])
print(classification_report(y_test, predictions))

model score: 0.805
              precision    recall  f1-score   support

           0       0.79      0.90      0.84       869
           1       0.84      0.68      0.75       654

    accuracy                           0.80      1523
   macro avg       0.81      0.79      0.79      1523
weighted avg       0.81      0.80      0.80      1523

