In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# The train and test CSVs live in the same repository folder and are read with
# identical options, so the URL prefix and the reader options are declared once.
DATA_URL = "https://raw.githubusercontent.com/DLaux/BSA2020_Team_Tissot_Project_2/master/data/"

# dtype='unicode' is applied to every column, so `target` and `id` are read as text too.
# NOTE(review): `infer_datetime_format` is deprecated in pandas >= 2.0 — confirm the
# pandas version in use and drop the argument when upgrading.
csv_options = dict(
    encoding='utf_8',
    dtype='unicode',
    parse_dates=True,
    infer_datetime_format=True,
    low_memory=False,
)

df_train = pd.read_csv(DATA_URL + "train.csv", **csv_options)
df_test = pd.read_csv(DATA_URL + "test.csv", **csv_options)

## Prepare data

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn import utils
from sklearn.model_selection import train_test_split

from preprocess_tweets import preprocess_tweet, remove_stopwords

In [4]:
# Run every tweet through the project helper `preprocess_tweet`, then through
# `remove_stopwords` (both imported from preprocess_tweets).
cleaned_text = df_train["text"].apply(preprocess_tweet)
df_train["text"] = cleaned_text.apply(remove_stopwords)

In [5]:
# `target` is the label; every column except `target` and `id` is kept as a feature.
y = df_train["target"]
X = df_train.drop(columns=["target", "id"])

In [6]:
# Cast the three feature columns to plain strings
# (presumably so the text vectorizers below never receive a non-string value).
for column in ("keyword", "location", "text"):
    X[column] = X[column].astype("str")

In [7]:
# Sanity check: first four rows, a 70-character rule, then the column dtypes.
print(X.head(4), "-" * 70, X.dtypes, sep="\n")

  keyword location                                               text
0     nan      nan      deeds reason #earthquake may allah forgive us
1     nan      nan             forest fire near la ronge sask. canada
2     nan      nan  residents asked shelter place notified officer...
3     nan      nan  <number> people receive #wildfires evacuation ...
----------------------------------------------------------------------
keyword     object
location    object
text        object
dtype: object


In [8]:
# Encode the target labels as integer class codes.
lab_enc = LabelEncoder().fit(y)
encoded_y = lab_enc.transform(y)

In [9]:
# Original label values; position i is the label that was encoded as integer i.
[*lab_enc.classes_]

['0', '1']

In [10]:
# Hold out 20% of the rows for evaluation; stratifying on the encoded target
# keeps the class proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    encoded_y,
    test_size=0.2,
    stratify=encoded_y,
    random_state=72,
)

Tokenization

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
# Tweet text: drop tokens that occur in fewer than 2 tweets or in more than 75% of them.
count_vect_text = CountVectorizer(encoding='utf-8', min_df=2, max_df=0.75)

# location / keyword: no document-frequency pruning, the full vocabulary is kept.
count_vect_location = CountVectorizer(encoding='utf-8')
count_vect_keyword = CountVectorizer(encoding='utf-8')

# Learn each vocabulary on the training split and build the token-count matrices.
X_train_text_counts = count_vect_text.fit_transform(X_train.text)
X_train_location_counts = count_vect_location.fit_transform(X_train.location.astype("str"))
X_train_keyword_counts = count_vect_keyword.fit_transform(X_train.keyword.astype("str"))

TF IDF

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer

# One transformer per feature. A TfidfTransformer stores the IDF weights it learns
# in `fit`, so refitting a single shared instance on each count matrix in turn
# overwrites the previous weights and leaves only the last feature's weights
# available for transforming new (e.g. test) data.
tfidf_text = TfidfTransformer()
tfidf_location = TfidfTransformer()
tfidf_keyword = TfidfTransformer()

X_train_text_tfidf = tfidf_text.fit_transform(X_train_text_counts)
X_train_location_tfidf = tfidf_location.fit_transform(X_train_location_counts)
X_train_keyword_tfidf = tfidf_keyword.fit_transform(X_train_keyword_counts)

# Backward-compatible alias: the previously shared instance ended up fitted on
# the keyword counts, so the old name keeps pointing at that transformer.
tfidf_transformer = tfidf_keyword


## Keyword 

### Model selection

In [24]:
import warnings

# Silence FutureWarning for the rest of the session.
warnings.simplefilter("ignore", FutureWarning)

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier

# Candidate classifiers for the `keyword` feature. Commented-out entries are
# candidates that are not part of this run.
classifiers = [
    #KNeighborsClassifier(n_neighbors = 10,weights = 'distance',algorithm = 'brute'),
    #SVC(kernel="rbf", C=0.025, probability=True, random_state=42),
    #NuSVC(probability=True, random_state=42),
    DecisionTreeClassifier(random_state=42),
    #RandomForestClassifier(random_state=42),
    #AdaBoostClassifier(random_state=42),
    #GradientBoostingClassifier(random_state=42),
    #MultinomialNB(),
    SGDClassifier(random_state=42),
    MLPClassifier(random_state=42),
]

for classifier in classifiers:
    # counts -> tf-idf -> SMOTE oversampling -> classifier
    pipe = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', classifier),
    ])
    pipe.fit(X_train.keyword, y_train)

    # Predict the hold-out split once and reuse the result for every metric;
    # `pipe.score` would run a second, identical prediction pass just to
    # compute this same accuracy.
    predictions = pipe.predict(X_test.keyword)

    print(classifier)
    print("model score: %.3f" % accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))
    print(confusion_matrix(y_test, predictions))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')
model score: 0.733
              precision    recall  f1-score   support

           0       0.77      0.75      0.76       869
           1       0.68      0.71      0.70       654

    accuracy                           0.73      1523
   macro avg       0.73      0.73      0.73      1523
weighted avg       0.73      0.73      0.73      1523

[[652 217]
 [190 464]]
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_i

### Hyperparameter optimization

In [28]:
from sklearn.model_selection import GridSearchCV

In [91]:
# Baseline keyword model: token counts -> tf-idf -> SMOTE -> decision tree.
keyword_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=42)),
    ('dtc', DecisionTreeClassifier(random_state=42)),
]
keyword_dtc = Pipeline(keyword_steps)
keyword_dtc.fit(X_train.keyword, y_train)

# Accuracy on the hold-out split.
keyword_accuracy = keyword_dtc.score(X_test.keyword, y_test)
print("model score: %.3f" % keyword_accuracy)

model score: 0.733


In [46]:
# Per-class metrics and confusion matrix of the baseline keyword model on the hold-out split.
predictions = keyword_dtc.predict(X_test.keyword)
print(
    classification_report(y_test, predictions),
    confusion_matrix(y_test, predictions),
    sep="\n",
)

              precision    recall  f1-score   support

           0       0.77      0.75      0.76       869
           1       0.68      0.71      0.70       654

    accuracy                           0.73      1523
   macro avg       0.73      0.73      0.73      1523
weighted avg       0.73      0.73      0.73      1523

[[652 217]
 [190 464]]


In [93]:
# Hyperparameter grid for the decision-tree step ('dtc') of the keyword pipeline.
max_depth = [None, 85, 86, 87, 88, 89, 90]
# min_samples_leaf must be a positive int (or a fraction of the samples); None is
# not a valid value for DecisionTreeClassifier, so it is not part of the grid.
min_samples_leaf = [1, 2]
min_samples_split = [2, 3, 4]

parameters = {
    "dtc__max_depth": max_depth,
    "dtc__min_samples_split": min_samples_split,
    # Previously referenced `min_samples_leafs`, a name that is never defined,
    # which raises NameError on a fresh kernel.
    "dtc__min_samples_leaf": min_samples_leaf,
}

# Exhaustive search over the grid, two parallel workers.
CV = GridSearchCV(keyword_dtc, parameters, n_jobs=2)
CV.fit(X_train.keyword, y_train)

print(CV.best_params_)
print(CV.best_score_)

{'dtc__max_depth': 85, 'dtc__min_samples_leaf': 1, 'dtc__min_samples_split': 2}
0.7254515599343185


## Location

### Model selection

In [100]:
# Candidate classifiers for the `location` feature. Commented-out entries are
# candidates that are not part of this run.
classifiers = [
    #KNeighborsClassifier(),
    SVC(random_state=42),
    #NuSVC(probability=True, random_state=42),
    #DecisionTreeClassifier(random_state=42),
    #RandomForestClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    #MultinomialNB(),
    #SGDClassifier(random_state=42),
    #MLPClassifier(random_state=42)
]

for classifier in classifiers:
    # counts -> tf-idf -> SMOTE oversampling -> classifier
    pipe = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', classifier),
    ])
    pipe.fit(X_train.location, y_train)

    # Predict the hold-out split once and reuse the result for every metric;
    # `pipe.score` would run a second, identical prediction pass just to
    # compute this same accuracy.
    predictions = pipe.predict(X_test.location)

    print(classifier)
    print("model score: %.3f" % accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))
    print(confusion_matrix(y_test, predictions))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=42,
    shrinking=True, tol=0.001, verbose=False)
model score: 0.579
              precision    recall  f1-score   support

           0       0.58      0.96      0.72       869
           1       0.58      0.07      0.13       654

    accuracy                           0.58      1523
   macro avg       0.58      0.52      0.42      1523
weighted avg       0.58      0.58      0.47      1523

[[836  33]
 [608  46]]
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=42)
model score: 0.581
              precision    recall  f1-score   support

           0       0.58      0.94      0.72       869
           1       0.56      0.11      0.18       654

    accuracy                           0.58      1523
   macro avg       0.

Note that every model has a very low F1-score for the positive class (target = 1).

TOP 3:

AdaBoostClassifier | SVC | GradientBoostingClassifier |
--- |--- |--- |
0.581 | 0.579 |0.576 |

### Hyperparameter optimization

In [101]:
# Baseline location model: token counts -> tf-idf -> SMOTE -> AdaBoost.
location_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=42)),
    ('abc', AdaBoostClassifier(random_state=42)),
]
location_abc = Pipeline(location_steps)
location_abc.fit(X_train.location, y_train)

# Accuracy on the hold-out split.
location_accuracy = location_abc.score(X_test.location, y_test)
print("model score: %.3f" % location_accuracy)

model score: 0.581


In [110]:
# Number of boosting rounds to try for the AdaBoost step ('abc').
n_estimators = [279, 290, 291]
parameters = {"abc__n_estimators": n_estimators}

# 3-fold grid search using every available core.
CV = GridSearchCV(location_abc, parameters, cv=3, n_jobs=-1)
CV.fit(X_train.location, y_train)

print(CV.best_params_)
print(CV.best_score_)

{'abc__n_estimators': 290}
0.5830870279146141


In [154]:
# Evaluate the grid search's best location model on the hold-out split.
tuned_location_accuracy = CV.score(X_test.location, y_test)
print("model score: %.3f" % tuned_location_accuracy)

predictions = CV.predict(X_test.location)
print(classification_report(y_test, predictions))

model score: 0.578
              precision    recall  f1-score   support

           0       0.58      0.97      0.72       869
           1       0.59      0.06      0.11       654

    accuracy                           0.58      1523
   macro avg       0.59      0.51      0.42      1523
weighted avg       0.58      0.58      0.46      1523



## Content of the tweet 

### Model selection

In [112]:
# Candidate classifiers for the tweet `text` feature. Commented-out entries are
# candidates that are not part of this run.
classifiers = [
    #KNeighborsClassifier(n_neighbors = 10,weights = 'distance',algorithm = 'brute'),
    #SVC(kernel="rbf", C=0.025, probability=True, random_state=42),
    NuSVC(probability=True, random_state=42),
    #DecisionTreeClassifier(random_state=42),
    #RandomForestClassifier(random_state=42),
    #AdaBoostClassifier(random_state=42),
    #GradientBoostingClassifier(random_state=42),
    MultinomialNB(),
    SGDClassifier(random_state=42),
    #MLPClassifier(random_state=42)
]

for classifier in classifiers:
    # counts -> tf-idf -> SMOTE oversampling -> classifier
    pipe = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', classifier),
    ])
    pipe.fit(X_train.text, y_train)

    # Predict the hold-out split once and reuse the result for every metric;
    # `pipe.score` would run a second, identical prediction pass just to
    # compute this same accuracy.
    predictions = pipe.predict(X_test.text)

    print(classifier)
    print("model score: %.3f" % accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))
    print(confusion_matrix(y_test, predictions))

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='distance')
model score: 0.564
              precision    recall  f1-score   support

           0       0.86      0.28      0.43       869
           1       0.50      0.94      0.65       654

    accuracy                           0.56      1523
   macro avg       0.68      0.61      0.54      1523
weighted avg       0.70      0.56      0.52      1523

[[246 623]
 [ 41 613]]
SVC(C=0.025, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=True, random_state=42,
    shrinking=True, tol=0.001, verbose=False)
model score: 0.684
              precision    recall  f1-score   support

           0       0.66      0.94      0.77       869
           1       0.81      0.34      0.48       654

    accuracy     

### Hyperparameter optimization 

####  MultinomialNB

In [114]:
# Baseline text model: token counts -> tf-idf -> SMOTE -> multinomial naive Bayes.
text_mnb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=42)),
    ('mnb', MultinomialNB()),
]
text_mnb = Pipeline(text_mnb_steps)
text_mnb.fit(X_train.text, y_train)

# Accuracy on the hold-out split.
text_mnb_accuracy = text_mnb.score(X_test.text, y_test)
print("model score: %.3f" % text_mnb_accuracy)

model score: 0.795


In [122]:
# Smoothing values to try for the naive Bayes step ('mnb').
parameters = {'mnb__alpha': [1, 1e-1, 1e-2]}

# 3-fold grid search using every available core.
CV = GridSearchCV(text_mnb, parameters, cv=3, n_jobs=-1)
CV.fit(X_train.text, y_train)

print(CV.best_params_)
print(CV.best_score_)

# Accuracy of the best configuration on the hold-out split.
tuned_mnb_accuracy = CV.score(X_test.text, y_test)
print("model score: %.3f" % tuned_mnb_accuracy)

{'mnb__alpha': 1}
0.7796387520525452
model score: 0.795


#### SGDClassifier

In [125]:
# Baseline text model: token counts -> tf-idf -> SMOTE -> SGD linear classifier.
text_sgdc_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=42)),
    ('sgdc', SGDClassifier(random_state=42)),
]
text_sgdc = Pipeline(text_sgdc_steps)
text_sgdc.fit(X_train.text, y_train)

# Accuracy on the hold-out split.
text_sgdc_accuracy = text_sgdc.score(X_test.text, y_test)
print("model score: %.3f" % text_sgdc_accuracy)

model score: 0.790


In [150]:
# Grid for the SGD step ('sgdc').
parameters = {
    # Regularization strength (multiplies the penalty term). It is not the
    # learning rate itself, although the default 'optimal' schedule derives
    # its step size from it.
    'sgdc__alpha': [1e-5, 1e-4, 1e-3],
    # Maximum number of passes over the training data (epochs).
    'sgdc__max_iter': [19, 20, 21],
}

# 3-fold grid search using every available core.
CV = GridSearchCV(text_sgdc, parameters, cv=3, n_jobs=-1)
CV.fit(X_train.text, y_train)

print(CV.best_params_)
print(CV.best_score_)

# Hold-out accuracy and per-class metrics of the best configuration.
tuned_sgdc_accuracy = CV.score(X_test.text, y_test)
print("model score: %.3f" % tuned_sgdc_accuracy)

predictions = CV.predict(X_test.text)
print(classification_report(y_test, predictions))

{'sgdc__alpha': 0.001, 'sgdc__max_iter': 19}
0.7876847290640394
model score: 0.779
              precision    recall  f1-score   support

           0       0.77      0.88      0.82       869
           1       0.81      0.64      0.71       654

    accuracy                           0.78      1523
   macro avg       0.79      0.76      0.77      1523
weighted avg       0.78      0.78      0.77      1523



## Combine all models into a meta model

In [205]:
def add_predictions(X_input):
    """Append the three base models' class-0 probabilities to ``X_input`` in place.

    The fitted module-level pipelines ``text_mnb``, ``location_abc`` and
    ``keyword_dtc`` score the matching column of ``X_input``; the first column
    of each ``predict_proba`` result (the probability of the first class,
    i.e. encoded label 0) is stored as ``text_pred``, ``location_pred`` and
    ``keyword_pred``.

    Parameters
    ----------
    X_input : pandas.DataFrame
        Frame with ``text``, ``location`` and ``keyword`` columns. It is
        modified in place.

    Returns
    -------
    bool
        Always ``True``.
    """
    text_pred = text_mnb.predict_proba(X_input.text)
    location_pred = location_abc.predict_proba(X_input.location)
    keyword_pred = keyword_dtc.predict_proba(X_input.keyword)

    # Adding columns to a frame that came out of a split can trigger pandas'
    # chained-assignment warning. Silence it only for these assignments instead
    # of switching the option off globally, which would hide genuine warnings
    # in every later cell.
    with pd.option_context('mode.chained_assignment', None):
        X_input['text_pred'] = text_pred[:, 0]
        X_input['location_pred'] = location_pred[:, 0]
        X_input['keyword_pred'] = keyword_pred[:, 0]

    return True

In [206]:
# Add the text_pred / location_pred / keyword_pred columns to both splits in place.
# NOTE(review): the base pipelines were fit on X_train, so the train-side columns
# are in-sample predictions; any score the meta model gets on X_train (including
# GridSearchCV's cross-validation score) is likely optimistic compared with the
# hold-out score — consider out-of-fold predictions for the training features.
add_predictions(X_train)
add_predictions(X_test)

True

In [209]:
# Preview the first five hold-out rows, including the new prediction columns.
X_test.head()

Unnamed: 0,keyword,location,text,text_pred,location_pred,keyword_pred
1457,casualty,"Massachusetts, USA",japan nuke program (albeit unsuccessful) casua...,0.204679,0.500286,0.394366
2746,devastation,,utter shock devastation not go work left feeli...,0.794757,0.500286,0.394366
4727,lava,,liked video <number> gaming <url> minecraft ps...,0.873592,0.500286,0.394366
847,blizzard,Ideally under a big tree,horrible moment open dryer looks like snowy bl...,0.542082,0.500286,0.394366
2896,drown,new york,drown cannot swim <url>,0.769428,0.503644,0.394366


### Model selection

In [217]:
# Candidate meta classifiers. Commented-out entries are candidates that are
# not part of this run.
classifiers = [
    #KNeighborsClassifier(n_neighbors = 10,weights = 'distance',algorithm = 'brute'),
    SVC(kernel="rbf", C=0.025, probability=True, random_state=42),
    NuSVC(probability=True, random_state=42),
    #DecisionTreeClassifier(random_state=42),
    #RandomForestClassifier(random_state=42),
    #AdaBoostClassifier(random_state=42),
    #GradientBoostingClassifier(random_state=42),
    MultinomialNB(),
    SGDClassifier(random_state=42),
    MLPClassifier(random_state=42),
]

# The meta model only sees the three base-model probability columns.
meta_columns = ["text_pred", "location_pred", "keyword_pred"]

for classifier in classifiers:
    # SMOTE oversampling -> classifier
    pipe = Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('classifier', classifier),
    ])
    pipe.fit(X_train[meta_columns], y_train)

    # Predict the hold-out split once and reuse the result for every metric;
    # `pipe.score` would run a second, identical prediction pass just to
    # compute this same accuracy.
    predictions = pipe.predict(X_test[meta_columns])

    print(classifier)
    print("model score: %.3f" % accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))
    print(confusion_matrix(y_test, predictions))

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='distance')
model score: 0.774
              precision    recall  f1-score   support

           0       0.80      0.81      0.80       869
           1       0.74      0.72      0.73       654

    accuracy                           0.77      1523
   macro avg       0.77      0.77      0.77      1523
weighted avg       0.77      0.77      0.77      1523

[[705 164]
 [180 474]]
SVC(C=0.025, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=True, random_state=42,
    shrinking=True, tol=0.001, verbose=False)
model score: 0.800
              precision    recall  f1-score   support

           0       0.80      0.86      0.83       869
           1       0.79      0.72      0.76       654

    accuracy     

NuSVC : 0.806
SVC : 0.800
MultinomialNB : 0.798

### Hyperparameter optimization

In [223]:
# Baseline meta model: SMOTE -> NuSVC on the three base-model probability columns.
# (Despite the `rfc` in its name, the final step of this pipeline is a NuSVC.)
meta_columns = ["text_pred", "location_pred", "keyword_pred"]

meta_steps = [
    ('smote', SMOTE(random_state=42)),
    ('nusvc', NuSVC(random_state=42)),
]
meta_rfc = Pipeline(meta_steps)
meta_rfc.fit(X_train[meta_columns], y_train)

# Hold-out accuracy, then per-class metrics.
meta_accuracy = meta_rfc.score(X_test[meta_columns], y_test)
print("model score: %.3f" % meta_accuracy)

predictions = meta_rfc.predict(X_test[meta_columns])
print(classification_report(y_test, predictions))

model score: 0.806
              precision    recall  f1-score   support

           0       0.80      0.87      0.84       869
           1       0.81      0.72      0.76       654

    accuracy                           0.81      1523
   macro avg       0.81      0.80      0.80      1523
weighted avg       0.81      0.81      0.80      1523



In [239]:
# Grid for the NuSVC step ('nusvc') of the meta pipeline.
kernel = ["linear", "poly", "rbf", "sigmoid"]

# Each candidate sets a weight for class 0 only.
class_0_weights = [0.1, 0.2, 0.4, 0.6, 0.7, 0.8, 0.9, 1]
class_weight = [{0: weight} for weight in class_0_weights]

parameters = {
    "nusvc__kernel": kernel,
    'nusvc__max_iter': [-1, 10, 20],
    'nusvc__class_weight': class_weight,
}

meta_columns = ["text_pred", "location_pred", "keyword_pred"]

# 3-fold grid search using every available core.
CV = GridSearchCV(meta_rfc, parameters, cv=3, n_jobs=-1)
CV.fit(X_train[meta_columns], y_train)

print(CV.best_params_)
print(CV.best_score_)

{'nusvc__class_weight': {0: 0.1}, 'nusvc__kernel': 'rbf', 'nusvc__max_iter': -1}
0.9011494252873563


In [249]:
# Rebuild the meta model with the class weight and kernel selected by the grid search.
# (Despite the `rfc` in its name, the final step of this pipeline is a NuSVC.)
meta_columns = ["text_pred", "location_pred", "keyword_pred"]

tuned_nusvc = NuSVC(random_state=42, class_weight = {0: 0.1}, kernel = 'rbf')
meta_rfc = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('nusvc', tuned_nusvc),
])
meta_rfc.fit(X_train[meta_columns], y_train)

# Hold-out accuracy, then per-class metrics.
tuned_meta_accuracy = meta_rfc.score(X_test[meta_columns], y_test)
print("model score: %.3f" % tuned_meta_accuracy)

predictions = meta_rfc.predict(X_test[meta_columns])
print(classification_report(y_test, predictions))

model score: 0.806
              precision    recall  f1-score   support

           0       0.80      0.87      0.84       869
           1       0.81      0.72      0.76       654

    accuracy                           0.81      1523
   macro avg       0.81      0.80      0.80      1523
weighted avg       0.81      0.81      0.80      1523

