# IT3212

## Config

In [28]:
# Gates the exploratory-data-analysis plotting cells (`if run_eda:`).
run_eda = False
# When True, clean_text() lemmatizes every word with the WordNetLemmatizer.
lemmatize = False
# NOTE(review): not referenced anywhere in the visible part of the notebook — confirm it is still needed.
with_sentiment = False

# One switch per text-embedding cell; each cell is wrapped in `if text_embedding[<key>]:`.
text_embedding = {
    'tfidf': True,
    'word2vec': False,
    'bert': False,
}

# One switch per model whose hyperparameters should be grid-searched.
hyperparameter_tuning = {
  'adaboost': True,
}

## Importing libraries

In [29]:
# Standard libraries
import numpy as np
import pandas as pd
import re
import string

# NLTK tools and datasets
import nltk
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Uncomment if you need to download NLTK data packages
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('vader_lexicon')

# Text processing
from textblob import TextBlob
import contractions

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report, 
                             confusion_matrix, roc_curve, auc)
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score


# Miscellaneous
from collections import Counter
from urllib.parse import unquote
from scipy import stats
import chardet

from transformers import BertTokenizer, BertModel



### Fix dataset encoding issues

In [30]:
# Some rows in the raw data include non UTF-8 characters. 

# Example of text with non UTF-8 characters:
# 778245336,FALSE,finalized,5,8/30/15 13:27,Not Relevant,0.7952,,army,
# text column: Pakistan,".: .: .: .: .: .: .: .: .: .: .: .: .: .: .: .: .: .: .: .: .: RT DrAyesha4: #IndiaKoMunTorJawabDo Indian Army ki��_ http://t.co/WJLJq3yA4g"
# ,6.29079E+17,195397186

# Chardet identifies the encoding of the raw data as 'MacRoman'.
# For now, we will remove all non UTF-8 characters from the raw data
# We handle this by removing all � characters from the raw data and writing the modified content back to the file.

def fix_non_utf8_encoding(filepath, destination_filepath):
    """Write a UTF-8-clean copy of ``filepath`` to ``destination_filepath``.

    The encoding guessed by chardet is printed for information only. The
    source is then decoded as UTF-8 with undecodable bytes dropped, every
    U+FFFD replacement character is removed, and the result is written out
    as UTF-8.

    Args:
        filepath: Path of the raw CSV file to read.
        destination_filepath: Path the cleaned text is written to.
    """
    with open(filepath, 'rb') as file:
        rawdata = file.read()
    result = chardet.detect(rawdata)
    print(result['encoding'])

    # Read the file that was passed in (not a hard-coded path), silently
    # dropping any byte sequence that is not valid UTF-8.
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
        content = file.read()

    # Remove replacement characters (U+FFFD) that are literally stored in the data.
    content = content.replace('\ufffd', '')

    with open(destination_filepath, 'w', encoding='utf-8') as file:
        file.write(content)

filepath = 'data/disaster-tweets.csv'
dest = 'data/disaster-tweets-utf8.csv'

# fix_non_utf8_encoding(filepath, dest)

In [31]:
def split_train_test(filepath):
    """Load the CSV at ``filepath`` and split it 80/20 into train and test frames.

    The split uses a fixed random state (42); both returned frames get a
    fresh 0..n-1 index.

    Returns:
        A ``(train_data, test_data)`` tuple of DataFrames.
    """
    full_frame = pd.read_csv(filepath, encoding='utf-8')
    parts = train_test_split(full_frame, test_size=0.2, random_state=42)
    train_data, test_data = (part.reset_index(drop=True) for part in parts)
    return train_data, test_data

filepath = 'data/disaster-tweets-utf8.csv'

### Importing dataset

In [32]:
# When True, read the already-split train/test CSVs from the project's GitHub
# repository; otherwise split the local cleaned CSV.
import_remote = False

if import_remote:
    df_train = pd.read_csv('https://raw.githubusercontent.com/magnusrodseth/it3212/main/data/train.csv', encoding='utf-8')
    df_test = pd.read_csv('https://raw.githubusercontent.com/magnusrodseth/it3212/main/data/test.csv', encoding='utf-8')
else:
    # `filepath` is the module-level path assigned in the cell that defines split_train_test().
    df_train, df_test = split_train_test(filepath)

# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,choose_one,choose_one:confidence,choose_one_gold,keyword,location,text,tweetid,userid
0,778253309,False,finalized,5,8/27/15 16:07,Not Relevant,1.0,,screamed,,i dont even remember slsp happening i just remember being like wtf and then the lights turned off and everyone screamed for the encore,6.29107e+17,232773900.0
1,778251995,False,finalized,5,8/27/15 20:16,Not Relevant,1.0,,mudslide,Edinburgh,@hazelannmac ooh now I feel guilty about wishing hatman out. I bet the mudslide was delicious!,6.29018e+17,27502200.0
2,778247239,False,finalized,5,8/30/15 0:15,Not Relevant,1.0,,collide,planeta H2o,Soultech - Collide (Club Mix) http://t.co/8xIxBsPOT8,6.29092e+17,605238700.0
3,778255430,False,finalized,5,8/27/15 17:03,Relevant,0.7978,,wounded,,Police Officer Wounded Suspect Dead After Exchanging Shots - http://t.co/iPHaZV47g7,6.29119e+17,2305930000.0
4,778255609,False,finalized,5,8/27/15 22:11,Not Relevant,1.0,,wrecked,Sunny Southern California,Cramer: Iger's 3 words that wrecked Disney's stock http://t.co/4dGpBAiVL7,6.2908e+17,24642660.0


##  Exploratory data analysis (EDA)

In [33]:
if run_eda:
    # Plot the ten most frequent values of the `keyword` column.

    # NOTE(review): this cell runs before clean_data(), so missing keywords are
    # presumably still NaN rather than '' and the `!= ''` filter does not drop them — confirm.
    defined_keywords = df_train[df_train['keyword'] != '']['keyword']

    plt.figure()
    sns.countplot(y=defined_keywords, order=defined_keywords.value_counts().iloc[:10].index)
    plt.title('Most Common Keywords')
    plt.xlabel('Count')
    plt.ylabel('Keyword')
    plt.tight_layout()
    plt.show()

In [34]:
if run_eda:
    # Compare keywords for disaster tweets and non-disaster tweets.
    # Rows are split on the raw `choose_one` label; rows labelled anything other
    # than 'Relevant' / 'Not Relevant' appear in neither plot.
    disaster_keywords = df_train[df_train['choose_one'] == 'Relevant']['keyword']
    non_disaster_keywords = df_train[df_train['choose_one'] == 'Not Relevant']['keyword']

    # Create a figure object and define the grid
    fig, ax = plt.subplots(1, 2, figsize=(14, 6))  # 1 row, 2 columns

    # Plotting: top ten keywords per class, disaster on the left, non-disaster on the right
    sns.countplot(y=disaster_keywords, ax=ax[0], order=disaster_keywords.value_counts().iloc[:10].index, color='red')
    sns.countplot(y=non_disaster_keywords, ax=ax[1], order=non_disaster_keywords.value_counts().iloc[:10].index, color='blue')

    # Titles and labels
    ax[0].set_title('Most Common Keywords for Disaster Tweets')
    ax[0].set_xlabel('Count')
    ax[0].set_ylabel('Keyword')

    ax[1].set_title('Most Common Keywords for Non-Disaster Tweets')
    ax[1].set_xlabel('Count')
    ax[1].set_ylabel('Keyword')

    # Adjust layout
    plt.tight_layout()
    plt.show()



In the plot above, we can see that the ten most common keywords of disaster-related tweets and the ten most common keywords of non-disaster-related tweets have no keyword in common.

## 1. Preprocessing

In [35]:
# Shared NLP helpers read by clean_text(): `lemmatizer` is used only when the
# `lemmatize` flag is on, `tokenizer` performs the final tokenization pass.
lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer()

def filter_rows_by_confidence_and_decision(df, confidence_threshold):
    """Keep only confidently labelled rows that carry an actual decision.

    A row survives when its ``choose_one:confidence`` is at least
    ``confidence_threshold`` and its ``choose_one`` label is not
    "Can't Decide". The input frame is not modified.
    """
    confident_enough = df['choose_one:confidence'] >= confidence_threshold
    has_decision = df['choose_one'] != "Can't Decide"
    return df[confident_enough & has_decision]

def map_choose_one_to_y(df):
    """Return a copy of ``df`` with a binary ``target`` column added.

    ``target`` is 1 where ``choose_one`` equals 'Relevant' and 0 for every
    other value. A new frame is returned instead of writing into ``df``,
    because the callers pass filtered slices and assigning a column to a
    slice triggers pandas' SettingWithCopy hazard.
    """
    return df.assign(target=(df['choose_one'] == 'Relevant').astype(int))

# Lazily-built cache of the English stop-word set (see _english_stopwords()).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def clean_text(text):
    """Normalize one tweet into a lower-cased, space-separated token string.

    Steps, in order: strip http(s) URLs, collapse all whitespace (including
    newlines) to single spaces, remove ASCII punctuation, lower-case, drop
    English stop words, optionally lemmatize (module-level ``lemmatize``
    flag), expand contractions, and re-tokenize with the module-level
    ``tokenizer``.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text as a single string.
    """
    text = re.sub(r'https?://\S+', '', text)
    # `\s+` already matches newlines, so one pass covers both replacements.
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    # Look the stop words up once per call in a set instead of re-reading the
    # corpus list for every single word.
    stop_words = _english_stopwords()
    text = ' '.join([word for word in text.split() if word not in stop_words])
    if lemmatize:
        text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    # NOTE(review): contractions are expanded after punctuation and stop words
    # were removed, so the expansion can re-introduce stop words — confirm the
    # ordering is intended.
    text = contractions.fix(text)
    text = ' '.join(tokenizer.tokenize(text))
    return text

def clean_keyword(keyword):
    """URL-decode a keyword; missing values (None/NaN) become an empty string."""
    if pd.isnull(keyword):
        return ''
    return unquote(keyword)

def clean_data(df):
    """Return a cleaned copy of ``df``.

    The copy has ``keyword`` URL-decoded and lower-cased, the original tweet
    preserved in ``text_raw``, and ``text`` replaced by its clean_text()
    version. Working on a copy keeps the caller's frame untouched and avoids
    assigning columns to a slice of another frame (the callers pass
    ``df[features_to_keep]``).
    """
    df = df.copy()
    df['keyword'] = df['keyword'].apply(clean_keyword).apply(str.lower)
    df['text_raw'] = df['text']
    df['text'] = df['text'].apply(clean_text)
    return df

# Row count before filtering, used only for the progress message below.
initial_count = df_train.shape[0]
# Minimum `choose_one:confidence` a training row must have to be kept.
confidence_threshold = 0.7

df_train = filter_rows_by_confidence_and_decision(df_train, confidence_threshold)
print("Removed {} of total: {} rows. Remaining rows: {}".format(initial_count - df_train.shape[0], initial_count, df_train.shape[0]))

# Columns carried forward; 'target' is created by map_choose_one_to_y() below.
features_to_keep = ['target', 'text', 'keyword']

df_train = map_choose_one_to_y(df_train)
df_train = df_train[features_to_keep]
df_train = clean_data(df_train)

# Drop training rows whose cleaned text is identical (first occurrence is kept).
count_initial = df_train.shape[0]
df_train = df_train.drop_duplicates(subset=['text'])
print("Removed {} duplicated rows.".format(count_initial - df_train.shape[0]))


# Preprocess the test data as well
# NOTE(review): unlike the training split, the test split is not passed through
# filter_rows_by_confidence_and_decision() and is not de-duplicated, so
# "Can't Decide" rows end up with target 0 — confirm this is intended.
df_test = map_choose_one_to_y(df_test)
df_test = df_test[features_to_keep]
df_test = clean_data(df_test)

df_test.head()


Removed 2167 of total: 8700 rows. Remaining rows: 6533


Removed 635 duplicated rows.


Unnamed: 0,target,text,keyword,text_raw
0,1,sunset looked like erupting volcano initial thought pixar short lava,volcano,The sunset looked like an erupting volcano .... My initial thought was the Pixar short Lava http://t.co/g4sChqFEsT
1,1,7294 nikon d50 61 mp digital slr camera body 2 batteries carry bag charger 20000,body bag,#7294 Nikon D50 6.1 MP Digital SLR Camera Body 2 batteries carry bag and charger http://t.co/SL7PHqSGKV\n\n$200.00\n_ http://t.co/T4Qh2OM8Op
2,0,mentaltwitter note make sure smoke alarm battery snuff times face many twitter reminders changing battery,smoke,Mental/Twitter Note: Make sure my smoke alarm battery is up to snuff at all times or face many twitter reminders of changing my battery.
3,0,emergency need part 2 3 nashnewvideo nashgrier 103,emergency,?????? EMERGENCY ?????? NEED PART 2 and 3!!! #NashNewVideo http://t.co/TwdnNaIOns @Nashgrier 103
4,0,whelen model 295ss100 siren amplifier police emergency vehicle full read ebay,siren,WHELEN MODEL 295SS-100 SIREN AMPLIFIER POLICE EMERGENCY VEHICLE - Full read by eBay http://t.co/Q3yYQi4A27 http://t.co/whEreofYAx


## 2. Extracting features

### Miscellaneous features from `text` column

In [36]:
def extract_features(df):
    """Add simple hand-crafted features derived from ``text_raw`` to ``df``.

    Columns added (in place; the same frame is also returned):
        text_length:   number of characters in the raw tweet
        hashtag_count: number of '#' characters
        mention_count: number of '@' characters
        has_url:       1 if the raw tweet contains 'http', else 0

    Every value is coerced with ``str()`` first, so all four features treat
    a missing or non-string entry the same way instead of ``text_length``
    alone raising on it.
    """
    raw = df['text_raw'].map(str)

    df['text_length'] = raw.map(len)
    df["hashtag_count"] = raw.map(lambda tweet: tweet.count("#"))
    df["mention_count"] = raw.map(lambda tweet: tweet.count("@"))
    df["has_url"] = raw.map(lambda tweet: 1 if "http" in tweet else 0)
    return df

# Add the hand-crafted text features to both splits, then preview the training frame.
df_train = extract_features(df_train)
df_test = extract_features(df_test)
df_train.head()

Unnamed: 0,target,text,keyword,text_raw,text_length,hashtag_count,mention_count,has_url
0,0,do not even remember slsp happening remember like wtf lights turned everyone screamed encore,screamed,i dont even remember slsp happening i just remember being like wtf and then the lights turned off and everyone screamed for the encore,134,0,0,0
1,0,hazelannmac ooh feel guilty wishing hatman bet mudslide delicious,mudslide,@hazelannmac ooh now I feel guilty about wishing hatman out. I bet the mudslide was delicious!,94,0,1,0
2,0,soultech collide club mix,collide,Soultech - Collide (Club Mix) http://t.co/8xIxBsPOT8,52,0,0,1
3,1,police officer wounded suspect dead exchanging shots,wounded,Police Officer Wounded Suspect Dead After Exchanging Shots - http://t.co/iPHaZV47g7,83,0,0,1
4,0,cramer igers 3 words wrecked disneys stock,wrecked,Cramer: Iger's 3 words that wrecked Disney's stock http://t.co/4dGpBAiVL7,73,0,0,1


### N-grams

In [37]:
def create_ngrams(text, n):
    """Return the word n-grams of ``text`` as underscore-joined strings.

    ``text`` is tokenized with ``word_tokenize``; each run of ``n``
    consecutive tokens becomes one ``tok1_tok2_..._tokn`` string. Texts with
    fewer than ``n`` tokens yield an empty list.
    """
    joined = []
    for gram in ngrams(word_tokenize(text), n):
        joined.append('_'.join(gram))
    return joined

def add_ngrams_to_text(text):
    """Append the bigrams and trigrams of ``text`` to the text itself.

    The result is ``text``, its bigrams and its trigrams, each group joined
    by single spaces and the three groups separated by a single space (so a
    text without n-grams ends in trailing spaces).
    """
    pieces = [text]
    for size in (2, 3):
        pieces.append(' '.join(create_ngrams(text, size)))
    return ' '.join(pieces)

def add_ngrams_to_df(df):
    """Add a ``text_with_ngrams`` column built from ``text`` and return ``df``.

    The column is written into the frame that was passed in.
    """
    enriched = df['text'].map(add_ngrams_to_text)
    df['text_with_ngrams'] = enriched
    return df

# Add the `text_with_ngrams` column to both splits.
df_train = add_ngrams_to_df(df_train)
df_test = add_ngrams_to_df(df_test)

# Show full cell contents (no column-width truncation) so the n-grams are readable.
# This display option stays in effect for the rest of the session.
pd.set_option('display.max_colwidth', None)
df_train[['text_with_ngrams', 'text_raw']].head(2)

Unnamed: 0,text_with_ngrams,text_raw
0,do not even remember slsp happening remember like wtf lights turned everyone screamed encore do_not not_even even_remember remember_slsp slsp_happening happening_remember remember_like like_wtf wtf_lights lights_turned turned_everyone everyone_screamed screamed_encore do_not_even not_even_remember even_remember_slsp remember_slsp_happening slsp_happening_remember happening_remember_like remember_like_wtf like_wtf_lights wtf_lights_turned lights_turned_everyone turned_everyone_screamed everyone_screamed_encore,i dont even remember slsp happening i just remember being like wtf and then the lights turned off and everyone screamed for the encore
1,hazelannmac ooh feel guilty wishing hatman bet mudslide delicious hazelannmac_ooh ooh_feel feel_guilty guilty_wishing wishing_hatman hatman_bet bet_mudslide mudslide_delicious hazelannmac_ooh_feel ooh_feel_guilty feel_guilty_wishing guilty_wishing_hatman wishing_hatman_bet hatman_bet_mudslide bet_mudslide_delicious,@hazelannmac ooh now I feel guilty about wishing hatman out. I bet the mudslide was delicious!


In [38]:
if text_embedding['bert']:
    df_train_text_embedded_bert = df_train.copy()
    df_test_text_embedded_bert = df_test.copy()

    # Load pre-trained BERT tokenizer and model.
    # They get BERT-specific names on purpose: clean_text() reads the global
    # `tokenizer` (the TweetTokenizer) and is still called later in the
    # notebook, so rebinding `tokenizer` here would silently change its output.
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained('bert-base-uncased')

    def get_bert_embeddings(text: str):
        """Embed one text with BERT.

        Returns the last hidden state at the first token position (for BERT
        the [CLS] token) as a NumPy array; inputs longer than 128 tokens are
        truncated.
        """
        inputs = bert_tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors="pt")
        outputs = bert_model(**inputs)
        return outputs.last_hidden_state[:,0,:].detach().numpy()

    # Generate BERT embeddings (one model forward pass per row)
    df_train_text_embedded_bert['bert_embeddings'] = df_train_text_embedded_bert['text'].apply(get_bert_embeddings)
    df_test_text_embedded_bert['bert_embeddings'] = df_test_text_embedded_bert['text'].apply(get_bert_embeddings)


### Embedding `text` using `TF-IDF`

In [39]:
if text_embedding['tfidf']:
    # Keep only the 1000 highest-ranked terms of the training vocabulary.
    vectorizer = TfidfVectorizer(max_features=1000)

    # Column whose text is vectorized (cleaned text plus its bi-/trigrams).
    feature_to_embed = 'text_with_ngrams'

    # Fit and transform the training data
    # `index=df_train.index` keeps the rows aligned with df_train for the later axis=1 concat.
    text_embedded = vectorizer.fit_transform(df_train[feature_to_embed])
    df_train_text_embedded_tfidf = pd.DataFrame(text_embedded.toarray(), columns=vectorizer.get_feature_names_out(), index=df_train.index)

    # Transform the test data using the same vectorizer
    text_embedded_test = vectorizer.transform(df_test[feature_to_embed])
    df_test_text_embedded_tfidf = pd.DataFrame(text_embedded_test.toarray(), columns=vectorizer.get_feature_names_out(), index=df_test.index)

### Embedding `text` column using `Word2Vec`

#### Config

In [40]:
# Settings for the Word2Vec embedding cells.
#   vector_size: width of each embedding vector, passed to embed_text_feature().
#   with_ngrams: only referenced by the (currently commented-out) model-creation cell.
w2v_config = {
    "vector_size": 200,
    "with_ngrams": True
}

#### Create model

In [41]:
# import gensim

# if text_embedding['word2vec']:
#     if w2v_config['with_ngrams']:
#         tokenized_text = df_train['text_with_ngrams'].apply(lambda x: x.split())
#     else:
#         tokenized_text = df_train['text'].apply(lambda x: x.split())

# import gensim.downloader as api
# model_w2v = api.load("glove-twitter-200")

# model_w2v = gensim.models.Word2Vec(
#             tokenized_text,
#             vector_size=w2v_config['vector_size'], # desired no. of features/independent variables
#             window=5, # context window size
#             min_count=2, # Ignores all words with total frequency lower than 2.                                  
#             sg = 1, # 1 for skip-gram model, 0 for CBOW
#             negative = 10, # for negative sampling
#             workers= 8, # no.of cores
#             seed = 34
# ) 

# https://towardsdatascience.com/nlp-101-word2vec-skip-gram-and-cbow-93512ee24314
# Skip-gram: works well with a small amount of the training data, represents well even rare words or phrases.
# CBOW: several times faster to train than the skip-gram, slightly better accuracy for the frequent words.

# model_w2v.train(tokenized_text, total_examples= len(df_train), epochs=100)


#### Create embeddings from `text`

In [42]:
if text_embedding['word2vec']:
    def embed_text_feature(df, col, model, vector_size):
        """Embed ``df[col]`` as the mean word vector of each text.

        Args:
            df: Frame holding the text column.
            col: Name of the column to embed; texts are split on whitespace.
            model: Mapping-like word-vector model; ``model[token]`` must
                raise ``KeyError`` for out-of-vocabulary tokens.
            vector_size: Dimensionality of the vectors in ``model``.

        Returns:
            A DataFrame with one row per row of ``df`` (same index, so it
            aligns with ``df`` in an ``axis=1`` concat) and columns
            ``{col}_w2v_0 .. {col}_w2v_{vector_size-1}``.
        """
        def tokens_to_vectors(text_tokens) -> np.ndarray:
            # One row per token; out-of-vocabulary tokens keep their all-zero row.
            vectors = np.zeros((len(text_tokens), vector_size))

            for i, token in enumerate(text_tokens):
                try:
                    vectors[i] = model[token]
                except KeyError:  # Token not in the model's vocabulary
                    continue

            # No tokens at all, or none known to the model: return a zero
            # vector (the mean of an empty array would be NaN).
            if np.all(vectors == 0):
                return np.zeros(vector_size)

            return vectors.mean(axis=0)

        embeddings = [tokens_to_vectors(text.split()) for text in df[col]]

        # Reuse df's index: df_train has gaps after filtering and
        # de-duplication, and a fresh RangeIndex would misalign the rows.
        return pd.DataFrame(
            np.vstack(embeddings),
            columns=[f'{col}_w2v_{i}' for i in range(vector_size)],
            index=df.index,
        )

    # NOTE(review): `model_w2v` must already exist; the cell that creates it is
    # commented out in the visible part of the notebook.
    df_train_text_embedded_w2v = embed_text_feature(df_train, 'text', model_w2v, w2v_config['vector_size'])
    df_test_text_embedded_w2v = embed_text_feature(df_test, 'text', model_w2v, w2v_config['vector_size'])

    df_train_text_embedded_w2v.shape

## 3. Selecting features

In [43]:
# List the columns available on the training frame before feature selection.
df_train.columns

Index(['target', 'text', 'keyword', 'text_raw', 'text_length', 'hashtag_count',
       'mention_count', 'has_url', 'text_with_ngrams'],
      dtype='object')

In [44]:
features_to_keep = ['target', 'text_length', 'hashtag_count', 'mention_count', 'has_url']
# Same list without the label: these are the hand-crafted model inputs.
input_features = [name for name in features_to_keep if name != 'target']

# The label is deliberately left out of the concatenation. The embedding frames
# use vocabulary terms as column names, so a token literally called 'target'
# would collide with the label column and make `X['target']` ambiguous.
X_train = pd.concat([
    df_train[input_features],
    # df_train_text_embedded_w2v,
    df_train_text_embedded_tfidf,
    # df_train_text_embedded_bert
    ], axis=1)
X_test = pd.concat([
    df_test[input_features],
    # df_test_text_embedded_w2v,
    df_test_text_embedded_tfidf,
    # df_test_text_embedded_bert
    ], axis=1)

# Drop training rows with any missing feature (e.g. from misaligned indices).
X_train = X_train.dropna()

# Take the labels straight from the source frames, aligned to the feature rows.
y_train = df_train.loc[X_train.index, 'target']
y_test = df_test['target'].reindex(X_test.index)

X_train.shape

(5898, 1004)

## 4. Modelling

In [45]:
# Switches intended to select which model cells below are executed.
# NOTE(review): some of these names are reused further down for other objects
# (a fitted estimator, an imported module) — re-run this cell before re-running
# the model cells.
logreg = True
svm = False
xgb = True
random_forest = False

In [46]:
def print_results(y_pred, y_train, y_pred_test, y_test):
    """Print accuracy, classification report and confusion matrix for both splits.

    Args:
        y_pred: Predictions for the training rows.
        y_train: True training labels.
        y_pred_test: Predictions for the test rows.
        y_test: True test labels.
    """
    sections = (
        ("Train results", "Train accuracy: {}", y_train, y_pred),
        ("Test results", "Test accuracy: {}", y_test, y_pred_test),
    )
    for position, (heading, accuracy_line, y_true, y_hat) in enumerate(sections):
        if position > 0:
            # Blank line between the train and test sections.
            print()
        print(heading)
        print("-----------------------------")
        print(accuracy_line.format(accuracy_score(y_true, y_hat)))
        print(classification_report(y_true, y_hat))
        print(confusion_matrix(y_true, y_hat))


### 4.1 Logistic regression

In [47]:
if logreg:
    # NOTE: `logreg` is rebound here from the boolean switch to the fitted estimator.
    logreg = LogisticRegression(random_state=42, solver="liblinear")
    logreg.fit(X_train, y_train)

    # The "Train results" printed below are out-of-fold predictions, not
    # predictions of the model fitted on the full training set above.
    y_pred = cross_val_predict(logreg, X_train, y_train, cv=5)  # 5-fold cross-validation
    y_pred_test = logreg.predict(X_test)

    print_results(y_pred, y_train, y_pred_test, y_test)

Train results
-----------------------------
Train accuracy: 0.8450322143099356
              precision    recall  f1-score   support

           0       0.85      0.91      0.88      3596
           1       0.84      0.74      0.79      2302

    accuracy                           0.85      5898
   macro avg       0.84      0.83      0.83      5898
weighted avg       0.84      0.85      0.84      5898

[[3280  316]
 [ 598 1704]]

Test results
-----------------------------
Test accuracy: 0.7881433823529411
              precision    recall  f1-score   support

           0       0.78      0.88      0.82      1219
           1       0.81      0.68      0.74       957

    accuracy                           0.79      2176
   macro avg       0.79      0.78      0.78      2176
weighted avg       0.79      0.79      0.79      2176

[[1068  151]
 [ 310  647]]


In [48]:
# test manual prediction

# Build a one-row frame with a hand-written tweet and push it through the same
# preprocessing and feature steps as the real data.
test_df = pd.DataFrame({
    'text': ['car crash accident explosion fire'],
    'keyword': 'test',
})

test_df = clean_data(test_df)
test_df = extract_features(test_df)
test_df = add_ngrams_to_df(test_df)

# Vectorize with the TF-IDF vectorizer that was fitted on the training data.
feature_to_embed = 'text_with_ngrams'
vectors = vectorizer.transform(test_df[feature_to_embed])
text_embedded = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names_out(), index=test_df.index)

display(test_df)

# drop text_raw and text_with_ngrams
# (plus keyword and text) so that only the numeric hand-crafted features remain
test_df.drop(['text_raw', 'text_with_ngrams', 'keyword', 'text'], axis=1, inplace=True)

test_df = pd.concat([
    test_df,
    text_embedded
    ], axis=1)

# Requires `logreg` to be the fitted estimator from the logistic-regression cell.
# NOTE: this overwrites the `y_pred_test` produced by the previous model cell.
y_pred_test = logreg.predict_proba(test_df)

print(y_pred_test)

Unnamed: 0,text,keyword,text_raw,text_length,hashtag_count,mention_count,has_url,text_with_ngrams
0,car crash accident explosion fire,test,car crash accident explosion fire,33,0,0,0,car crash accident explosion fire car_crash crash_accident accident_explosion explosion_fire car_crash_accident crash_accident_explosion accident_explosion_fire


[[0.1672569 0.8327431]]


### 4.1.2. Support Vector Machines

In [49]:
if svm:
    # NOTE(review): `first_n` is not used anywhere in this cell — confirm whether
    # a row limit was meant to be applied.
    first_n = 1000

    # Initialize SVM model
    svm_model = SVC(kernel='linear', C=1, random_state=42, probability=False)

    # Fit the model on training data
    svm_model.fit(X_train, y_train)

    # Use 5-fold cross-validation to get predictions on training set
    y_pred_train = cross_val_predict(svm_model, X_train, y_train, cv=5)
    y_pred_test = svm_model.predict(X_test)

    print_results(y_pred_train, y_train, y_pred_test, y_test)

In [50]:
# `xgb` is the boolean switch from the model-selection cell. The library is
# imported under its full name so the switch is honoured instead of being
# overwritten by the module object.
if xgb:
    import xgboost

    # Creating an XGBoost classifier
    model = xgboost.XGBClassifier()

    # Training the model on the training data
    model.fit(X_train, y_train)

    # Making predictions on the test set
    predictions_test = model.predict(X_test)

    # Calculating accuracy on test set
    accuracy_test = accuracy_score(y_test, predictions_test)

    # Making predictions on the training set
    # (unlike the other model cells these are in-sample, not cross-validated)
    predictions_train = model.predict(X_train)

    # Calculating accuracy on training set
    accuracy_train = accuracy_score(y_train, predictions_train)

    print("\nTraining Set Metrics:")
    print("Accuracy:", accuracy_train)
    print("\nClassification Report:")
    print(classification_report(y_train, predictions_train))

    print("\nTest Set Metrics:")
    print("Accuracy:", accuracy_test)
    print("\nClassification Report:")
    print(classification_report(y_test, predictions_test))




Training Set Metrics:
Accuracy: 0.8999660902000678

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.98      0.92      3596
           1       0.96      0.77      0.86      2302

    accuracy                           0.90      5898
   macro avg       0.92      0.88      0.89      5898
weighted avg       0.91      0.90      0.90      5898


Test Set Metrics:
Accuracy: 0.7844669117647058

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.91      0.83      1219
           1       0.84      0.63      0.72       957

    accuracy                           0.78      2176
   macro avg       0.80      0.77      0.77      2176
weighted avg       0.79      0.78      0.78      2176



In [51]:
if random_forest:
    from sklearn.ensemble import RandomForestClassifier

    # Shallow forest (trees limited to depth 2) with a fixed seed.
    clf = RandomForestClassifier(max_depth=2, random_state=0)
    clf.fit(X_train, y_train)

    # Only the test split is evaluated here; no training/CV metrics are printed.
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    print("Accuracy:", accuracy)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

In [52]:
from sklearn.ensemble import AdaBoostClassifier

# Initialize AdaBoost Classifier
# (default hyperparameters, fixed seed; also reused as the base estimator of the grid search)
adaboost = AdaBoostClassifier(random_state=0)

# Perform 5-fold cross-validation on the training data
# The out-of-fold predictions are what print_results() reports as "Train results".
y_pred_cv = cross_val_predict(adaboost, X_train, y_train, cv=5)

# Fit the model on the entire training data
adaboost.fit(X_train, y_train)

# Predict on the test data
y_pred_test = adaboost.predict(X_test)

# Evaluate and print the results
print_results(y_pred_cv, y_train, y_pred_test, y_test)


Train results
-----------------------------
Train accuracy: 0.7848423194303153
              precision    recall  f1-score   support

           0       0.77      0.93      0.84      3596
           1       0.84      0.56      0.67      2302

    accuracy                           0.78      5898
   macro avg       0.80      0.74      0.75      5898
weighted avg       0.79      0.78      0.77      5898

[[3348  248]
 [1021 1281]]

Test results
-----------------------------
Test accuracy: 0.7293198529411765
              precision    recall  f1-score   support

           0       0.70      0.90      0.79      1219
           1       0.80      0.52      0.63       957

    accuracy                           0.73      2176
   macro avg       0.75      0.71      0.71      2176
weighted avg       0.74      0.73      0.72      2176

[[1093  126]
 [ 463  494]]


In [53]:
# Check for data imbalance. Check how many tweets are relevant and how many are not relevant.
# Split the training frame by label to count the rows in each class.
relevant_tweets = df_train[df_train['target'] == 1]
not_relevant_tweets = df_train[df_train['target'] == 0]

# Print the number of rows per class.
print("Relevant tweets: {}".format(relevant_tweets.shape[0]))
print("Not relevant tweets: {}".format(not_relevant_tweets.shape[0]))


Relevant tweets: 2302
Not relevant tweets: 3596


In [54]:
from sklearn.model_selection import GridSearchCV

if hyperparameter_tuning['adaboost']:
    # Define the parameter grid
    # 3 x 5 x 2 = 30 candidates, each evaluated with 5-fold CV (150 fits).
    # NOTE(review): newer scikit-learn releases deprecate the 'SAMME.R'
    # algorithm — confirm against the installed version.
    param_grid = {
        'n_estimators': [200, 250, 300],
        'learning_rate': [0.01, 0.1, 1, 2, 3],
        'algorithm': ['SAMME', 'SAMME.R']
    }

    # Create GridSearchCV object
    # Requires `adaboost` from the AdaBoost cell; n_jobs=-1 uses all available cores.
    grid_search = GridSearchCV(estimator=adaboost, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

    # Fit the grid search to the data
    grid_search.fit(X_train, y_train)

    # Print best parameters
    print("Best Parameters:", grid_search.best_params_)

    # Get the best model
    best_model = grid_search.best_estimator_

    # Generate cross-validated predictions for the training set
    y_pred_train = cross_val_predict(best_model, X_train, y_train, cv=5)

    # Predict on the test set
    y_pred_test = best_model.predict(X_test)

    print_results(y_pred_train, y_train, y_pred_test, y_test)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END algorithm=SAMME, learning_rate=0.01, n_estimators=200; total time=  24.6s
[CV] END algorithm=SAMME, learning_rate=0.01, n_estimators=200; total time=  24.7s
[CV] END algorithm=SAMME, learning_rate=0.01, n_estimators=200; total time=  24.8s
[CV] END algorithm=SAMME, learning_rate=0.01, n_estimators=200; total time=  25.0s
[CV] END algorithm=SAMME, learning_rate=0.01, n_estimators=200; total time=  25.1s
[CV] END algorithm=SAMME, learning_rate=0.01, n_estimators=250; total time=  30.8s
[CV] END algorithm=SAMME, learning_rate=0.01, n_estimators=250; total time=  31.1s
[CV] END algorithm=SAMME, learning_rate=0.01, n_estimators=250; total time=  31.1s
[CV] END algorithm=SAMME, learning_rate=0.01, n_estimators=250; total time=  31.1s
[CV] END algorithm=SAMME, learning_rate=0.01, n_estimators=250; total time=  31.1s
[CV] END algorithm=SAMME, learning_rate=0.01, n_estimators=300; total time=  38.1s
[CV] END algorithm=SAMME,