# IT3212 - Assignment 2

# Config

In [2]:
run_eda = False
lemmatize = False
with_sentiment = False

### Importing libraries

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, SimpleRNN
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.utils import to_categorical
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from tensorflow.keras.preprocessing import sequence, text
from tensorflow.keras.callbacks import EarlyStopping


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

In [4]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


### Fix dataset encoding issues

In [5]:
# Some rows in the raw data include non UTF-8 characters. 

# Example of text with non UTF-8 characters:
# 778245336,FALSE,finalized,5,8/30/15 13:27,Not Relevant,0.7952,,army,
# text column: Pakistan,".: .: .: .: .: .: .: .: .: .: .: .: .: .: .: .: .: .: .: .: .: RT DrAyesha4: #IndiaKoMunTorJawabDo Indian Army ki��_ http://t.co/WJLJq3yA4g"
# ,6.29079E+17,195397186

# Chardet identifies the encoding of the raw data as 'MacRoman'.
# For now, we will remove all non UTF-8 characters from the raw data
# We handle this by removing all � characters from the raw data and writing the modified content back to the file.

def fix_non_utf8_encoding(filepath, destination_filepath):
    with open(filepath, 'rb') as file:
        rawdata = file.read()
        result = chardet.detect(rawdata)
        print(result['encoding'])


    # Open the file in read mode, read its contents, then close it
    with open('data/disaster-tweets.csv', 'r', encoding='utf-8', errors='ignore') as file:
        content = file.read()

    # Remove all � characters
    content = content.replace('�', '')

    # Open the file in write mode and write the modified content back to it
    with open(destination_filepath, 'w', encoding='utf-8') as file:
        file.write(content)

filepath = 'data/disaster-tweets.csv'
dest = 'data/disaster-tweets-utf8.csv'

# fix_non_utf8_encoding(filepath, dest)

In [6]:
def split_train_test(filepath, destination_filepath_train, destination_filepath_test):
    df = pd.read_csv(filepath, encoding='utf-8')
    train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
    train_data = train_data.reset_index(drop=True)
    test_data = test_data.reset_index(drop=True)

    return train_data, test_data

def split_train_val_test(filepath):
    df = pd.read_csv(filepath, encoding='utf-8')
    train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
    train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)
    train_data = train_data.reset_index(drop=True)
    val_data = val_data.reset_index(drop=True)
    test_data = test_data.reset_index(drop=True)
    return train_data, val_data, test_data

### Importing dataset

In [7]:
train, validation, test = split_train_val_test(dest)

In [8]:
train.drop([
    '_unit_id',
    '_golden',
    '_unit_state',
    '_trusted_judgments',
    '_last_judgment_at',
    'choose_one:confidence',
    'choose_one_gold',
    'tweetid',
    'userid'
    # 'choose_one',
    # 'text',
    ], axis=1, inplace=True)
train.columns

Index(['choose_one', 'keyword', 'location', 'text'], dtype='object')

In [9]:
train.shape

(6960, 4)

In [10]:
train['text'].apply(lambda x:len(str(x).split())).max()

31

## 1. Preprocessing

In [11]:
lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer()

def filter_rows_by_confidence_and_decision(df, confidence_threshold):
    df = df[df['choose_one:confidence'] >= confidence_threshold]
    df = df[df['choose_one'] != "Can't Decide"]
    return df

def map_choose_one_to_y(df):
    df['target'] = df['choose_one'].apply(lambda choice: 1 if choice == 'Relevant' else 0)
    return df

def clean_text(text):
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub('\s+', ' ', text).strip()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in stopwords.words("english")])
    if lemmatize:
        text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    text = contractions.fix(text)
    text = ' '.join(tokenizer.tokenize(text))
    return text

def clean_keyword(keyword):
    return unquote(keyword) if pd.notnull(keyword) else ''

def clean_data(df):
    df['keyword'] = df['keyword'].apply(clean_keyword).apply(str.lower)
    df['text_raw'] = df['text']
    df['text'] = df['text'].apply(clean_text)
    return df

initial_count = df_train.shape[0]
confidence_threshold = 0.7

df_train = filter_rows_by_confidence_and_decision(df_train, confidence_threshold)
print("Removed {} of total: {} rows. Remaining rows: {}".format(initial_count - df_train.shape[0], initial_count, df_train.shape[0]))

features_to_keep = ['target', 'text', 'keyword']

df_train = map_choose_one_to_y(df_train)
df_train = df_train[features_to_keep]
df_train = clean_data(df_train)

count_initial = df_train.shape[0]
df_train = df_train.drop_duplicates(subset=['text'])
print("Removed {} duplicated rows.".format(count_initial - df_train.shape[0]))


# Preprocess the test data as well
df_test = map_choose_one_to_y(df_test)
df_test = df_test[features_to_keep]
df_test = clean_data(df_test)

df_test.head()


NameError: name 'WordNetLemmatizer' is not defined

## 2. Extracting features

### Features that can be extracted from the raw text

In [None]:
def extract_features(df): 
    # Create new column for text length
    df['text_length'] = df['text_raw'].apply(len)
    # Extract the number of hashtags
    df["hashtag_count"] = df["text_raw"].apply(lambda x: len([c for c in str(x) if c == "#"]))

    # Extract the number of mentions
    df["mention_count"] = df["text_raw"].apply(lambda x: len([c for c in str(x) if c == "@"]))

    # Extract the `has_url` feature
    df["has_url"] = df["text_raw"].apply(lambda x: 1 if "http" in str(x) else 0)
    return df

# Write the updated dataframe to a CSV file
df_train = extract_features(df_train)
df_test = extract_features(df_test)
df_train.head()

Unnamed: 0,target,text,keyword,text_raw,text_length,hashtag_count,mention_count,has_url
0,0,do not even remember slsp happening remember l...,screamed,i dont even remember slsp happening i just rem...,134,0,0,0
1,0,hazelannmac ooh feel guilty wishing hatman bet...,mudslide,@hazelannmac ooh now I feel guilty about wishi...,94,0,1,0
2,0,soultech collide club mix,collide,Soultech - Collide (Club Mix) http://t.co/8xIx...,52,0,0,1
3,1,police officer wounded suspect dead exchanging...,wounded,Police Officer Wounded Suspect Dead After Exch...,83,0,0,1
4,0,cramer igers 3 words wrecked disneys stock,wrecked,Cramer: Iger's 3 words that wrecked Disney's s...,73,0,0,1


### Ngrams

In [None]:
def create_ngrams(text, n):
    tokens = word_tokenize(text)
    n_grams = list(ngrams(tokens, n))
    return n_grams


df_train['bigrams'] = df_train['text'].apply(lambda x: create_ngrams(x, 2))
df_train['trigrams'] = df_train['text'].apply(lambda x: create_ngrams(x, 3))

df_test['bigrams'] = df_test['text'].apply(lambda x: create_ngrams(x, 2))
df_test['trigrams'] = df_test['text'].apply(lambda x: create_ngrams(x, 3))

In [None]:
def create_ngrams_string(ngram_list):
    ngram_words = ['_'.join(ngram) for ngram in ngram_list]
    ngram_string = ' '.join(ngram_words)
    return ngram_string

def add_ngrams(df):
    df['bigrams'] = df['bigrams'].apply(lambda x: create_ngrams_string(x))
    df['trigrams'] = df['trigrams'].apply(lambda x: create_ngrams_string(x))

    df['text_with_ngrams'] = df['text'] + ' ' +  df['bigrams'] + ' ' + df['trigrams'] 
    return df

df_train = add_ngrams(df_train)
df_test = add_ngrams(df_test)

### Word2Vec

In [None]:
tokenized_text = df_train['text_with_ngrams'].apply(lambda x: x.split())

model_w2v = gensim.models.Word2Vec(
            tokenized_text,
            vector_size=400, # desired no. of features/independent variables
            window=5, # context window size
            min_count=2, # Ignores all words with total frequency lower than 2.                                  
            sg = 1, # 1 for skip-gram model
            hs = 0,
            negative = 10, # for negative sampling
            workers= 32, # no.of cores
            seed = 34
) 

model_w2v.train(tokenized_text, total_examples= len(df_train), epochs=100)


(6569397, 15305800)

In [None]:
# Define a function that converts tokens to vectors using the Word2Vec model
def tokens_to_vectors(tokens, model, vector_size) -> np.ndarray:
    vectors = np.zeros((len(tokens), vector_size))
    for i, token in enumerate(tokens):
        try:
            vectors[i] = model.wv[token]
        except KeyError:  # Token not in the model's vocabulary
            vectors[i] = np.zeros(vector_size)
    return vectors.mean(axis=0)

# Example usage
vecs_train = [tokens_to_vectors(tokens, model_w2v, 400) for tokens in tokenized_text]
vecs_train = np.vstack(vecs_train)

vecs_test = [tokens_to_vectors(tokens, model_w2v, 400) for tokens in df_test['text'].apply(lambda x: x.split())]
vecs_test = np.vstack(vecs_test)
# # Converting the list of vectors to a DataFrame
# vectors_df = pd.DataFrame(np.vstack(vecs))
# print(vectors_df.shape)

  return vectors.mean(axis=0)
  ret = um.true_divide(


In [None]:
text_embedded_w2v_df = pd.DataFrame(vecs_train, columns=[str(x) for x in range(400)], index=df_train.index) 
text_embedded_w2f_df_test = pd.DataFrame(vecs_test, columns=[str(x) for x in range(400)], index=df_test.index)

## 3. Selecting features

In [None]:
# storing this for later we can test different features without having to re-run cells above this one
df_checkpoint = df_train.copy(deep=True)
df_test_checkpoint = df_test.copy(deep=True)

In [None]:
df_train.columns

Index(['target', 'text', 'keyword', 'text_raw', 'text_length', 'hashtag_count',
       'mention_count', 'has_url', 'bigrams', 'trigrams', 'text_with_ngrams'],
      dtype='object')

In [None]:
features_to_keep = ['target', 'text_length', 'hashtag_count', 'mention_count', 'has_url']

df_checkpoint = df_train[features_to_keep]
df_test_checkpoint = df_test[features_to_keep]


df_checkpoint = pd.concat([df_checkpoint, text_embedded_w2v_df], axis=1)
df_test_checkpoint = pd.concat([df_test_checkpoint, text_embedded_w2f_df_test], axis=1)

df_checkpoint.dropna(inplace=True)
df_test_checkpoint.dropna(inplace=True)

# extract y_train and y_test here to avoid column name collision with 'target' feature coming from text and keyword embeddings
y_train = df_checkpoint['target']
y_test = df_test_checkpoint['target']

X_train = df_checkpoint.drop(['target'], axis=1)
X_test = df_test_checkpoint.drop(['target'], axis=1)

## 4. Modelling

In [None]:
logreg = True
svm = False
xgb = False
rforest = False
lightgbm = False

In [None]:
def print_results(y_pred, y_train, y_pred_test, y_test):
    print("Train results")
    print("-----------------------------")
    print("Train accuracy: {}".format(accuracy_score(y_train, y_pred)))
    print(classification_report(y_train, y_pred))
    print(confusion_matrix(y_train, y_pred))

    print()
    print("Test results")
    print("-----------------------------")
    print("Test accuracy: {}".format(accuracy_score(y_test, y_pred_test)))
    print(classification_report(y_test, y_pred_test))
    print(confusion_matrix(y_test, y_pred_test))


### 4.1 Logistic regression

In [None]:
if logreg:
    logreg = LogisticRegression(random_state=42, solver="liblinear")
    logreg.fit(X_train, y_train)

    y_pred = cross_val_predict(logreg, X_train, y_train, cv=5)  # 5-fold cross-validation
    y_pred_test = logreg.predict(X_test)

    print_results(y_pred, y_train, y_pred_test, y_test)

Train results
-----------------------------
Train accuracy: 0.8475496014922842
              precision    recall  f1-score   support

           0       0.84      0.92      0.88      3595
           1       0.85      0.74      0.79      2302

    accuracy                           0.85      5897
   macro avg       0.85      0.83      0.84      5897
weighted avg       0.85      0.85      0.85      5897

[[3305  290]
 [ 609 1693]]

Test results
-----------------------------
Test accuracy: 0.7665441176470589
              precision    recall  f1-score   support

           0       0.82      0.75      0.78      1219
           1       0.71      0.79      0.75       957

    accuracy                           0.77      2176
   macro avg       0.77      0.77      0.77      2176
weighted avg       0.77      0.77      0.77      2176

[[912 307]
 [201 756]]


### 4.1.2. Support Vector Machines

In [None]:
if svm:
    first_n = 1000

    # Initialize SVM model
    svm_model = SVC(kernel='linear', C=1, random_state=42, probability=False)

    # Fit the model on training data
    svm_model.fit(X_train, y_train)

    # Use 5-fold cross-validation to get predictions on training set
    y_pred_train = cross_val_predict(svm_model, X_train, y_train, cv=5)
    y_pred_test = svm_model.predict(X_test)

    print_results(y_pred_train, y_train, y_pred_test, y_test)

Train results
-----------------------------
Train accuracy: 0.8494149567576734
              precision    recall  f1-score   support

           0       0.85      0.92      0.88      3595
           1       0.86      0.74      0.79      2302

    accuracy                           0.85      5897
   macro avg       0.85      0.83      0.84      5897
weighted avg       0.85      0.85      0.85      5897

[[3315  280]
 [ 608 1694]]

Test results
-----------------------------
Test accuracy: 0.7614889705882353
              precision    recall  f1-score   support

           0       0.82      0.74      0.78      1219
           1       0.71      0.79      0.74       957

    accuracy                           0.76      2176
   macro avg       0.76      0.76      0.76      2176
weighted avg       0.77      0.76      0.76      2176

[[904 315]
 [204 753]]


In [1]:
if xgb:
    import xgboost as xgb
    from sklearn.metrics import accuracy_score, classification_report

    params_edited = {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 7, 'min_child_weight': 1, 'subsample': 0.9}

    #Creating an XGBoost classifier
    model = xgb.XGBClassifier(params_edited)

    #Training the model on the training data
    model.fit(X_train, y_train)

    #Making predictions on the test set
    predictions = model.predict(X_test)

    #Calculating accuracy
    accuracy = accuracy_score(y_test, predictions)

    print("Accuracy:", accuracy)
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))

NameError: name 'xgb' is not defined

In [27]:
if rforest:
    from sklearn.ensemble import RandomForestClassifier

    clf = RandomForestClassifier(max_depth=2, random_state=0)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    print("Accuracy:", accuracy)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))



Accuracy: 0.734375

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.78      0.77      1219
           1       0.71      0.68      0.69       957

    accuracy                           0.73      2176
   macro avg       0.73      0.73      0.73      2176
weighted avg       0.73      0.73      0.73      2176



In [29]:
# catboost

import catboost as cb
from catboost import CatBoostClassifier, Pool

# Initialize CatBoostClassifier

model = CatBoostClassifier(iterations=2,
                            learning_rate=1,
                            depth=2,
                            loss_function='Logloss',
                            verbose=True)

# Fit model
model.fit(X_train, y_train)

# Get predicted classes
preds_class = model.predict(X_test)

# print results
print("Accuracy:", accuracy_score(y_test, preds_class))
print("\nClassification Report:")
print(classification_report(y_test, preds_class))




0:	learn: 0.5803965	total: 60ms	remaining: 60ms
1:	learn: 0.5374717	total: 65.4ms	remaining: 0us
Accuracy: 0.6677389705882353

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.69      0.70      1219
           1       0.62      0.63      0.63       957

    accuracy                           0.67      2176
   macro avg       0.66      0.66      0.66      2176
weighted avg       0.67      0.67      0.67      2176



In [28]:
# lightgbm

import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report

#Creating an XGBoost classifier
model = lgb.LGBMClassifier()

#Training the model on the training data
model.fit(X_train, y_train)

#Making predictions on the test set
predictions = model.predict(X_test)

#Calculating accuracy
accuracy = accuracy_score(y_test, predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, predictions))

AttributeError: module 'pandas.core.strings' has no attribute 'StringMethods'