# IT3212 - Assignment 2

# Config

In [304]:
run_eda = False
lemmatize = False
with_sentiment = False

### Importing libraries

In [305]:
# Standard libraries
import numpy as np
import pandas as pd
import re
import string

# NLTK tools and datasets
import nltk
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Uncomment if you need to download NLTK data packages
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('vader_lexicon')

# Text processing
from textblob import TextBlob
import contractions

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report, 
                             confusion_matrix, roc_curve, auc)
from sklearn.model_selection import cross_val_predict

# Miscellaneous
from collections import Counter
from urllib.parse import unquote
from scipy import stats
import chardet


### Fix dataset encoding issues

In [306]:
# Some rows in the raw data include non UTF-8 characters. 

# Example of text with non UTF-8 characters:
# 778245336,FALSE,finalized,5,8/30/15 13:27,Not Relevant,0.7952,,army,
# text column: Pakistan,".: .: .: .: .: .: .: .: .: .: .: .: .: .: .: .: .: .: .: .: .: RT DrAyesha4: #IndiaKoMunTorJawabDo Indian Army ki��_ http://t.co/WJLJq3yA4g"
# ,6.29079E+17,195397186

# Chardet identifies the encoding of the raw data as 'MacRoman'.
# For now, we will remove all non UTF-8 characters from the raw data
# We handle this by removing all � characters from the raw data and writing the modified content back to the file.

def fix_non_utf8_encoding(filepath, destination_filepath):
    with open(filepath, 'rb') as file:
        rawdata = file.read()
        result = chardet.detect(rawdata)
        print(result['encoding'])


    # Open the file in read mode, read its contents, then close it
    with open('data/disaster-tweets.csv', 'r', encoding='utf-8', errors='ignore') as file:
        content = file.read()

    # Remove all � characters
    content = content.replace('�', '')

    # Open the file in write mode and write the modified content back to it
    with open(destination_filepath, 'w', encoding='utf-8') as file:
        file.write(content)

filepath = 'data/disaster-tweets.csv'
dest = 'data/disaster-tweets-utf8.csv'

# fix_non_utf8_encoding(filepath, dest)

In [307]:
def split_train_test(filepath, destination_filepath_train, destination_filepath_test):
    df = pd.read_csv(filepath, encoding='utf-8')
    train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
    train_data = train_data.reset_index(drop=True)
    test_data = test_data.reset_index(drop=True)
    train_data.to_csv(destination_filepath_train, index=False)
    test_data.to_csv(destination_filepath_test, index=False)

filepath = 'data/disaster-tweets-utf8.csv'
dest_train = 'data/train.csv'
dest_test = 'data/test.csv'

# split_train_test(filepath, dest_train, dest_test)

### Importing dataset

In [308]:
import_remote = True

if import_remote:
    df_train = pd.read_csv('https://raw.githubusercontent.com/magnusrodseth/it3212/main/data/train.csv', encoding='utf-8')
    df_test = pd.read_csv('https://raw.githubusercontent.com/magnusrodseth/it3212/main/data/test.csv', encoding='utf-8')
else:
    df_train = pd.read_csv('./data/train.csv', encoding='utf-8')
    df_test = pd.read_csv('./data/test.csv', encoding='utf-8')

df_train.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,choose_one,choose_one:confidence,choose_one_gold,keyword,location,text,tweetid,userid
0,778253309,False,finalized,5,8/27/15 16:07,Not Relevant,1.0,,screamed,,i dont even remember slsp happening i just remember being like wtf and then the lights turned off and everyone screamed for the encore,6.29107e+17,232773900.0
1,778251995,False,finalized,5,8/27/15 20:16,Not Relevant,1.0,,mudslide,Edinburgh,@hazelannmac ooh now I feel guilty about wishing hatman out. I bet the mudslide was delicious!,6.29018e+17,27502200.0
2,778247239,False,finalized,5,8/30/15 0:15,Not Relevant,1.0,,collide,planeta H2o,Soultech - Collide (Club Mix) http://t.co/8xIxBsPOT8,6.29092e+17,605238700.0
3,778255430,False,finalized,5,8/27/15 17:03,Relevant,0.7978,,wounded,,Police Officer Wounded Suspect Dead After Exchanging Shots - http://t.co/iPHaZV47g7,6.29119e+17,2305930000.0
4,778255609,False,finalized,5,8/27/15 22:11,Not Relevant,1.0,,wrecked,Sunny Southern California,Cramer: Iger's 3 words that wrecked Disney's stock http://t.co/4dGpBAiVL7,6.2908e+17,24642660.0


##  Exploratory data analysis (EDA)

In [309]:
if run_eda:
    # Clean `keyword` column.

    # Write the updated dataframe to a new CSV file
    # Plot the most common keywords
    defined_keywords = df_train[df_train['keyword'] != '']['keyword']

    plt.figure()
    sns.countplot(y=defined_keywords, order=defined_keywords.value_counts().iloc[:10].index)
    plt.title('Most Common Keywords')
    plt.xlabel('Count')
    plt.ylabel('Keyword')
    plt.tight_layout()
    plt.show()

In [310]:
if run_eda:
    # Compare keywords for disaster tweets and non-disaster tweets
    disaster_keywords = df_train[df_train['choose_one'] == 'Relevant']['keyword']
    non_disaster_keywords = df_train[df_train['choose_one'] == 'Not Relevant']['keyword']

    # Create a figure object and define the grid
    fig, ax = plt.subplots(1, 2, figsize=(14, 6))  # 1 row, 2 columns

    # Plotting
    sns.countplot(y=disaster_keywords, ax=ax[0], order=disaster_keywords.value_counts().iloc[:10].index, color='red')
    sns.countplot(y=non_disaster_keywords, ax=ax[1], order=non_disaster_keywords.value_counts().iloc[:10].index, color='blue')

    # Titles and labels
    ax[0].set_title('Most Common Keywords for Disaster Tweets')
    ax[0].set_xlabel('Count')
    ax[0].set_ylabel('Keyword')

    ax[1].set_title('Most Common Keywords for Non-Disaster Tweets')
    ax[1].set_xlabel('Count')
    ax[1].set_ylabel('Keyword')

    # Adjust layout
    plt.tight_layout()
    plt.show()



In the plot above, we can see that the top 10 shared keywords of disaster-related tweets and non-disaster-related tweets do not share any common keywords.

## 1. Preprocessing

In [311]:
lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer()

def filter_rows_by_confidence_and_decision(df, confidence_threshold):
    df = df[df['choose_one:confidence'] >= confidence_threshold]
    df = df[df['choose_one'] != "Can't Decide"]
    return df

def map_choose_one_to_y(df):
    df['target'] = df['choose_one'].apply(lambda choice: 1 if choice == 'Relevant' else 0)
    return df

def clean_text(text):
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub('\s+', ' ', text).strip()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in stopwords.words("english")])
    if lemmatize:
        text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    text = contractions.fix(text)
    text = ' '.join(tokenizer.tokenize(text))
    return text

def clean_keyword(keyword):
    return unquote(keyword) if pd.notnull(keyword) else ''

def clean_data(df):
    df['keyword'] = df['keyword'].apply(clean_keyword).apply(str.lower)
    df['text_raw'] = df['text']
    df['text'] = df['text'].apply(clean_text)
    return df

initial_count = df_train.shape[0]
confidence_threshold = 0.7

df_train = filter_rows_by_confidence_and_decision(df_train, confidence_threshold)
print("Removed {} of total: {} rows. Remaining rows: {}".format(initial_count - df_train.shape[0], initial_count, df_train.shape[0]))

features_to_keep = ['target', 'text', 'keyword']

df_train = map_choose_one_to_y(df_train)
df_train = df_train[features_to_keep]
df_train = clean_data(df_train)

count_initial = df_train.shape[0]
df_train = df_train.drop_duplicates(subset=['text'])
print("Removed {} duplicated rows.".format(count_initial - df_train.shape[0]))


# Preprocess the test data as well
df_test = map_choose_one_to_y(df_test)
df_test = df_test[features_to_keep]
df_test = clean_data(df_test)

df_test.head()


Removed 2167 of total: 8700 rows. Remaining rows: 6533
Removed 635 duplicated rows.


Unnamed: 0,target,text,keyword,text_raw
0,1,sunset looked like erupting volcano initial thought pixar short lava,volcano,The sunset looked like an erupting volcano .... My initial thought was the Pixar short Lava http://t.co/g4sChqFEsT
1,1,7294 nikon d50 61 mp digital slr camera body 2 batteries carry bag charger 20000,body bag,#7294 Nikon D50 6.1 MP Digital SLR Camera Body 2 batteries carry bag and charger http://t.co/SL7PHqSGKV\n\n$200.00\n_ http://t.co/T4Qh2OM8Op
2,0,mentaltwitter note make sure smoke alarm battery snuff times face many twitter reminders changing battery,smoke,Mental/Twitter Note: Make sure my smoke alarm battery is up to snuff at all times or face many twitter reminders of changing my battery.
3,0,emergency need part 2 3 nashnewvideo nashgrier 103,emergency,?????? EMERGENCY ?????? NEED PART 2 and 3!!! #NashNewVideo http://t.co/TwdnNaIOns @Nashgrier 103
4,0,whelen model 295ss100 siren amplifier police emergency vehicle full read ebay,siren,WHELEN MODEL 295SS-100 SIREN AMPLIFIER POLICE EMERGENCY VEHICLE - Full read by eBay http://t.co/Q3yYQi4A27 http://t.co/whEreofYAx


## 2. Extracting features

### Features that can be extracted from the raw text

In [312]:
def extract_features(df): 
    # Create new column for text length
    df['text_length'] = df['text_raw'].apply(len)
    # Extract the number of hashtags
    df["hashtag_count"] = df["text_raw"].apply(lambda x: len([c for c in str(x) if c == "#"]))

    # Extract the number of mentions
    df["mention_count"] = df["text_raw"].apply(lambda x: len([c for c in str(x) if c == "@"]))

    # Extract the `has_url` feature
    df["has_url"] = df["text_raw"].apply(lambda x: 1 if "http" in str(x) else 0)
    return df

# Write the updated dataframe to a CSV file
df_train = extract_features(df_train)
df_test = extract_features(df_test)
df_train.head()

Unnamed: 0,target,text,keyword,text_raw,text_length,hashtag_count,mention_count,has_url
0,0,do not even remember slsp happening remember like wtf lights turned everyone screamed encore,screamed,i dont even remember slsp happening i just remember being like wtf and then the lights turned off and everyone screamed for the encore,134,0,0,0
1,0,hazelannmac ooh feel guilty wishing hatman bet mudslide delicious,mudslide,@hazelannmac ooh now I feel guilty about wishing hatman out. I bet the mudslide was delicious!,94,0,1,0
2,0,soultech collide club mix,collide,Soultech - Collide (Club Mix) http://t.co/8xIxBsPOT8,52,0,0,1
3,1,police officer wounded suspect dead exchanging shots,wounded,Police Officer Wounded Suspect Dead After Exchanging Shots - http://t.co/iPHaZV47g7,83,0,0,1
4,0,cramer igers 3 words wrecked disneys stock,wrecked,Cramer: Iger's 3 words that wrecked Disney's stock http://t.co/4dGpBAiVL7,73,0,0,1


### Ngrams

In [313]:
def create_ngrams(text, n):
    tokens = word_tokenize(text)
    n_grams = list(ngrams(tokens, n))
    return n_grams


df_train['bigrams'] = df_train['text'].apply(lambda x: create_ngrams(x, 2))
df_train['trigrams'] = df_train['text'].apply(lambda x: create_ngrams(x, 3))

df_test['bigrams'] = df_test['text'].apply(lambda x: create_ngrams(x, 2))
df_test['trigrams'] = df_test['text'].apply(lambda x: create_ngrams(x, 3))

In [314]:
def create_ngrams_string(ngram_list):
    ngram_words = ['_'.join(ngram) for ngram in ngram_list]
    ngram_string = ' '.join(ngram_words)
    return ngram_string

def add_ngrams(df):
    df['bigrams'] = df['bigrams'].apply(lambda x: create_ngrams_string(x))
    df['trigrams'] = df['trigrams'].apply(lambda x: create_ngrams_string(x))

    df['text_with_ngrams'] = df['text'] + ' ' +  df['bigrams'] + ' ' + df['trigrams'] 
    return df

df_train = add_ngrams(df_train)
df_test = add_ngrams(df_test)

### Word2Vec

In [315]:
import gensim
import pandas as pd

tokenized_text = df_train['text_with_ngrams'].apply(lambda x: x.split())

model_w2v = gensim.models.Word2Vec(
            tokenized_text,
            vector_size=400, # desired no. of features/independent variables
            window=5, # context window size
            min_count=2, # Ignores all words with total frequency lower than 2.                                  
            sg = 1, # 1 for skip-gram model
            hs = 0,
            negative = 10, # for negative sampling
            workers= 32, # no.of cores
            seed = 34
) 

model_w2v.train(tokenized_text, total_examples= len(df_train['text']), epochs=20)


(1313843, 3061160)

In [316]:
# Define a function that converts tokens to vectors using the Word2Vec model
def tokens_to_vectors(tokens, model, vector_size):
    """
    Convert a list of tokens to their corresponding vectors using a Word2Vec model.

    Args:
    - tokens (list of str): A list of tokens (words).
    - model (gensim.models.Word2Vec): The trained Word2Vec model.
    - vector_size (int): The size of the vectors.

    Returns:
    - list of np.ndarray: A list of vectors corresponding to the tokens.
    """
    vectors = np.zeros((len(tokens), vector_size))
    for i, token in enumerate(tokens):
        try:
            vectors[i] = model.wv[token]
        except KeyError:  # Token not in the model's vocabulary
            vectors[i] = np.zeros(vector_size)
    return vectors.mean(axis=0)

# Example usage
vecs_train = [tokens_to_vectors(tokens, model_w2v, 400) for tokens in tokenized_text]
vecs_train = np.vstack(vecs_train)

vecs_test = [tokens_to_vectors(tokens, model_w2v, 400) for tokens in df_test['text'].apply(lambda x: x.split())]
vecs_test = np.vstack(vecs_test)
# # Converting the list of vectors to a DataFrame
# vectors_df = pd.DataFrame(np.vstack(vecs))
# print(vectors_df.shape)

In [317]:
text_embedded_w2v_df = pd.DataFrame(vecs_train, columns=[str(x) for x in range(400)], index=df_train.index) 
text_embedded_w2f_df_test = pd.DataFrame(vecs_test, columns=[str(x) for x in range(400)], index=df_test.index)

## 3. Selecting features

In [318]:
# storing this for later we can test different features without having to re-run cells above this one
df_checkpoint = df_train.copy(deep=True)
df_test_checkpoint = df_test.copy(deep=True)

In [319]:
df_train.columns

Index(['target', 'text', 'keyword', 'text_raw', 'text_length', 'hashtag_count',
       'mention_count', 'has_url', 'bigrams', 'trigrams', 'text_with_ngrams'],
      dtype='object')

In [320]:
features_to_keep = ['target', 'text_length', 'hashtag_count', 'mention_count', 'has_url']

df_checkpoint = df_train[features_to_keep]
df_test_checkpoint = df_test[features_to_keep]


# Concatenate the dataframes with td-idf features for the text feature
# df_checkpoint = pd.concat([df_checkpoint, text_embedded_df, keyword_embedded_df], axis=1)
# df_test_checkpoint = pd.concat([df_test_checkpoint, text_embedded_test_df, keyword_embedded_test_df], axis=1)

df_checkpoint = pd.concat([df_checkpoint, text_embedded_w2v_df], axis=1)
df_test_checkpoint = pd.concat([df_test_checkpoint, text_embedded_w2f_df_test], axis=1)

df_checkpoint.dropna(inplace=True)
df_test_checkpoint.dropna(inplace=True)

# extract y_train and y_test here to avoid column name collision with 'target' feature coming from text and keyword embeddings
y_train = df_checkpoint['target']
y_test = df_test_checkpoint['target']

X_train = df_checkpoint.drop(['target'], axis=1)
X_test = df_test_checkpoint.drop(['target'], axis=1)

## 4. Modelling

In [321]:
def print_results(y_pred, y_train, y_pred_test, y_test):
    print("Train results")
    print("-----------------------------")
    print("Train accuracy: {}".format(accuracy_score(y_train, y_pred)))
    print(classification_report(y_train, y_pred))
    print(confusion_matrix(y_train, y_pred))

    print()
    print("Test results")
    print("-----------------------------")
    print("Test accuracy: {}".format(accuracy_score(y_test, y_pred_test)))
    print(classification_report(y_test, y_pred_test))
    print(confusion_matrix(y_test, y_pred_test))


### 4.1 Logistic regression

In [322]:
logreg = LogisticRegression(random_state=42, solver="liblinear")
logreg.fit(X_train, y_train)

y_pred = cross_val_predict(logreg, X_train, y_train, cv=5)  # 5-fold cross-validation
y_pred_test = logreg.predict(X_test)

print_results(y_pred, y_train, y_pred_test, y_test)

Train results
-----------------------------
Train accuracy: 0.8414448024419197
              precision    recall  f1-score   support

           0       0.84      0.92      0.88      3595
           1       0.85      0.72      0.78      2302

    accuracy                           0.84      5897
   macro avg       0.84      0.82      0.83      5897
weighted avg       0.84      0.84      0.84      5897

[[3315  280]
 [ 655 1647]]

Test results
-----------------------------
Test accuracy: 0.7591911764705882
              precision    recall  f1-score   support

           0       0.81      0.75      0.78      1219
           1       0.71      0.78      0.74       957

    accuracy                           0.76      2176
   macro avg       0.76      0.76      0.76      2176
weighted avg       0.76      0.76      0.76      2176

[[910 309]
 [215 742]]


### 4.1.2. Support Vector Machines

In [323]:
first_n = 1000

# Initialize SVM model
svm_model = SVC(kernel='linear', C=1, random_state=42, probability=False)

# Fit the model on training data
svm_model.fit(X_train, y_train)

# Use 5-fold cross-validation to get predictions on training set
y_pred_train = cross_val_predict(svm_model, X_train, y_train, cv=5)
y_pred_test = svm_model.predict(X_test)

print_results(y_pred_train, y_train, y_pred_test, y_test)

Train results
-----------------------------
Train accuracy: 0.8414448024419197
              precision    recall  f1-score   support

           0       0.83      0.93      0.88      3595
           1       0.86      0.70      0.78      2302

    accuracy                           0.84      5897
   macro avg       0.85      0.82      0.83      5897
weighted avg       0.84      0.84      0.84      5897

[[3342  253]
 [ 682 1620]]

Test results
-----------------------------
Test accuracy: 0.7545955882352942
              precision    recall  f1-score   support

           0       0.81      0.74      0.77      1219
           1       0.70      0.78      0.74       957

    accuracy                           0.75      2176
   macro avg       0.75      0.76      0.75      2176
weighted avg       0.76      0.75      0.76      2176

[[899 320]
 [214 743]]


In [324]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

#Creating an XGBoost classifier
model = xgb.XGBClassifier()

#Training the model on the training data
model.fit(X_train, y_train)

#Making predictions on the test set
predictions = model.predict(X_test)

#Calculating accuracy
accuracy = accuracy_score(y_test, predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, predictions))

Accuracy: 0.7637867647058824

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.81      0.79      1219
           1       0.75      0.70      0.72       957

    accuracy                           0.76      2176
   macro avg       0.76      0.76      0.76      2176
weighted avg       0.76      0.76      0.76      2176



In [325]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Accuracy: 0.7449448529411765

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.82      0.78      1219
           1       0.74      0.64      0.69       957

    accuracy                           0.74      2176
   macro avg       0.74      0.73      0.74      2176
weighted avg       0.74      0.74      0.74      2176

