# Import data

**Run this cell to transform raw file from bb to file without the non-utf8 characters �**

Removes � characters from file

In [18]:
# Open the file in read mode, read its contents, then close it
with open('../data/disaster-tweets.csv', 'r', encoding='utf-8', errors='ignore') as file:
    content = file.read()

# Remove all � characters
content = content.replace('�', '')

# Open the file in write mode and write the modified content back to it
with open('../data/disaster-tweets.csv', 'w', encoding='utf-8') as file:
    file.write(content)

In [19]:
import pandas as pd

data_path = '../data/disaster-tweets.csv'
# read csv, ","-separated

df = pd.read_csv(data_path, sep=',')

# print dimensions
print("Shape of raw data", df.shape)

# print first 2 rows
df.head(2)

Shape of raw data (10876, 13)


Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,choose_one,choose_one:confidence,choose_one_gold,keyword,location,text,tweetid,userid
0,778243823,True,golden,156,,Relevant,1.0,Relevant,,,Just happened a terrible car crash,1.0,
1,778243824,True,golden,152,,Relevant,1.0,Relevant,,,Our Deeds are the Reason of this #earthquake M...,13.0,


In [20]:
# save text, id to csv
df[['text', 'tweetid']].to_csv('results/texts.csv', index=False, sep=';')

# save all relevant rows to csv, i.e. choose_one = Relevant
df[df['choose_one'] == 'Relevant'][['text', 'tweetid', 'location']].to_csv('results/relevant.csv', index=False, sep=';')

# Explore data

In [21]:
df.describe()

# missing values
df.isnull().sum()

_unit_id                     0
_golden                      0
_unit_state                  0
_trusted_judgments           0
_last_judgment_at           84
choose_one                   0
choose_one:confidence        0
choose_one_gold          10789
keyword                     87
location                  3638
text                         0
tweetid                      0
userid                      87
dtype: int64

### Result

# Preprocess data

### Select features

In [22]:
raw_features = ['_unit_id', '_golden', '_unit_state', '_trusted_judgments',
       '_last_judgment_at', 'choose_one', 'choose_one:confidence',
       'choose_one_gold', 'keyword', 'location', 'text', 'tweetid', 'userid']

features_to_keep = [ 'choose_one', 'keyword', 'location', 'text' ]

df = df[features_to_keep]

### Choose_one

In [23]:
# drop rows with choose_one = 'Can't Decide'
df = df[df['choose_one'] != "Can't Decide"]

# if choose one is relevant, set to 1, else 0
df['choose_one'] = df['choose_one'].apply(lambda x: 1 if x == 'Relevant' else 0)

df['y'] = df['choose_one']
df.drop(columns=['choose_one'], inplace=True, errors='ignore')

df.head()

Unnamed: 0,keyword,location,text,y
0,,,Just happened a terrible car crash,1
1,,,Our Deeds are the Reason of this #earthquake M...,1
2,,,"Heard about #earthquake is different cities, s...",1
3,,,"there is a forest fire at spot pond, geese are...",1
4,,,Forest fire near La Ronge Sask. Canada,1


### Text processing


In [24]:
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import string
import re

tokenizer = TweetTokenizer(
    preserve_case=False,
    strip_handles=True,
    reduce_len=True,
)

Stopwords = set(stopwords.words('english'))

def remove_noise(doc):
    filter_words = []

    doc = doc.replace('�', '')

    # # Remove URLs
    doc = re.sub(r'http\S+', '', doc)
    doc = re.sub(r'www\.\S+', '', doc)

    tokenized = tokenizer.tokenize(doc)

    for word in tokenized:
        if (word not in Stopwords 
            and word not in string.punctuation 
            ):

            filter_words.append(word)

    return " ".join(filter_words)
    

df['text'] = df['text'].apply(remove_noise)

print("Dimensions after preprocessing", df.shape)
df.head()

Dimensions after preprocessing (10860, 4)


Unnamed: 0,keyword,location,text,y
0,,,happened terrible car crash,1
1,,,deeds reason #earthquake may allah forgive us,1
2,,,heard #earthquake different cities stay safe e...,1
3,,,forest fire spot pond geese fleeing across str...,1
4,,,forest fire near la ronge sask canada,1


In [25]:
# save to csv where y=1
df[df['y'] == 1][['text', 'location']].to_csv('results/preprocessed_relevant.csv', index=False, sep=';')


## Bag of words

In [26]:
# Implement bag of words

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

vectorizer = ()


bow = vectorizer.fit_transform(df['text'])

print("Shape of X", bow.shape)

# split into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(bow, df['y'], test_size=0.2, random_state=42)

print("Shape of X_train", X_train.shape)
print("Shape of X_test", X_test.shape)
print("Shape of y_train", y_train.shape)
print("Shape of y_test", y_test.shape)

Shape of X (10860, 18024)
Shape of X_train (8688, 18024)
Shape of X_test (2172, 18024)
Shape of y_train (8688,)
Shape of y_test (2172,)


# Logistic regression model

In [27]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=42, solver='lbfgs').fit(X_train, y_train)

y_pred = clf.predict_proba(X_test)

# get probabilities for class 1
y_pred = y_pred[:, 1]

print(y_pred.tolist())

[0.09512154150929969, 0.9174830148771023, 0.034592261619393376, 0.1832384048043466, 0.16016317671785343, 0.08205806513556987, 0.2502005812552029, 0.9915920636431234, 0.11742961259347087, 0.3381634976722608, 0.111458175202983, 0.11303271693311948, 0.09266355768349155, 0.8624843805109469, 0.2845133852347086, 0.9666745177681888, 0.9337601565023214, 0.988084211792461, 0.3869023678247709, 0.3299219544608364, 0.24117203186475478, 0.0991378264820006, 0.11392030797653213, 0.24412866943358866, 0.016020917164048715, 0.3544863579138189, 0.5907184105580041, 0.9462078755296255, 0.7174393625762133, 0.42090796528194074, 0.1551246718060753, 0.3602327446375198, 0.4014099240112793, 0.0817096004124465, 0.09568758284383028, 0.8813638108269292, 0.15895572878214106, 0.9764414250822079, 0.21124188768294655, 0.06454249946737636, 0.07933594188369539, 0.9677849821135143, 0.9977167849266538, 0.999409388733008, 0.20682962019796286, 0.876938475472164, 0.21382082930540622, 0.23743616368461126, 0.14309347382172233, 

### Performance metrics

In [28]:
from sklearn.metrics import accuracy_score, precision_score

clipped_y_pred = np.array([1 if x >= 0.5 else 0 for x in y_pred])

accuracy = accuracy_score(y_test, clipped_y_pred)
print("Accuracy on the test set:", accuracy)

precision = precision_score(y_test, clipped_y_pred, pos_label=1)

print("Precision on the test set:", precision)


Accuracy on the test set: 0.7914364640883977
Precision on the test set: 0.8064133016627079


### Exploring results

In [29]:
# map weights to vocabulary
weights = clf.coef_[0]
feature_names = vectorizer.get_feature_names_out()

feature_weights = pd.DataFrame({'feature': feature_names, 'weight': weights})

feature_weights.sort_values(by='weight', ascending=False, inplace=True)

feature_weights.head(20)

Unnamed: 0,feature,weight
7641,hiroshima,3.009371
6261,fires,2.340247
17543,wildfire,2.201304
16634,typhoon,2.141896
5301,earthquake,2.117448
4639,derailment,2.074163
7274,hailstorm,1.922779
15251,storm,1.90276
2448,bombing,1.896262
12903,rainstorm,1.888422


In [30]:
import numpy as np

m = df.shape[0]

df_test = df.iloc[int(0.8*m):]

df_test['y_pred'] = y_pred

incorrectly_classified_indices = np.where(y_pred != y_test)[0]

# select rows df incorrectly_classified_indices
df_test.iloc[incorrectly_classified_indices].to_csv('results/incorrectly_classified_rows.csv', index=False)

# false negatives
false_negatives = np.where((y_pred == 0) & (y_test == 1))[0]

df_test.iloc[false_negatives].to_csv('results/false_negatives.csv', index=False)

# false positives

false_positives = np.where((y_pred == 1) & (y_test == 0))[0]

df_test.iloc[false_positives].to_csv('results/false_positives.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['y_pred'] = y_pred


# Naive Bayes model

In [31]:
from sklearn.linear_model import Naiv

clf = LogisticRegression(random_state=42, solver='lbfgs').fit(X_train, y_train)

y_pred = clf.predict_proba(X_test)

# get probabilities for class 1
y_pred = y_pred[:, 1]

print(y_pred.tolist())

ImportError: cannot import name 'Naiv' from 'sklearn.linear_model' (/opt/homebrew/anaconda3/envs/tdt-4173-ml-env/lib/python3.11/site-packages/sklearn/linear_model/__init__.py)