In [None]:
# Use GPU
import os
import requests
import json

import pandas as pd
import numpy as np
import torch
import seaborn as sns

from transformers import AutoTokenizer

from tqdm.notebook import tqdm

from IPython.display import display, clear_output
import time  # Just for simulating computation tim

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

from nltk.tokenize import TweetTokenizer

In [None]:
os.environ['HSA_OVERRIDE_GFX_VERSION'] = '10.3.0'

In [None]:
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')

In [None]:
display(df_train.info())
display(df_train.head(3))

### Clean tweet columns

In [None]:
# change target column to something more informative
df_train['target'] = df_train['target'].map({1: True, 0: False})
df_train = df_train.rename(columns={'target': 'Disaster'})

df_test = df_test.rename(columns={'target': 'Disaster'})

df_train = df_train.rename(columns={'text': 'Tweet'})
df_test = df_test.rename(columns={'text': 'Tweet'})

df_train = df_train.rename(columns={'location': 'TweetLocation'})
df_test = df_test.rename(columns={'location': 'TweetLocation'})

df_train = df_train.rename(columns={'keyword': 'TweetKeywords'})
df_test = df_test.rename(columns={'keyword': 'TweetKeywords'})


# drop the id column
df_train = df_train.drop(columns=['id'])

df_train.head(3)

In [None]:
display("Missing values")
display(df_train.isna().sum())

**Observation:**
- We can see that locations and keywords are not present for many obvious tweets. Lots of cleaning could fix this.

### Baseline-run
- Before we try in-depth cleaning and processing, lets see what kind of performance we can get using simply the existing text column

In [None]:
def tokenizer(text):
    tok = TweetTokenizer()
    return tok.tokenize(text)

In [None]:
vectorizer = TfidfVectorizer(ngram_range=(1,1), stop_words='english', tokenizer=tokenizer)
X = vectorizer.fit_transform(df_train['text'])
y = df_train['target']

In [None]:
parameters = {
    'n_estimators': [30, 100, 200, 300],
    'max_depth': [3, 5, 7]
}

gridcv = GridSearchCV(LogisticRegression(), n_jobs=-1, param_grid={}, cv=(StratifiedKFold(n_splits=5,shuffle=True)))
gridcv.fit(X,y)
print(gridcv.best_score_)

We see with logistic regression, we get a baseline accuracy of 78% without any cleaning and only using the text column.

### Engineer basic features
- Lets try to engineer some new features

#### Create new feature 'ContainsMention'

In [None]:
df_train['ContainsMention'] = df_train['Tweet'].str.contains('@')
df_test['ContainsMention'] = df_test['Tweet'].str.contains('@')

In [None]:
df_train[df_train['ContainsMention'] == True].sample(2)

#### Create new feature 'ContainsUrl'

In [None]:
df_train['ContainsUrl'] = df_train['Tweet'].str.contains(r'http|https|www\.|t\.co', regex=True)
df_test['ContainsUrl'] = df_test['Tweet'].str.contains(r'http|https|www\.|t\.co', regex=True)

In [None]:
df_train[df_train['ContainsUrl'] == True].sample(2)

In [None]:
sns.catplot(hue='ContainsUrl',y='Disaster',data=df_train,kind='bar')

### Data cleaning using LLMs
As the data stands currently, its very dirty. The keyword and location columns are inconsistently formatted, as well as missing data for their respective text field. Manually going through the data and extracting locations and keywords is infeasible. 

**I'm going to try and extract keywords and locations using existing NLP models.** That is, I'm going to use LLM's to clean the data and see the performance increase I get purely on better data alone.

I could arguably use an LLM to do the tweet classification for a disaster as well, but thats too close to cheating IMO. Lets treat this as an excercise first in using LLMs to help clean data, and see how much performance can increase simply by having good quality data!

### Model: LLama8b-instruct

In [246]:
def query_model(context,data,memory=None):
    # Local llama.cpp inference engine server
    url = "http://127.0.0.1:5000/v1/chat/completions"
    headers = {
        "Content-Type": "application/json"
    }
    history = []
    if memory:
        history += memory
    history += [{"role": "user", "content": data},{"role": "user", "content": data}]
    payload = {
        "mode": "chat",
        "messages": history,
        "instruct_template": "Llama",
        "max_tokens": 8,
        "temperature": 0.2,
        "top_p": 0.1,
        "context": context,
        "name1": "Tweet",
    }
    response = requests.post(url, headers=headers, json=payload, verify=False)
    return response.json()['choices'][0]['message']['content']

In [247]:
system_prompts = {}

#### Create new feature 'RealTweetLocation'

In [248]:
pd.set_option('display.max_rows',None)
display(df_train[(df_train['TweetLocation'].notna())][['TweetLocation','Tweet']])

Unnamed: 0,TweetLocation,Tweet
31,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...
32,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...
33,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...
34,"Philadelphia, PA",Crying out for more! Set me ablaze
35,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...
36,Pretoria,@PhDSquares #mufc they've built so much hype a...
37,World Wide!!,INEC Office in Abia Set Ablaze - http://t.co/3...
39,Paranaque City,Ablaze for you Lord :D
40,Live On Webcam,Check these out: http://t.co/rOI2NSmEJJ http:/...
42,milky way,Had an awesome time visiting the CFC head offi...


In [270]:
# create memory to tune responses
def get_formatted_sample_real_location(df,idx,decision):
    location = df.loc[idx]['TweetLocation']
    return {
        'role': 'user',
        'content': location
    }, {
        'role': 'assistant',
        'content': str(decision)
    }

location_memory = json.load(open('./data/realtweetlocation_memory.json'))

good_locations = [31, 34, 46, 49, 51, 94, 115, 136, 195, 423, 657, 669, 1023, 1110, 1368, 1597, 1824, 2471]
# 31	Birmingham
# 34	Philadelphia, PA	
# 46	GREENSBORO,NORTH CAROLINA
# 49	England.	
# 51	India	
# 94	Alberta | Sask. | Montana	
# 115 US	
# 136 19.600858, -99.047821
# 195 || c h i c a g o ||
# 423 NYC :) Ex- #Islamophobe
# 657 Rio de Janeiro
# 669 Karachi
# 1023 WESTSIDE OF PHILLY 7? BLOCK??	
# 1110 The land of New Jersey.	
# 1368 Mid north coast of NSW	
# 1597 Highland Park, CA	
# 1824 Weston super mare	
# 2471 Minneapolis,MN,US	


bad_locations = [42, 81, 102, 209, 313, 321, 326, 356, 450, 503, 544, 675, 937, 996, 1169,  1344, 1574, 1807, 2052, 2344, 5937, 5993, 6288, 7183,7460]
# 42	milky way
# 81	Your Sister's Bedroom
# 102	Instagram - @heyimginog
# 209 Higher Places
# 313	probably the strip club	
# 321	Here And There	
# 326	Worldwide	
# 356 studio
# 450 Selena | Britney | Hilary	
# 503 #GDJB #ASOT
# 544 World
# 675 Gotham City
# 937	Nowhere Islands/Smash Manor	
# 996 PURPLE BOOTH STUDIOã¢	
# 1169 PROUD INDIANS
# 1344 Whiterun, Skyrim
# 1574 BestCoast
# 1807 In my own world!!!	
# 2052 2005 |-/	
# 2344 ??????	
# 5937 m3, k, a, d	
# $ 5993 Freddy Fazbears pizzeria	
# 6288 Desert Storm?? |BCHS|	
# 7183 åÊ(?Û¢`?Û¢å«)??	
# 7460 ?Gangsta OC / MV RP; 18+.?	

good_idx, bad_idx = 0, 0
while good_idx < len(good_locations) and bad_idx < len(bad_locations):
    if good_idx < len(good_locations):
        user, ai = get_formatted_sample_real_location(df_train,int(good_locations[good_idx]),'True')
        location_memory.append(user)
        location_memory.append(ai)
        good_idx += 1
        
    if bad_idx < len(bad_locations):
        user, ai = get_formatted_sample_real_location(df_train,int(bad_locations[bad_idx]),'False')
        location_memory.append(user)
        location_memory.append(ai)
        bad_idx += 1
    
with open('./data/realtweetlocation_memory.json', 'w') as f:
    f.write(json.dumps(location_memory))

In [271]:
system_prompts['RealTweetLocation'] = """
The following is a conversation with an AI Large Language Model. The AI has been trained to answer questions, provide recommendations, and help with decision making. The AI follows user requests. The AI thinks outside the box.

The AI's responses are succinct, and provides no additional information, insights or details in its responses.

You will be provided with a 'Location' from which a tweet was supposedly sent. Your task is to determine if the location refers to a real, existing place, rather than a humorous or fictional invention. Respond with 'True' if the location is a genuine, existing place, and 'False' if it is not.

Your response is either: True or False with NOTHING else.
"""

realtweetlocation_memory = json.load(open('./data/realtweetlocation_memory.json'))
print('memory size:',len(realtweetlocation_memory))
print('memory length:',len(str(realtweetlocation_memory)))

realtweetlocation_cache = json.load(open('./data/realtweetlocation_cache.json'))
print('cache size:',len(realtweetlocation_cache))

memory size: 72
memory length: 3299
cache size: 0


In [273]:
last_size = len(realtweetlocation_cache)
for idx, tweet_location in df_train['TweetLocation'].dropna().items():
    clear_output(wait=False)
    if tweet_location in realtweetlocation_cache:
        is_real_location = realtweetlocation_cache[tweet_location]
        df_train.loc[idx, 'RealTweetLocation'] = is_real_location
    else:
        is_real_location = query_model(system_prompts['RealTweetLocation'],tweet_location,realtweetlocation_memory)
        df_train.loc[idx, 'RealTweetLocation'] = is_real_location
        realtweetlocation_cache[tweet_location] = is_real_location
        display(idx,tweet_location,is_real_location)
    
    current_size = len(realtweetlocation_cache)
    if current_size - last_size >= 50:
        last_size = current_size
        print('writing to cache...')
        with open('./data/realtweetlocation_cache.json', 'w') as f:
            f.write(json.dumps(realtweetlocation_cache))

print('writing finalized realtweetlocation_cache.json ...')
with open('./data/realtweetlocation_cache.json', 'w') as f:
    f.write(json.dumps(realtweetlocation_cache))
    
print('writing updated train.csv file...')
df_train.to_csv('./data/train_v1.csv',index=False)

7581

'Lincoln'

'True'

writing finalized realtweetlocation_cache.json ...
writing updated train.csv file...


In [274]:
for idx, tweet_location in df_test['TweetLocation'].dropna().items():
    clear_output(wait=False)
    if tweet_location in realtweetlocation_cache:
        is_real_location = realtweetlocation_cache[tweet_location]
        df_test.loc[idx, 'RealTweetLocation'] = is_real_location
        display(idx,tweet_location,'cache hit:',is_real_location)
    else:
        is_real_location = query_model(system_prompts['RealTweetLocation'],tweet_location,realtweetlocation_memory)
        df_test.loc[idx, 'RealTweetLocation'] = is_real_location
        realtweetlocation_cache[tweet_location] = is_real_location
        display(idx,tweet_location,is_real_location)
    
    current_size = len(realtweetlocation_cache)
    if current_size - last_size >= 50:
        last_size = current_size
        print('writing to cache...')
        with open('./data/realtweetlocation_cache.json', 'w') as f:
            f.write(json.dumps(realtweetlocation_cache))

print('writing finalized realtweetlocation_cache.json ...')
with open('./data/realtweetlocation_cache.json', 'w') as f:
    f.write(json.dumps(realtweetlocation_cache))
    
print('writing updated teset.csv file...')
df_test.to_csv('./data/test_v1.csv',index=False)

3250

'Brussels, Belgium'

'True'

writing finalized realtweetlocation_cache.json ...
writing updated teset.csv file...


In [284]:
system_prompts['CleanTweetLocation'] = """
The following is a conversation with an AI Large Language Model. The AI has been trained to answer questions, provide recommendations, and help with decision making. The AI follows user requests. The AI thinks outside the box.

The AI's responses are succinct, and provides no additional information, insights or details in its responses.

You will be provided with a 'Location' string, representing the supposed place from which a tweet was sent, and the 'Text' of that tweet. Your task is to clean and standardize the location data into a consistent format. Use the tweet's content to help clarify or confirm the location details if the location provided is ambiguous. The standardized format should include correct naming for cities, states, and countries, correcting any spelling errors, and normalizing variations such as abbreviations. Your output should be the cleaned and standardized location string."""

cleantweetlocation_memory = json.load(open('./data/cleantweetlocation_memory.json'))
print('memory size:',len(cleantweetlocation_memory))
print('memory length:',len(str(cleantweetlocation_memory)))

cleantweetlocation_cache = json.load(open('./data/cleantweetlocation_cache.json'))
print('cache size:',len(cleantweetlocation_cache))

memory size: 14
memory length: 1116
cache size: 0


In [288]:
last_cleantweetlocation_cache_size = len(cleantweetlocation_cache)

train_iter_df = df_train[(df_train['TweetLocation'].notna()) & (df_train['RealTweetLocation'] == 'True')]
for idx, series in train_iter_df.iterrows():
    text = series['Tweet']
    location = series['TweetLocation']
    
    tweet = f"Text: {text}\nLocation: {location}"
    
    if location in cleantweetlocation_cache:
        df_train.loc[idx, 'CleanTweetLocation'] = cleantweetlocation_cache[location]
    else:
        clear_output(wait=True)
        data = f"Text: {text}\nLocation: {location}"
        clean_location = query_model(system_prompts['CleanTweetLocation'],tweet,cleantweetlocation_memory)
        df_train.loc[idx, 'CleanTweetLocation'] = clean_location
        cleantweetlocation_cache[location] = clean_location
        display(idx,tweet,clean_location)
        
    current_cleantweetlocation_cache_size = len(cleantweetlocation_cache)
    if current_cleantweetlocation_cache_size - last_cleantweetlocation_cache_size >= 25:
        last_cleantweetlocation_cache_size = current_cleantweetlocation_cache_size
        print('writing to cache...')
        with open('./data/cleantweetlocation_cache.json', 'w') as f:
            f.write(json.dumps(cleantweetlocation_cache))
        

print('writing finalized cleantweetlocation_cache.json ...')
with open('./data/cleantweetlocation_cache.json', 'w') as f:
    f.write(json.dumps(cleantweetlocation_cache))
    
print('writing updated train.csv file...')
df_train.to_csv('./data/train_v2.csv',index=False)

7581

'Text: @engineshed Great atmosphere at the British Lion gig tonight. Hearing is wrecked. http://t.co/oMNBAtJEAO\nLocation: Lincoln'

'Lincoln, UK'

writing finalized cleantweetlocation_cache.json ...
writing updated train.csv file...


In [293]:
last_cleantweetlocation_cache_size = len(cleantweetlocation_cache)

test_iter_df = df_test[(df_test['TweetLocation'].notna()) & (df_test['RealTweetLocation'] == 'True')]
for idx, series in test_iter_df.iterrows():
    text = series['Tweet']
    location = series['TweetLocation']
    
    tweet = f"Text: {text}\nLocation: {location}"
    
    if location in cleantweetlocation_cache:
        df_test.loc[idx, 'CleanTweetLocation'] = cleantweetlocation_cache[location]
    else:
        clear_output(wait=True)
        data = f"Text: {text}\nLocation: {location}"
        clean_location = query_model(system_prompts['CleanTweetLocation'],tweet,cleantweetlocation_memory)
        df_test.loc[idx, 'CleanTweetLocation'] = clean_location
        cleantweetlocation_cache[location] = clean_location
        display(idx,data,clean_location)
        
    current_cleantweetlocation_cache_size = len(cleantweetlocation_cache)
    if current_cleantweetlocation_cache_size - last_cleantweetlocation_cache_size >= 25:
        last_cleantweetlocation_cache_size = current_cleantweetlocation_cache_size
        print('writing to cache...')
        with open('./data/cleantweetlocation_cache.json', 'w') as f:
            f.write(json.dumps(cleantweetlocation_cache))
        

print('writing finalized cleantweetlocation_cache.json ...')
with open('./data/cleantweetlocation_cache.json', 'w') as f:
    f.write(json.dumps(cleantweetlocation_cache))
    
print('writing updated test.csv file...')
df_test.to_csv('./data/test_v2.csv',index=False)

3250

'Text: @stighefootball Begovic has been garbage. He got wrecked by a Red Bull reserve team and everyone else this preseason\nLocation: Brussels, Belgium'

'Brussels, Belgium'

writing finalized cleantweetlocation_cache.json ...
writing updated test.csv file...
